Actual source code: veccusp.cu

petsc-3.3-p5 2012-12-01
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #include <petscconf.h>
  6: PETSC_CUDA_EXTERN_C_BEGIN
  7: #include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: PETSC_CUDA_EXTERN_C_END
 10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>


 15: /*
 16:     Allocates space for the vector array on the host if it does not exist.
 17:     Does NOT change the PetscCUSPFlag for the vector.
 18:     Does NOT zero the newly allocated array.
 19:  */
 20: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
 21: {
 23:   PetscScalar    *array;
 24:   Vec_Seq        *s;
 25:   PetscInt       n = v->map->n;
 27:   s = (Vec_Seq*)v->data;
 28:   if (s->array == 0) {
 29:     //#ifdef PETSC_HAVE_TXPETSCGPU
 30:     //if (n>0)
 31:     // cudaMallocHost((void **) &array, n*sizeof(PetscScalar));CHKERRCUSP(ierr);
 32:     //#else
 33:     PetscMalloc(n*sizeof(PetscScalar),&array);
 34:     PetscLogObjectMemory(v,n*sizeof(PetscScalar));
 35:     s->array           = array;
 36:     s->array_allocated = array;
 37:   }
 38:   return(0);
 39: }
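/* Illustrative sketch (hypothetical example, not from the PETSc sources):
   VecCUSPAllocateCheckHost() is idempotent, so it can be called defensively
   before touching s->array. Guarded out so this file's behavior is unchanged. */
#if 0
static PetscErrorCode ExampleTouchHostArray(Vec v)
{
  Vec_Seq *s;

  VecCUSPAllocateCheckHost(v);      /* allocates the host array on first call */
  VecCUSPAllocateCheckHost(v);      /* second call is a no-op */
  s = (Vec_Seq*)v->data;
  if (v->map->n) s->array[0] = 0.0; /* host array is now safe to address */
  return(0);
}
#endif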


 44: /*
 45:     Allocates space for the vector array on the GPU if it does not exist.
 46:     Does NOT change the PetscCUSPFlag for the vector.
 47:     Does NOT zero the CUSP array.
 48:  */
 50: PetscErrorCode VecCUSPAllocateCheck(Vec v)
 51: {
 53:   int rank;
 54:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

 57:   // First allocate memory on the GPU if needed
 58:   if (!v->spptr) {
 59:     try {
 60:       v->spptr = new Vec_CUSP;
 61:       ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
 62:       ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);

 64: #ifdef PETSC_HAVE_TXPETSCGPU
 66:       ((Vec_CUSP*)v->spptr)->GPUvector = new GPU_Vector<PetscInt, PetscScalar>(((Vec_CUSP*)v->spptr)->GPUarray, rank);
 67:       ((Vec_CUSP*)v->spptr)->GPUvector->buildStreamsAndEvents();CHKERRCUSP(ierr);

 69:       Vec_Seq        *s;
 70:       s = (Vec_Seq*)v->data;
 71:       if (v->map->n>0) {
 72:         if (s->array==0) {
 73:           // In this branch, GPUvector owns the ptr and manages the memory
 74:           ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
 75:           s->array           = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
 76:           s->array_allocated = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();
 77:         }
 78:         else {
 79:           // In this branch, Petsc owns the ptr to start, however we want to use
 80:           // page locked host memory for faster data transfers. So, a new
 81:           // page-locked buffer is allocated. Then, the old Petsc memory
 82:           // is copied in to the new buffer. Then the old Petsc memory is freed.
 83:           // GPUvector owns the new ptr.
 84:           ((Vec_CUSP*)v->spptr)->GPUvector->allocateHostMemory();CHKERRCUSP(ierr);
 85:           PetscScalar * temp = ((Vec_CUSP*)v->spptr)->GPUvector->getHostMemoryPtr();

 87:           PetscMemcpy(temp,s->array,v->map->n*sizeof(PetscScalar));
 88:           PetscFree(s->array);
 89:           s->array           = temp;
 90:           s->array_allocated = temp;
 91:         }
 92:         WaitForGPU();CHKERRCUSP(ierr);
 93:       }
 94:       v->ops->destroy = VecDestroy_SeqCUSP;
 95: #endif
 96:     } catch(char* ex) {
 97:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
 98:     }
 99:   }

101:   return(0);
102: }


107: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
108: PetscErrorCode VecCUSPCopyToGPU(Vec v)
109: {
112:   VecCUSPAllocateCheck(v);
113:   if (v->valid_GPU_array == PETSC_CUSP_CPU){
114:     PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
115:     try{
116: #ifdef PETSC_HAVE_TXPETSCGPU
117:       ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUAll();CHKERRCUSP(ierr);
118: #else 
119:       CUSPARRAY      *varray;
120:       varray  = ((Vec_CUSP*)v->spptr)->GPUarray;
121:       varray->assign(*(PetscScalar**)v->data,*(PetscScalar**)v->data + v->map->n);
122:       WaitForGPU();CHKERRCUSP(ierr);
123: #endif

125:     } catch(char* ex) {
126:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
127:     }
128:     PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
129:     v->valid_GPU_array = PETSC_CUSP_BOTH;
130:   }
131:   return(0);
132: }
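/* Illustrative sketch (hypothetical example): the coherence-flag protocol used
   throughout this file. v->valid_GPU_array takes values including
     PETSC_CUSP_CPU  - the host copy is current,
     PETSC_CUSP_GPU  - the device copy is current,
     PETSC_CUSP_BOTH - both copies are current.
   VecCUSPCopyToGPU() transfers only when the flag is PETSC_CUSP_CPU and then
   marks the vector PETSC_CUSP_BOTH, so repeated calls are cheap: */
#if 0
static PetscErrorCode ExampleCopyToGPUTwice(Vec v)
{
  VecCUSPCopyToGPU(v); /* may transfer: PETSC_CUSP_CPU -> PETSC_CUSP_BOTH */
  VecCUSPCopyToGPU(v); /* no transfer: flag is already PETSC_CUSP_BOTH */
  return(0);
}
#endif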

136: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
137: {
139:   CUSPARRAY      *varray;
140: 
142:   VecCUSPAllocateCheck(v);
143:   if (v->valid_GPU_array == PETSC_CUSP_CPU) {
144:     PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
145:     varray  = ((Vec_CUSP*)v->spptr)->GPUarray;
146: #ifdef PETSC_HAVE_TXPETSCGPU
147:     ((Vec_CUSP*)v->spptr)->GPUvector->copyToGPUSome(varray, ci->recvIndices);CHKERRCUSP(ierr);
148: #else
149:     Vec_Seq        *s;
150:     s = (Vec_Seq*)v->data;
151: 
152:     CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
153:     CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;

155:     thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
156:                  thrust::make_permutation_iterator(s->array,indicesCPU->end()),
157:                  thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
158: #endif
159:     // Set the buffer states
160:     v->valid_GPU_array = PETSC_CUSP_BOTH;
161:     PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
162:   }
163:   return(0);
164: }


169: /*
170:      VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
171: */
172: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
173: {
176:   VecCUSPAllocateCheckHost(v);
177:   if (v->valid_GPU_array == PETSC_CUSP_GPU){
178:     PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
179:     try{
180: #ifdef PETSC_HAVE_TXPETSCGPU
181:       ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUAll();CHKERRCUSP(ierr);
182: #else
183:       CUSPARRAY      *varray;
184:       varray  = ((Vec_CUSP*)v->spptr)->GPUarray;
185:       thrust::copy(varray->begin(),varray->end(),*(PetscScalar**)v->data);
186:       WaitForGPU();CHKERRCUSP(ierr);
187: #endif
188:     } catch(char* ex) {
189:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
190:     }
191:     PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
192:     v->valid_GPU_array = PETSC_CUSP_BOTH;
193:   }
194:   return(0);
195: }

199: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
200:    which means that we need to recombine the data at some point before using any of the standard functions.
201:    We could add a few more flag types to keep track of this, or treat these routines like
202:    VecGetArray()/VecRestoreArray(), which must always be called in pairs.
203: */
204: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
205: {
206:   CUSPARRAY      *varray;

210:   VecCUSPAllocateCheck(v);
211:   VecCUSPAllocateCheckHost(v);
212:   if (v->valid_GPU_array == PETSC_CUSP_GPU) {
213:     PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
214:     varray  = ((Vec_CUSP*)v->spptr)->GPUarray;
215: #ifdef PETSC_HAVE_TXPETSCGPU
216:     ((Vec_CUSP*)v->spptr)->GPUvector->copyFromGPUSome(varray, ci->sendIndices);CHKERRCUSP(ierr);
217: #else
218:     Vec_Seq        *s;
219:     s = (Vec_Seq*)v->data;
220:     CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
221:     CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;
222: 
223:     thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
224:                  thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
225:                  thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
226: #endif
227:     VecCUSPRestoreArrayRead(v,&varray);
228:     PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
229:     v->valid_GPU_array = PETSC_CUSP_BOTH;
230:   }
231:   return(0);
232: }


237: static PetscErrorCode VecCopy_Seq(Vec xin,Vec yin)
238: {
239:   PetscScalar       *ya;
240:   const PetscScalar *xa;
241:   PetscErrorCode    ierr;

244:   if (xin != yin) {
245:     VecGetArrayRead(xin,&xa);
246:     VecGetArray(yin,&ya);
247:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
248:     VecRestoreArrayRead(xin,&xa);
249:     VecRestoreArray(yin,&ya);
250:   }
251:   return(0);
252: }

256: static PetscErrorCode VecSetRandom_Seq(Vec xin,PetscRandom r)
257: {
259:   PetscInt       n = xin->map->n,i;
260:   PetscScalar    *xx;

263:   VecGetArray(xin,&xx);
264:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
265:   VecRestoreArray(xin,&xx);
266:   return(0);
267: }

271: static PetscErrorCode VecDestroy_Seq(Vec v)
272: {
273:   Vec_Seq        *vs = (Vec_Seq*)v->data;

277:   PetscObjectDepublish(v);
278: #if defined(PETSC_USE_LOG)
279:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
280: #endif
281:   if (vs->array_allocated)
282:     PetscFree(vs->array_allocated);
283:   PetscFree(vs);
284:   return(0);
285: }

289: static PetscErrorCode VecResetArray_Seq(Vec vin)
290: {
291:   Vec_Seq *v = (Vec_Seq *)vin->data;

294:   v->array         = v->unplacedarray;
295:   v->unplacedarray = 0;
296:   return(0);
297: }

299: /* The following public versions are necessary because we use CUSP in the regular PETSc code and they must be callable from plain C code. */
302: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
303: {

307:   VecCUSPAllocateCheck(v);
308:   return(0);
309: }

313: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
314: {

318:   VecCUSPCopyToGPU(v);
319:   return(0);
320: }

324: /*
325:     PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()

327:    Input Parameters:
328: +    ns - the number of send indices
329: .    sendIndices - integer list of send indices
330: .    nr - the number of receive indices
331: -    recvIndices - integer list of receive indices

333:    Output Parameter:
334: .    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

334: .seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
335: */
336: PetscErrorCode PetscCUSPIndicesCreate(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
337: {
338:   PetscCUSPIndices  cci;

341:   cci = new struct _p_PetscCUSPIndices;
342: #ifdef PETSC_HAVE_TXPETSCGPU
343:   cci->sendIndices = new GPU_Indices<PetscInt, PetscScalar>();
344:   cci->sendIndices->buildIndices(sendIndices, ns);
345:   cci->recvIndices = new GPU_Indices<PetscInt, PetscScalar>();
346:   cci->recvIndices->buildIndices(recvIndices, nr);
347: #else
348:   cci->sendIndicesCPU.assign(sendIndices,sendIndices+ns);
349:   cci->sendIndicesGPU.assign(sendIndices,sendIndices+ns);

351:   cci->recvIndicesCPU.assign(recvIndices,recvIndices+nr);
352:   cci->recvIndicesGPU.assign(recvIndices,recvIndices+nr);
353: #endif  
354:   *ci = cci;
355:   return(0);
356: }
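/* Illustrative sketch (hypothetical example): the usual pairing of
   PetscCUSPIndicesCreate()/PetscCUSPIndicesDestroy() around a partial copy;
   the index values here are made up. */
#if 0
static PetscErrorCode ExamplePartialCopyToGPU(Vec v)
{
  PetscInt         sendIdx[2] = {0,2},recvIdx[2] = {1,3};
  PetscCUSPIndices ci;

  PetscCUSPIndicesCreate(2,sendIdx,2,recvIdx,&ci);
  VecCUSPCopyToGPUSome_Public(v,ci); /* moves only the receive entries down */
  PetscCUSPIndicesDestroy(&ci);
  return(0);
}
#endif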

360: /*
361:     PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()

363:    Input Parameters:
364: .    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

366: .seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
367: */
368: PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
369: {
371:   if (!(*ci)) return(0);
372:   try {
373: #ifdef PETSC_HAVE_TXPETSCGPU
374:     if ((*ci)->sendIndices) delete (*ci)->sendIndices;
375:     if ((*ci)->recvIndices) delete (*ci)->recvIndices;
376: #endif  
377:     delete *ci;
378:   } catch(char* ex) {
379:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
380:   }
381:   *ci = 0;
382:   return(0);
383: }

385: #ifdef PETSC_HAVE_TXPETSCGPU
388: /*
389:     VecCUSPResetIndexBuffersFlagsGPU_Public - resets the index buffer status flags; only called from VecScatterFinalizeForGPU()
390:  */
391: PetscErrorCode VecCUSPResetIndexBuffersFlagsGPU_Public(PetscCUSPIndices ci)
392: {
394:   if (ci->sendIndices)
395:     ci->sendIndices->resetStatusFlag();
396:   if (ci->recvIndices)
397:     ci->recvIndices->resetStatusFlag();
398:   return(0);
399: }
400: #endif  


405: /*
406:     VecCUSPCopyToGPUSome_Public - Copies selected entries of a vector from the CPU down to the GPU

408:    Input Parameters:
409: +    v - the vector
410: -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()

412: */
413: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
414: {
415:   PetscErrorCode   ierr;

418:   VecCUSPCopyToGPUSome(v,ci);
419:   return(0);
420: }

424: /*
425:   VecCUSPCopyFromGPUSome_Public - Copies selected entries of a vector from the GPU up to the CPU

427:   Input Parameters:
428: +    v - the vector
429: -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
430: */
431: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
432: {

436:   VecCUSPCopyFromGPUSome(v,ci);
437:   return(0);
438: }

440: #ifdef PETSC_HAVE_TXPETSCGPU
443: /* Note that this function only moves *some* of the data from a GPU vector to a contiguous buffer on the GPU.
444:    Afterwards, this buffer can be messaged to the host easily with asynchronous memory transfers,
445:    which means that we need to recombine the data at some point before using any of the standard functions.
446:    We could add a few more flag types to keep track of this, or treat these routines like
447:    VecGetArray()/VecRestoreArray(), which must always be called in pairs.
448: */
449: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
450: {
451:   CUSPARRAY      *varray;
454:   VecCUSPAllocateCheck(v);
455:   if (v->valid_GPU_array == PETSC_CUSP_GPU || v->valid_GPU_array == PETSC_CUSP_BOTH) {
456:     VecCUSPGetArrayRead(v,&varray);
457:     ((Vec_CUSP*)v->spptr)->GPUvector->copySomeToContiguousBuffer(varray, ci->sendIndices);CHKERRCUSP(ierr);
458:     VecCUSPRestoreArrayRead(v,&varray);
459:   }
460:   return(0);
461: }



467: /*
468:   VecCUSPCopySomeToContiguousBufferGPU_Public - Copies selected entries of a GPU vector into a contiguous buffer on the GPU

470:   Input Parameters:
471:  +    v - the vector
472:  -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
473: */
474: PetscErrorCode VecCUSPCopySomeToContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
475: {

479:   VecCUSPCopySomeToContiguousBufferGPU(v,ci);
480:   return(0);
481: }

483: /* Note that this function only moves *some* of the data from a contiguous buffer on the GPU to arbitrary locations
484:    in a GPU vector. It is typically called after an asynchronous memory transfer from the host to the device,
485:    which means that we need to recombine the data at some point before using any of the standard functions.
486:    We could add a few more flag types to keep track of this, or treat these routines like
487:    VecGetArray()/VecRestoreArray(), which must always be called in pairs.
488: */
489: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU(Vec v, PetscCUSPIndices ci)
490: {
491:   CUSPARRAY      *varray;
494:   VecCUSPAllocateCheck(v);
495:   if (v->valid_GPU_array == PETSC_CUSP_CPU  || v->valid_GPU_array == PETSC_CUSP_BOTH) {
496:     VecCUSPGetArrayRead(v,&varray);
497:     ((Vec_CUSP*)v->spptr)->GPUvector->copySomeFromContiguousBuffer(varray, ci->recvIndices);CHKERRCUSP(ierr);
498:     VecCUSPRestoreArrayRead(v,&varray);
499:   }
500:   return(0);
501: }

505: /*
506:   VecCUSPCopySomeFromContiguousBufferGPU_Public - Copies selected entries from a contiguous buffer on the GPU back into a GPU vector

508:   Input Parameters:
509:  +    v - the vector
510:  -    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
511: */
512: PetscErrorCode VecCUSPCopySomeFromContiguousBufferGPU_Public(Vec v, PetscCUSPIndices ci)
513: {

517:   VecCUSPCopySomeFromContiguousBufferGPU(v,ci);
518:   return(0);
519: }

521: #endif


524: /*MC
525:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

527:    Options Database Keys:
528: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

530:   Level: beginner

532: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
533: M*/
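/* Illustrative sketch (hypothetical example): selecting VECSEQCUSP explicitly
   rather than through -vec_type seqcusp. */
#if 0
static PetscErrorCode ExampleUseSeqCUSP(void)
{
  Vec v;

  VecCreate(PETSC_COMM_SELF,&v);
  VecSetSizes(v,10,10);
  VecSetType(v,VECSEQCUSP); /* same effect as -vec_type seqcusp with VecSetFromOptions() */
  VecSet(v,1.0);
  VecDestroy(&v);
  return(0);
}
#endif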

535: /* for VecAYPX_SeqCUSP */
536: namespace cusp
537: {
538: namespace blas
539: {
540: namespace detail
541: {
542:   template <typename T>
543:     struct AYPX : public thrust::binary_function<T,T,T>
544:     {
545:       T alpha;

547:       AYPX(T _alpha) : alpha(_alpha) {}

549:       __host__ __device__
550:         T operator()(T x, T y)
551:       {
552:         return alpha * y + x;
553:       }
554:     };
555: }

557:  template <typename ForwardIterator1,
558:            typename ForwardIterator2,
559:            typename ScalarType>
560: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
561:            {
562:              thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
563:            }
564:  template <typename Array1, typename Array2, typename ScalarType>
565:    void aypx(const Array1& x, Array2& y, ScalarType alpha)
566:  {
567:    detail::assert_same_dimensions(x,y);
568:    aypx(x.begin(),x.end(),y.begin(),alpha);
569:  }
570: }
571: }
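/* Illustrative sketch (hypothetical example): what the aypx() extension above
   computes, element by element, for y = alpha*y + x. */
#if 0
static void ExampleAYPXSemantics(void)
{
  CUSPARRAY x(3),y(3);

  x[0] = 1;  x[1] = 2;  x[2] = 3;
  y[0] = 10; y[1] = 20; y[2] = 30;
  cusp::blas::aypx(x,y,2.0); /* y becomes {21,42,63}, i.e. 2*y[i] + x[i] */
}
#endif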

575: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
576: {
577:   CUSPARRAY      *xarray,*yarray;

581:   if (alpha != 0.0) {
582:     VecCUSPGetArrayRead(xin,&xarray);
583:     VecCUSPGetArrayReadWrite(yin,&yarray);
584:     try{
585:       cusp::blas::aypx(*xarray,*yarray,alpha);
586:       WaitForGPU();CHKERRCUSP(ierr);
587:     } catch(char* ex) {
588:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
589:     }
590:     VecCUSPRestoreArrayRead(xin,&xarray);
591:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
592:     PetscLogFlops(2.0*yin->map->n);
593:   } else {
594:     VecCopy_SeqCUSP(xin,yin); /* alpha == 0.0 means y = x for VecAYPX() */
595:   }
596:   return(0);
597: }


600: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
601: {
602:   CUSPARRAY      *xarray,*yarray;

606:   if (alpha != 0.0) {
607:     VecCUSPGetArrayRead(xin,&xarray);
608:     VecCUSPGetArrayReadWrite(yin,&yarray);
609:     try {
610:       cusp::blas::axpy(*xarray,*yarray,alpha);
611:       WaitForGPU();CHKERRCUSP(ierr);
612:     } catch(char* ex) {
613:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
614:     }
615:     VecCUSPRestoreArrayRead(xin,&xarray);
616:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
617:     PetscLogFlops(2.0*yin->map->n);
618:   }
619:   return(0);
620: }

622: struct VecCUSPPointwiseDivide
623: {
624:   template <typename Tuple>
625:   __host__ __device__
626:   void operator()(Tuple t)
627:   {
628:     thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
629:   }
630: };
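/* Illustrative sketch (hypothetical example): the zip-iterator idiom used by
   the functors in this file. thrust::for_each walks tuples (w[i],x[i],y[i]) so
   a single fused kernel computes w[i] = x[i]/y[i]. */
#if 0
static void ExampleZipForEach(CUSPARRAY &w,CUSPARRAY &x,CUSPARRAY &y)
{
  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(w.begin(),x.begin(),y.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(w.end(),x.end(),y.end())),
    VecCUSPPointwiseDivide());
}
#endif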

634: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
635: {
636:   CUSPARRAY      *warray,*xarray,*yarray;

640:   VecCUSPGetArrayRead(xin,&xarray);
641:   VecCUSPGetArrayRead(yin,&yarray);
642:   VecCUSPGetArrayWrite(win,&warray);
643:   try{
644:     thrust::for_each(
645:         thrust::make_zip_iterator(
646:             thrust::make_tuple(
647:                 warray->begin(),
648:                 xarray->begin(),
649:                 yarray->begin())),
650:         thrust::make_zip_iterator(
651:             thrust::make_tuple(
652:                 warray->end(),
653:                 xarray->end(),
654:                 yarray->end())),
655:         VecCUSPPointwiseDivide());
656:   WaitForGPU();CHKERRCUSP(ierr);
657:   } catch(char* ex) {
658:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
659:     }
660:   PetscLogFlops(win->map->n);
661:   VecCUSPRestoreArrayRead(xin,&xarray);
662:   VecCUSPRestoreArrayRead(yin,&yarray);
663:   VecCUSPRestoreArrayWrite(win,&warray);
664:   return(0);
665: }


668: struct VecCUSPWAXPY
669: {
670:   template <typename Tuple>
671:   __host__ __device__
672:   void operator()(Tuple t)
673:   {
674:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
675:   }
676: };

678: struct VecCUSPSum
679: {
680:   template <typename Tuple>
681:   __host__ __device__
682:   void operator()(Tuple t)
683:   {
684:     thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
685:   }
686: };

688: struct VecCUSPDiff
689: {
690:   template <typename Tuple>
691:   __host__ __device__
692:   void operator()(Tuple t)
693:   {
694:     thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
695:   }
696: };

700: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
701: {
702:   CUSPARRAY      *xarray,*yarray,*warray;

706:   if (alpha == 0.0) {
707:     VecCopy_SeqCUSP(yin,win);
708:   } else {
709:     VecCUSPGetArrayRead(xin,&xarray);
710:     VecCUSPGetArrayRead(yin,&yarray);
711:     VecCUSPGetArrayWrite(win,&warray);
712:     if (alpha == 1.0) {
713:       try {
714:         thrust::for_each(
715:           thrust::make_zip_iterator(
716:             thrust::make_tuple(
717:               warray->begin(),
718:               yarray->begin(),
719:               xarray->begin())),
720:           thrust::make_zip_iterator(
721:             thrust::make_tuple(
722:               warray->end(),
723:               yarray->end(),
724:               xarray->end())),
725:           VecCUSPSum());
726:       } catch(char* ex) {
727:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
728:       }
729:       PetscLogFlops(win->map->n);
730:     } else if (alpha == -1.0) {
731:       try {
732:         thrust::for_each(
733:           thrust::make_zip_iterator(
734:             thrust::make_tuple(
735:               warray->begin(),
736:               yarray->begin(),
737:               xarray->begin())),
738:           thrust::make_zip_iterator(
739:             thrust::make_tuple(
740:               warray->end(),
741:               yarray->end(),
742:               xarray->end())),
743:           VecCUSPDiff());
744:       } catch(char* ex) {
745:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
746:       }
747:       PetscLogFlops(win->map->n);
748:     } else {
749:       try {
750:         thrust::for_each(
751:           thrust::make_zip_iterator(
752:             thrust::make_tuple(
753:               warray->begin(),
754:               yarray->begin(),
755:               thrust::make_constant_iterator(alpha),
756:               xarray->begin())),
757:           thrust::make_zip_iterator(
758:             thrust::make_tuple(
759:               warray->end(),
760:               yarray->end(),
761:               thrust::make_constant_iterator(alpha),
762:               xarray->end())),
763:           VecCUSPWAXPY());
764:       } catch(char* ex) {
765:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
766:       }
767:       PetscLogFlops(2*win->map->n);
768:     }
769:     WaitForGPU();CHKERRCUSP(ierr);
770:     VecCUSPRestoreArrayRead(xin,&xarray);
771:     VecCUSPRestoreArrayRead(yin,&yarray);
772:     VecCUSPRestoreArrayWrite(win,&warray);
773:   }
774:   return(0);
775: }

777: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
778: struct VecCUSPMAXPY4
779: {
780:   template <typename Tuple>
781:   __host__ __device__
782:   void operator()(Tuple t)
783:   {
784:     /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
785:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
786:   }
787: };


790: struct VecCUSPMAXPY3
791: {
792:   template <typename Tuple>
793:   __host__ __device__
794:   void operator()(Tuple t)
795:   {
796:     /* y += a1*x1 + a2*x2 + a3*x3 */
797:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
798:   }
799: };

801: struct VecCUSPMAXPY2
802: {
803:   template <typename Tuple>
804:   __host__ __device__
805:   void operator()(Tuple t)
806:   {
807:     /* y += a1*x1 + a2*x2 */
808:     thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
809:   }
810: };
813: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
814: {
815:   PetscErrorCode    ierr;
816:   CUSPARRAY         *xarray,*yy0,*yy1,*yy2,*yy3;
817:   PetscInt          n = xin->map->n,j,j_rem;
818:   PetscScalar       alpha0,alpha1,alpha2,alpha3;

821:   PetscLogFlops(nv*2.0*n);
822:   VecCUSPGetArrayReadWrite(xin,&xarray);
823:   switch (j_rem=nv&0x3) {
824:   case 3:
825:     alpha0 = alpha[0];
826:     alpha1 = alpha[1];
827:     alpha2 = alpha[2];
828:     alpha += 3;
829:     VecCUSPGetArrayRead(y[0],&yy0);
830:     VecCUSPGetArrayRead(y[1],&yy1);
831:     VecCUSPGetArrayRead(y[2],&yy2);
832:     try {
833:       thrust::for_each(
834:         thrust::make_zip_iterator(
835:             thrust::make_tuple(
836:                 xarray->begin(),
837:                 thrust::make_constant_iterator(alpha0),
838:                 yy0->begin(),
839:                 thrust::make_constant_iterator(alpha1),
840:                 yy1->begin(),
841:                 thrust::make_constant_iterator(alpha2),
842:                 yy2->begin())),
843:         thrust::make_zip_iterator(
844:             thrust::make_tuple(
845:                 xarray->end(),
846:                 thrust::make_constant_iterator(alpha0),
847:                 yy0->end(),
848:                 thrust::make_constant_iterator(alpha1),
849:                 yy1->end(),
850:                 thrust::make_constant_iterator(alpha2),
851:                 yy2->end())),
852:         VecCUSPMAXPY3());
853:     } catch(char* ex) {
854:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
855:     }
856:     VecCUSPRestoreArrayRead(y[0],&yy0);
857:     VecCUSPRestoreArrayRead(y[1],&yy1);
858:     VecCUSPRestoreArrayRead(y[2],&yy2);
859:     y     += 3;
860:     break;
861:   case 2:
862:     alpha0 = alpha[0];
863:     alpha1 = alpha[1];
864:     alpha +=2;
865:     VecCUSPGetArrayRead(y[0],&yy0);
866:     VecCUSPGetArrayRead(y[1],&yy1);
867:     try {
868:       thrust::for_each(
869:         thrust::make_zip_iterator(
870:             thrust::make_tuple(
871:                 xarray->begin(),
872:                 thrust::make_constant_iterator(alpha0),
873:                 yy0->begin(),
874:                 thrust::make_constant_iterator(alpha1),
875:                 yy1->begin())),
876:         thrust::make_zip_iterator(
877:             thrust::make_tuple(
878:                 xarray->end(),
879:                 thrust::make_constant_iterator(alpha0),
880:                 yy0->end(),
881:                 thrust::make_constant_iterator(alpha1),
882:                 yy1->end())),
883:         VecCUSPMAXPY2());
884:     } catch(char* ex) {
885:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
886:     }
887:     VecCUSPRestoreArrayRead(y[0],&yy0);
888:     VecCUSPRestoreArrayRead(y[1],&yy1);
889:     y     +=2;
890:     break;
889:   case 1:
890:     alpha0 = *alpha++;
891:     VecAXPY_SeqCUSP(xin,alpha0,y[0]);
892:     y     +=1;
893:     break;
894:   }
895:   for (j=j_rem; j<nv; j+=4) {
896:     alpha0 = alpha[0];
897:     alpha1 = alpha[1];
898:     alpha2 = alpha[2];
899:     alpha3 = alpha[3];
900:     alpha  += 4;
901:     VecCUSPGetArrayRead(y[0],&yy0);
902:     VecCUSPGetArrayRead(y[1],&yy1);
903:     VecCUSPGetArrayRead(y[2],&yy2);
904:     VecCUSPGetArrayRead(y[3],&yy3);
905:     try {
906:       thrust::for_each(
907:         thrust::make_zip_iterator(
908:             thrust::make_tuple(
909:                 xarray->begin(),
910:                 thrust::make_constant_iterator(alpha0),
911:                 yy0->begin(),
912:                 thrust::make_constant_iterator(alpha1),
913:                 yy1->begin(),
914:                 thrust::make_constant_iterator(alpha2),
915:                 yy2->begin(),
916:                 thrust::make_constant_iterator(alpha3),
917:                 yy3->begin())),
918:         thrust::make_zip_iterator(
919:             thrust::make_tuple(
920:                 xarray->end(),
921:                 thrust::make_constant_iterator(alpha0),
922:                 yy0->end(),
923:                 thrust::make_constant_iterator(alpha1),
924:                 yy1->end(),
925:                 thrust::make_constant_iterator(alpha2),
926:                 yy2->end(),
927:                 thrust::make_constant_iterator(alpha3),
928:                 yy3->end())),
929:         VecCUSPMAXPY4());
930:     } catch(char* ex) {
931:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
932:     }
933:     VecCUSPRestoreArrayRead(y[0],&yy0);
934:     VecCUSPRestoreArrayRead(y[1],&yy1);
935:     VecCUSPRestoreArrayRead(y[2],&yy2);
936:     VecCUSPRestoreArrayRead(y[3],&yy3);
937:     y      += 4;
938:   }
939:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
940:   WaitForGPU();CHKERRCUSP(ierr);
941:   return(0);
942: }
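/* Illustrative sketch (hypothetical example): how VecMAXPY_SeqCUSP() partitions
   nv. The switch above handles nv mod 4 vectors first; the loop then takes the
   rest four at a time, so nv = 7 does one 3-vector pass and one 4-vector pass. */
#if 0
static void ExampleMAXPYPartition(PetscInt nv)
{
  PetscInt j_rem = nv & 0x3;       /* vectors handled by the switch */
  PetscInt full  = (nv - j_rem)/4; /* iterations of the unrolled loop */

  (void)full;
}
#endif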


947: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
948: {
949: #if defined(PETSC_USE_COMPLEX)
950:   PetscScalar    *ya,*xa;
951: #endif
952:   CUSPARRAY      *xarray,*yarray;

956: #if defined(PETSC_USE_COMPLEX)
957:   /*Not working for complex*/
958: #else
959:   {
960:     VecCUSPGetArrayRead(xin,&xarray);
961:     VecCUSPGetArrayRead(yin,&yarray);
962:     try {
963:       *z = cusp::blas::dot(*xarray,*yarray);
964:     } catch(char* ex) {
965:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
966:     }
967:   }
968: #endif
 969:   WaitForGPU();CHKERRCUSP(ierr);
 970:   if (xin->map->n > 0) {
 971:     PetscLogFlops(2.0*xin->map->n-1);
 972:   }
 973:   VecCUSPRestoreArrayRead(xin,&xarray);
 974:   VecCUSPRestoreArrayRead(yin,&yarray);
 975:   return(0);
976: }

978: /*The following few template functions are for VecMDot_SeqCUSP*/

 980: template <typename T1,typename T2>
 981: struct cuspmult2 : thrust::unary_function<T1,T2>
 982: {
 983:   __host__ __device__
 984:   T2 operator()(T1 x)
 985:   {
 986:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x));
 987:   }
 988: };

 990: template <typename T>
 991: struct cuspadd2 : thrust::binary_function<T,T,T>
 992: {
 993:   __host__ __device__
 994:   T operator()(T x,T y)
 995:   {
 996:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
 997:   }
 998: };

1000: template <typename T1,typename T2>
1001: struct cuspmult3 : thrust::unary_function<T1,T2>
1002: {
1003:   __host__ __device__
1004:   T2 operator()(T1 x)
1005:   {
1006:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x));
1007:   }
1008: };

1010: template <typename T>
1011: struct cuspadd3 : thrust::binary_function<T,T,T>
1012: {
1013:   __host__ __device__
1014:   T operator()(T x,T y)
1015:   {
1016:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y));
1017:   }
1018: };

1019: template <typename T1,typename T2>
1020: struct cuspmult4 : thrust::unary_function<T1,T2>
1021: {
1022:   __host__ __device__
1023:   T2 operator()(T1 x)
1024:   {
1025:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x),thrust::get<0>(x)*thrust::get<4>(x));
1026:   }
1027: };

1029: template <typename T>
1030: struct cuspadd4 : thrust::binary_function<T,T,T>
1031: {
1032:   __host__ __device__
1033:   T operator()(T x,T y)
1034:   {
1035:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y),thrust::get<3>(x)+thrust::get<3>(y));
1036:   }
1037: };


1042: PetscErrorCode VecMDot_SeqCUSP_CUBLAS(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1043: {
1045:   PetscInt       n = xin->map->n,i;
1046:   CUSPARRAY      *xarray,*yy;
1047:   PetscScalar    *V,*xptr,*yptr,*zgpu;
1048:   Vec            *yyin = (Vec*)yin;
1049:   cublasStatus   stat;

1052:   stat = cublasAlloc(n*nv,sizeof(PetscScalar),(void**)&V);
1053:   if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1054:   stat = cublasAlloc(nv,sizeof(PetscScalar),(void**)&zgpu);
1055:   if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1056:   for (i=0;i<nv;i++) {
1057:     VecCUSPGetArrayRead(yyin[i],&yy);
1058:     yptr = thrust::raw_pointer_cast(&(*yy)[0]);
1059:     cudaMemcpy(V+i*n,yptr,n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1060:     VecCUSPRestoreArrayRead(yyin[i],&yy);
1061:   }
1062:   VecCUSPGetArrayRead(xin,&xarray);
1063:   xptr = thrust::raw_pointer_cast(&(*xarray)[0]);
1064: #if defined(PETSC_USE_REAL_DOUBLE)
1065: #define cublasXgemv cublasDgemv
1066: #else
1067: #define cublasXgemv cublasSgemv
1068: #endif
1069:   cublasXgemv('T',n,nv,1.0,V,n,xptr,1,0.0,zgpu,1);
1070:   cudaMemcpy(z,zgpu,nv*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1071:   VecCUSPRestoreArrayRead(xin,&xarray);
1072:   cublasFree(V);
1073:   cublasFree(zgpu);
1074:   WaitForGPU();CHKERRCUSP(ierr);
1075:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1076:   return(0);
1077: }
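/* Illustrative sketch (hypothetical example): the CUBLAS multi-dot above packs
   the y vectors as columns of an n x nv matrix V and computes every inner
   product at once as z = V^T x; a host reference implementation: */
#if 0
static void ExampleMultiDotReference(PetscInt n,PetscInt nv,const PetscScalar *V,const PetscScalar *x,PetscScalar *z)
{
  PetscInt i,j;

  for (j=0; j<nv; j++) {    /* column j of V holds y_j */
    z[j] = 0.0;
    for (i=0; i<n; i++) z[j] += V[j*n+i]*x[i];
  }
}
#endif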

1081: PetscErrorCode VecMDot_SeqCUSP_CUBLAS_1(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1082: {
1084:   PetscInt       n = xin->map->n,i;
1085:   CUSPARRAY      *xarray,*yy;
1086:   PetscScalar    *V,*xptr,*yptr,*zgpu;
1087:   Vec            *yyin = (Vec*)yin;
1088:   cublasStatus   stat;
1089:   cudaError      cuerr;
1090:   size_t         pitch;

1093:   cuerr = cudaMallocPitch((void**)&V,&pitch,nv*sizeof(PetscScalar),n);
1094:   if (cuerr!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,1,"CUDA error %d",cuerr);
1095:   stat = cublasAlloc(nv,sizeof(PetscScalar),(void**)&zgpu);
1096:   if (stat!=CUBLAS_STATUS_SUCCESS) SETERRQ1(PETSC_COMM_SELF,1,"CUBLAS error %d",stat);
1097:   for (i=0;i<nv;i++) {
1098:     VecCUSPGetArrayRead(yyin[i],&yy);
1099:     yptr = thrust::raw_pointer_cast(&(*yy)[0]);
1100:     cuerr = cudaMemcpy2D(V+i,pitch,yptr,sizeof(PetscScalar),sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);
1101:     if (cuerr!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,1,"CUDA error %d",cuerr);
1102:     VecCUSPRestoreArrayRead(yyin[i],&yy);
1103:   }
1104:   VecCUSPGetArrayRead(xin,&xarray);
1105:   xptr = thrust::raw_pointer_cast(&(*xarray)[0]);
1106: #if defined(PETSC_USE_REAL_DOUBLE)
1107: #define cublasXgemv cublasDgemv
1108: #else
1109: #define cublasXgemv cublasSgemv
1110: #endif
1111:   cublasXgemv('N',nv,n,1.0,V,pitch/sizeof(PetscScalar),xptr,1,0.0,zgpu,1);
1112:   cudaMemcpy(z,zgpu,nv*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1113:   VecCUSPRestoreArrayRead(xin,&xarray);
1114:   cublasFree(V);
1115:   cublasFree(zgpu);
1116:   WaitForGPU();CHKERRCUSP(ierr);
1117:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1118:   return(0);
1119: }

1123: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1124: {
1125:   PetscErrorCode    ierr;
1126:   PetscInt          n = xin->map->n,j,j_rem;
1127:   /*Vec               yy0,yy1,yy2,yy3;*/
1128:   CUSPARRAY         *xarray,*yy0,*yy1,*yy2,*yy3;
1129:   PetscScalar       zero=0.0;
1130:   Vec               *yyin = (Vec*)yin;

1132:   thrust::tuple<PetscScalar,PetscScalar> result2;
1133:   thrust::tuple<PetscScalar,PetscScalar,PetscScalar> result3;
1134:   thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>result4;

1137:   VecCUSPGetArrayRead(xin,&xarray);
1138:   switch(j_rem=nv&0x3) {
1139:   case 3:
1140:     VecCUSPGetArrayRead(yyin[0],&yy0);
1141:     VecCUSPGetArrayRead(yyin[1],&yy1);
1142:     VecCUSPGetArrayRead(yyin[2],&yy2);
1143:     try {
1144:       result3 = thrust::transform_reduce(
1145:                      thrust::make_zip_iterator(
1146:                           thrust::make_tuple(
1147:                                    xarray->begin(),
1148:                                    yy0->begin(),
1149:                                    yy1->begin(),
1150:                                    yy2->begin())),
1151:                      thrust::make_zip_iterator(
1152:                           thrust::make_tuple(
1153:                                    xarray->end(),
1154:                                    yy0->end(),
1155:                                    yy1->end(),
1156:                                    yy2->end())),
1157:                      cuspmult3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >(),
1158:                      thrust::make_tuple(zero,zero,zero), /*init */
1159:                      cuspadd3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
1160:       z[0] = thrust::get<0>(result3);
1161:       z[1] = thrust::get<1>(result3);
1162:       z[2] = thrust::get<2>(result3);
1163:     } catch(char* ex) {
1164:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1165:     }
1166:     z    += 3;
1167:     VecCUSPRestoreArrayRead(yyin[0],&yy0);
1168:     VecCUSPRestoreArrayRead(yyin[1],&yy1);
1169:     VecCUSPRestoreArrayRead(yyin[2],&yy2);
1170:     yyin  += 3;
1171:     break;
1172:   case 2:
1173:     VecCUSPGetArrayRead(yyin[0],&yy0);
1174:     VecCUSPGetArrayRead(yyin[1],&yy1);
1175:     try {
1176:       result2 = thrust::transform_reduce(
1177:                     thrust::make_zip_iterator(
1178:                         thrust::make_tuple(
1179:                                   xarray->begin(),
1180:                                   yy0->begin(),
1181:                                   yy1->begin())),
1182:                     thrust::make_zip_iterator(
1183:                         thrust::make_tuple(
1184:                                   xarray->end(),
1185:                                   yy0->end(),
1186:                                   yy1->end())),
1187:                     cuspmult2<thrust::tuple<PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar> >(),
1188:                     thrust::make_tuple(zero,zero), /*init */
1189:                     cuspadd2<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1190:       z[0] = thrust::get<0>(result2);
1191:       z[1] = thrust::get<1>(result2);
1192:     } catch(char* ex) {
1193:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1194:     }
1195:     z    += 2;
1196:     VecCUSPRestoreArrayRead(yyin[0],&yy0);
1197:     VecCUSPRestoreArrayRead(yyin[1],&yy1);
1198:     yyin  += 2;
1199:     break;
1200:   case 1:
1201:      VecDot_SeqCUSP(xin,yyin[0],&z[0]);
1202:     z    += 1;
1203:     yyin  += 1;
1204:     break;
1205:   }
1206:   for (j=j_rem; j<nv; j+=4) {
1207:     VecCUSPGetArrayRead(yyin[0],&yy0);
1208:     VecCUSPGetArrayRead(yyin[1],&yy1);
1209:     VecCUSPGetArrayRead(yyin[2],&yy2);
1210:     VecCUSPGetArrayRead(yyin[3],&yy3);
1211:     try {
1212:       result4 = thrust::transform_reduce(
1213:                     thrust::make_zip_iterator(
1214:                         thrust::make_tuple(
1215:                                   xarray->begin(),
1216:                                   yy0->begin(),
1217:                                   yy1->begin(),
1218:                                   yy2->begin(),
1219:                                   yy3->begin())),
1220:                     thrust::make_zip_iterator(
1221:                         thrust::make_tuple(
1222:                                   xarray->end(),
1223:                                   yy0->end(),
1224:                                   yy1->end(),
1225:                                   yy2->end(),
1226:                                   yy3->end())),
1227:                      cuspmult4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >(),
1228:                      thrust::make_tuple(zero,zero,zero,zero), /*init */
1229:                      cuspadd4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
1230:       z[0] = thrust::get<0>(result4);
1231:       z[1] = thrust::get<1>(result4);
1232:       z[2] = thrust::get<2>(result4);
1233:       z[3] = thrust::get<3>(result4);
1234:     } catch(char* ex) {
1235:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1236:     }
1237:     z    += 4;
1238:     VecCUSPRestoreArrayRead(yyin[0],&yy0);
1239:     VecCUSPRestoreArrayRead(yyin[1],&yy1);
1240:     VecCUSPRestoreArrayRead(yyin[2],&yy2);
1241:     VecCUSPRestoreArrayRead(yyin[3],&yy3);
1242:     yyin  += 4;
1243:   }
1244:   WaitForGPU();CHKERRCUSP(ierr);
1245:   PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1246:   return(0);
1247: }


1252: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1253: {
1254:   CUSPARRAY      *xarray;

1258:   /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1259:   VecCUSPGetArrayWrite(xin,&xarray);
1260:   try {
1261:     cusp::blas::fill(*xarray,alpha);
1262:   } catch(char* ex) {
1263:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1264:   }
1265:   WaitForGPU();CHKERRCUSP(ierr);
1266:   VecCUSPRestoreArrayWrite(xin,&xarray);
1267:   return(0);
1268: }

1272: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1273: {
1274:   CUSPARRAY      *xarray;

1278:   if (alpha == 0.0) {
1279:     VecSet_SeqCUSP(xin,alpha);
1280:   } else if (alpha != 1.0) {
1281:     VecCUSPGetArrayReadWrite(xin,&xarray);
1282:     try {
1283:       cusp::blas::scal(*xarray,alpha);
1284:     } catch(char* ex) {
1285:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1286:     }
1287:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1288:   }
1289:   WaitForGPU();CHKERRCUSP(ierr);
1290:   PetscLogFlops(xin->map->n);
1291:   return(0);
1292: }


1297: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1298: {
1299: #if defined(PETSC_USE_COMPLEX)
1300:   PetscScalar    *ya,*xa;
1301: #endif
1302:   CUSPARRAY      *xarray,*yarray;

1306: #if defined(PETSC_USE_COMPLEX)
1307:   /*Not working for complex*/
1308: #else
1309:   VecCUSPGetArrayRead(xin,&xarray);
1310:   VecCUSPGetArrayRead(yin,&yarray);
1311:   try {
1312:     *z = cusp::blas::dot(*xarray,*yarray);
1313:   } catch(char* ex) {
1314:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1315:   }
1316: #endif
1317:   WaitForGPU();CHKERRCUSP(ierr);
1318:   if (xin->map->n > 0) {
1319:     PetscLogFlops(2.0*xin->map->n-1);
1320:   }
1321:   VecCUSPRestoreArrayRead(yin,&yarray);
1322:   VecCUSPRestoreArrayRead(xin,&xarray);
1323:   return(0);
1324: }
1327: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1328: {
1329:   CUSPARRAY      *xarray,*yarray;

1333:   if (xin != yin) {
1334:     if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1335:       VecCUSPGetArrayRead(xin,&xarray);
1336:       VecCUSPGetArrayWrite(yin,&yarray);
1337:       try {
1338:         cusp::blas::copy(*xarray,*yarray);
1339:       } catch(char* ex) {
1340:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1341:       }
1342:       WaitForGPU();CHKERRCUSP(ierr);
1343:       VecCUSPRestoreArrayRead(xin,&xarray);
1344:       VecCUSPRestoreArrayWrite(yin,&yarray);

1346:     } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1347:       /* copy in CPU if we are on the CPU*/
1348:       VecCopy_Seq(xin,yin);
1349:     } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1350:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1351:       if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1352:         /* copy in CPU */
1353:         VecCopy_Seq(xin,yin);

1355:       } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1356:         /* copy in GPU */
1357:         VecCUSPGetArrayRead(xin,&xarray);
1358:         VecCUSPGetArrayWrite(yin,&yarray);
1359:         try {
1360:           cusp::blas::copy(*xarray,*yarray);
1361:           WaitForGPU();CHKERRCUSP(ierr);
1362:         } catch(char* ex) {
1363:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1364:         }
1365:         VecCUSPRestoreArrayRead(xin,&xarray);
1366:         VecCUSPRestoreArrayWrite(yin,&yarray);
1367:       } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1368:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to VecCUSPAllocateCheck());
1369:            default to copy in GPU (this is an arbitrary choice) */
1370:         VecCUSPGetArrayRead(xin,&xarray);
1371:         VecCUSPGetArrayWrite(yin,&yarray);
1372:         try {
1373:           cusp::blas::copy(*xarray,*yarray);
1374:           WaitForGPU();CHKERRCUSP(ierr);
1375:         } catch(char* ex) {
1376:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1377:         }
1378:         VecCUSPRestoreArrayRead(xin,&xarray);
1379:         VecCUSPRestoreArrayWrite(yin,&yarray);
1380:       } else {
1381:         VecCopy_Seq(xin,yin);
1382:       }
1383:     }
1384:   }
1385:   return(0);
1386: }


1391: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1392: {
1394:   PetscBLASInt   one = 1,bn = PetscBLASIntCast(xin->map->n);
1395:   CUSPARRAY      *xarray,*yarray;

1398:   if (xin != yin) {
1399:     VecCUSPGetArrayReadWrite(xin,&xarray);
1400:     VecCUSPGetArrayReadWrite(yin,&yarray);
1401: #if defined(PETSC_USE_REAL_SINGLE)
1402:     cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1403: #else
1404:     cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1405: #endif
1406:     cublasGetError();CHKERRCUSP(ierr);
1407:     WaitForGPU();CHKERRCUSP(ierr);
1408:     VecCUSPRestoreArrayReadWrite(xin,&xarray);
1409:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1410:   }
1411:   return(0);
1412: }

1414: struct VecCUSPAX
1415: {
1416:   template <typename Tuple>
1417:   __host__ __device__
1418:   void operator()(Tuple t)
1419:   {
1420:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1421:   }
1422: };
1425: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1426: {
1427:   PetscErrorCode    ierr;
1428:   PetscScalar       a = alpha,b = beta;
1429:   CUSPARRAY         *xarray,*yarray;

1432:   if (a == 0.0) {
1433:     VecScale_SeqCUSP(yin,beta);
1434:   } else if (b == 1.0) {
1435:     VecAXPY_SeqCUSP(yin,alpha,xin);
1436:   } else if (a == 1.0) {
1437:     VecAYPX_SeqCUSP(yin,beta,xin);
1438:   } else if (b == 0.0) {
1439:     VecCUSPGetArrayRead(xin,&xarray);
1440:     VecCUSPGetArrayReadWrite(yin,&yarray);
1441:     try {
1442:       thrust::for_each(
1443:         thrust::make_zip_iterator(
1444:             thrust::make_tuple(
1445:                 yarray->begin(),
1446:                 thrust::make_constant_iterator(a),
1447:                 xarray->begin())),
1448:         thrust::make_zip_iterator(
1449:             thrust::make_tuple(
1450:                 yarray->end(),
1451:                 thrust::make_constant_iterator(a),
1452:                 xarray->end())),
1453:         VecCUSPAX());
1454:     } catch(char* ex) {
1455:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1456:     }
1457:     PetscLogFlops(xin->map->n);
1458:     WaitForGPU();CHKERRCUSP(ierr);
1459:     VecCUSPRestoreArrayRead(xin,&xarray);
1460:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1461:   } else {
1462:     VecCUSPGetArrayRead(xin,&xarray);
1463:     VecCUSPGetArrayReadWrite(yin,&yarray);
1464:     try {
1465:       cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1466:     } catch(char* ex) {
1467:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1468:     }
1469:     VecCUSPRestoreArrayRead(xin,&xarray);
1470:     VecCUSPRestoreArrayReadWrite(yin,&yarray);
1471:     WaitForGPU();CHKERRCUSP(ierr);
1472:     PetscLogFlops(3.0*xin->map->n);
1473:   }
1474:   return(0);
1475: }

1477: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1478: struct VecCUSPXPBYPCZ
1479: {
1480:   /* z = x + b*y + c*z */
1481:   template <typename Tuple>
1482:   __host__ __device__
1483:   void operator()(Tuple t)
1484:   {
1485:     thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1486:   }
1487: };
1488: struct VecCUSPAXPBYPZ
1489: {
1490:   /* z = ax + b*y + z */
1491:   template <typename Tuple>
1492:   __host__ __device__
1493:   void operator()(Tuple t)
1494:   {
1495:     thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1496:   }
1497: };

1501: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1502: {
1503:   PetscErrorCode     ierr;
1504:   PetscInt           n = zin->map->n;
1505:   CUSPARRAY          *xarray,*yarray,*zarray;

1508:   VecCUSPGetArrayRead(xin,&xarray);
1509:   VecCUSPGetArrayRead(yin,&yarray);
1510:   VecCUSPGetArrayReadWrite(zin,&zarray);
1511:   if (alpha == 1.0) {
1512:     try {
1513:       thrust::for_each(
1514:         thrust::make_zip_iterator(
1515:             thrust::make_tuple(
1516:                 zarray->begin(),
1517:                 thrust::make_constant_iterator(gamma),
1518:                 xarray->begin(),
1519:                 yarray->begin(),
1520:                 thrust::make_constant_iterator(beta))),
1521:         thrust::make_zip_iterator(
1522:             thrust::make_tuple(
1523:                 zarray->end(),
1524:                 thrust::make_constant_iterator(gamma),
1525:                 xarray->end(),
1526:                 yarray->end(),
1527:                 thrust::make_constant_iterator(beta))),
1528:         VecCUSPXPBYPCZ());
1529:     } catch(char* ex) {
1530:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1531:     }
1532:     PetscLogFlops(4.0*n);
1533:   } else if (gamma == 1.0) {
1534:     try {
1535:       thrust::for_each(
1536:         thrust::make_zip_iterator(
1537:             thrust::make_tuple(
1538:                 zarray->begin(),
1539:                 xarray->begin(),
1540:                 thrust::make_constant_iterator(alpha),
1541:                 yarray->begin(),
1542:                 thrust::make_constant_iterator(beta))),
1543:         thrust::make_zip_iterator(
1544:             thrust::make_tuple(
1545:                 zarray->end(),
1546:                 xarray->end(),
1547:                 thrust::make_constant_iterator(alpha),
1548:                 yarray->end(),
1549:                 thrust::make_constant_iterator(beta))),
1550:         VecCUSPAXPBYPZ());
1551:     } catch(char* ex) {
1552:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1553:     }
1554:     PetscLogFlops(4.0*n);
1555:   } else {
1556:     try {
1557:       cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1558:     } catch(char* ex) {
1559:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1560:     }
1561:     PetscLogFlops(5.0*n);
1562:   }
1563:   VecCUSPRestoreArrayReadWrite(zin,&zarray);
1564:   VecCUSPRestoreArrayRead(xin,&xarray);
1565:   VecCUSPRestoreArrayRead(yin,&yarray);
1566:   WaitForGPU();CHKERRCUSP(ierr);
1567:   return(0);
1568: }

1572: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1573: {
1575:   PetscInt       n = win->map->n;
1576:   CUSPARRAY      *xarray,*yarray,*warray;

1579:   VecCUSPGetArrayRead(xin,&xarray);
1580:   VecCUSPGetArrayRead(yin,&yarray);
1581:   VecCUSPGetArrayReadWrite(win,&warray);
1582:   try {
1583:     cusp::blas::xmy(*xarray,*yarray,*warray);
1584:   } catch(char* ex) {
1585:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1586:   }
1587:   VecCUSPRestoreArrayRead(xin,&xarray);
1588:   VecCUSPRestoreArrayRead(yin,&yarray);
1589:   VecCUSPRestoreArrayReadWrite(win,&warray);
1590:   PetscLogFlops(n);
1591:   WaitForGPU();CHKERRCUSP(ierr);
1592:   return(0);
1593: }


1596: /* should do infinity norm in cusp */

1600: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal* z)
1601: {
1602:   const PetscScalar *xx;
1603:   PetscErrorCode    ierr;
1604:   PetscInt          n = xin->map->n;
1605:   PetscBLASInt      one = 1, bn = PetscBLASIntCast(n);
1606:   CUSPARRAY         *xarray;

1609:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1610:     VecCUSPGetArrayRead(xin,&xarray);
1611:     try {
1612:       *z = cusp::blas::nrm2(*xarray);
1613:     } catch(char* ex) {
1614:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1615:     }
1616:     WaitForGPU();CHKERRCUSP(ierr);
1617:     VecCUSPRestoreArrayRead(xin,&xarray);
1618:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1619:   } else if (type == NORM_INFINITY) {
1620:     PetscInt     i;
1621:     PetscReal    max = 0.0,tmp;

1623:     VecGetArrayRead(xin,&xx);
1624:     for (i=0; i<n; i++) {
1625:       if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1626:       /* check special case of tmp == NaN */
1627:       if (tmp != tmp) {max = tmp; break;}
1628:       xx++;
1629:     }
1630:     VecRestoreArrayRead(xin,&xx);
1631:     *z   = max;
1632:   } else if (type == NORM_1) {
1633:     VecCUSPGetArrayRead(xin,&xarray);
1634: #if defined(PETSC_USE_REAL_SINGLE)
1635:     *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1636: #else
1637:     *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1638: #endif
1639:     cublasGetError();CHKERRCUSP(ierr);
1640:     VecCUSPRestoreArrayRead(xin,&xarray);
1641:     WaitForGPU();CHKERRCUSP(ierr);
1642:     PetscLogFlops(PetscMax(n-1.0,0.0));
1643:   } else if (type == NORM_1_AND_2) {
1644:     VecNorm_SeqCUSP(xin,NORM_1,z);
1645:     VecNorm_SeqCUSP(xin,NORM_2,z+1);
1646:   }
1647:   return(0);
1648: }


1651: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */

1655: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1656: {
1659:   VecSetRandom_Seq(xin,r);
1660:   xin->valid_GPU_array = PETSC_CUSP_CPU;
1661:   return(0);
1662: }

1666: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1667: {
1670:   VecCUSPCopyFromGPU(vin);
1671:   VecResetArray_Seq(vin);
1672:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1673:   return(0);
1674: }

1678: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1679: {
1682:   VecCUSPCopyFromGPU(vin);
1683:   VecPlaceArray_Seq(vin,a);
1684:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1685:   return(0);
1686: }


1691: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1692: {
1695:   VecCUSPCopyFromGPU(vin);
1696:   VecReplaceArray_Seq(vin,a);
1697:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1698:   return(0);
1699: }


1704: /*@
1705:    VecCreateSeqCUSP - Creates a standard, sequential array-style vector.

1707:    Collective on MPI_Comm

1709:    Input Parameter:
1710: +  comm - the communicator, should be PETSC_COMM_SELF
1711: -  n - the vector length

1713:    Output Parameter:
1714: .  V - the vector

1716:    Notes:
1717:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1718:    same type as an existing vector.

1720:    Level: intermediate

1722:    Concepts: vectors^creating sequential

1724: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1725: @*/
1726: PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1727: {

1731:   VecCreate(comm,v);
1732:   VecSetSizes(*v,n,n);
1733:   VecSetType(*v,VECSEQCUSP);
1734:   return(0);
1735: }
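/* Illustrative sketch (hypothetical example): creating a VECSEQCUSP vector with
   the helper above. */
#if 0
static PetscErrorCode ExampleCreateSeqCUSP(void)
{
  Vec v;

  VecCreateSeqCUSP(PETSC_COMM_SELF,100,&v);
  VecSet(v,1.0); /* dispatches to VecSet_SeqCUSP and runs on the GPU */
  VecDestroy(&v);
  return(0);
}
#endif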

1737: /* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written. */
1738: template <typename T>
1739: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1740: {
1741:   __host__ __device__
1742:   T operator()(T x)
1743:   {
1744:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<1>(x)*thrust::get<1>(x));
1745:   }
1746: };

1748: template <typename T>
1749: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1750: {
1751:   __host__ __device__
1752:   T operator()(T x,T y)
1753:   {
1754:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
1755:   }
1756: };

1760: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1761: {
1762:   PetscErrorCode                         ierr;
1763:   PetscScalar                            zero = 0.0;
1764:   PetscInt                               n = s->map->n;
1765:   thrust::tuple<PetscScalar,PetscScalar> result;
1766:   CUSPARRAY                              *sarray,*tarray;

1768:   /*VecCUSPCopyToGPU(s);
1769:    VecCUSPCopyToGPU(t);*/
1770:   VecCUSPGetArrayRead(s,&sarray);
1771:   VecCUSPGetArrayRead(t,&tarray);
1772:   try {
1773:     result = thrust::transform_reduce(
1774:                  thrust::make_zip_iterator(
1775:                      thrust::make_tuple(
1776:                          sarray->begin(),
1777:                          tarray->begin())),
1778:                  thrust::make_zip_iterator(
1779:                      thrust::make_tuple(
1780:                          sarray->end(),
1781:                          tarray->end())),
1782:                   cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1783:                   thrust::make_tuple(zero,zero), /*init */
1784:                   cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1785:     *dp = thrust::get<0>(result);
1786:     *nm = thrust::get<1>(result);
1787:   } catch(char* ex) {
1788:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1789:   }
1790:   VecCUSPRestoreArrayRead(s,&sarray);
1791:   VecCUSPRestoreArrayRead(t,&tarray);
1792:   WaitForGPU();CHKERRCUSP(ierr);
1793:   PetscLogFlops(4.0*n);
1794:   return(0);
1795: }
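/* Illustrative sketch (hypothetical example): VecDotNorm2_SeqCUSP() fuses two
   reductions into one pass; for real scalars it returns dp = sum_i s[i]*t[i]
   and nm = sum_i t[i]*t[i], i.e. (s,t) and ||t||^2. */
#if 0
static PetscErrorCode ExampleDotNorm2(Vec s,Vec t)
{
  PetscScalar dp,nm;

  VecDotNorm2_SeqCUSP(s,t,&dp,&nm); /* one kernel instead of VecDot() + VecNorm() */
  return(0);
}
#endif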

1799: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1800: {

1804:   VecCreateSeqCUSP(((PetscObject)win)->comm,win->map->n,V);
1805:   PetscLayoutReference(win->map,&(*V)->map);
1806:   PetscOListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1807:   PetscFListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1808:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1809:   return(0);
1810: }

1814: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1815: {
1818:   try {
1819:     if (v->spptr) {
1820: #ifdef PETSC_HAVE_TXPETSCGPU
1821:       if (((Vec_CUSP *)v->spptr)->GPUvector)
1822:         delete ((Vec_CUSP *)v->spptr)->GPUvector;
1823:       Vec_Seq        *s;
1824:       s = (Vec_Seq*)v->data;
1825:       s->array = PETSC_NULL;
1826:       s->array_allocated = PETSC_NULL;
1827: #endif
1828:       delete ((Vec_CUSP *)v->spptr)->GPUarray;
1829:       delete (Vec_CUSP *)v->spptr;
1830:     }
1831:   } catch(char* ex) {
1832:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1833:   }
1834:   VecDestroy_Seq(v);
1835:   return(0);
1836: }

1838: EXTERN_C_BEGIN
1841: PetscErrorCode  VecCreate_SeqCUSP(Vec V)
1842: {
1844:   PetscMPIInt    size;

1847:   MPI_Comm_size(((PetscObject)V)->comm,&size);
1848:   if  (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1849:   VecCreate_Seq_Private(V,0);
1850:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
1851:   V->ops->dot             = VecDot_SeqCUSP;
1852:   V->ops->norm            = VecNorm_SeqCUSP;
1853:   V->ops->tdot            = VecTDot_SeqCUSP;
1854:   V->ops->scale           = VecScale_SeqCUSP;
1855:   V->ops->copy            = VecCopy_SeqCUSP;
1856:   V->ops->set             = VecSet_SeqCUSP;
1857:   V->ops->swap            = VecSwap_SeqCUSP;
1858:   V->ops->axpy            = VecAXPY_SeqCUSP;
1859:   V->ops->axpby           = VecAXPBY_SeqCUSP;
1860:   V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
1861:   V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
1862:   V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1863:   V->ops->setrandom       = VecSetRandom_SeqCUSP;
1864:   V->ops->dot_local       = VecDot_SeqCUSP;
1865:   V->ops->tdot_local      = VecTDot_SeqCUSP;
1866:   V->ops->norm_local      = VecNorm_SeqCUSP;
1867:   V->ops->mdot_local      = VecMDot_SeqCUSP_CUBLAS;
1868:   V->ops->maxpy           = VecMAXPY_SeqCUSP;
1869:   V->ops->mdot            = VecMDot_SeqCUSP_CUBLAS;
1870:   V->ops->aypx            = VecAYPX_SeqCUSP;
1871:   V->ops->waxpy           = VecWAXPY_SeqCUSP;
1872:   V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
1873:   V->ops->placearray      = VecPlaceArray_SeqCUSP;
1874:   V->ops->replacearray    = VecReplaceArray_SeqCUSP;
1875:   V->ops->resetarray      = VecResetArray_SeqCUSP;
1876:   V->ops->destroy         = VecDestroy_SeqCUSP;
1877:   V->ops->duplicate       = VecDuplicate_SeqCUSP;

1879:   VecCUSPAllocateCheck(V);
1880:   V->valid_GPU_array      = PETSC_CUSP_GPU;
1881:   VecSet(V,0.0);
1882:   return(0);
1883: }
1884: EXTERN_C_END