Actual source code: vecviennacl.cxx

petsc-master 2019-06-15
Report Typos and Errors
  1: /*
  2:    Implements the sequential ViennaCL vectors.
  3: */

  5: #include <petscconf.h>
  6:  #include <petsc/private/vecimpl.h>
  7:  #include <../src/vec/vec/impls/dvecimpl.h>
  8:  #include <../src/vec/vec/impls/seq/seqviennacl/viennaclvecimpl.h>

 10: #include <vector>

 12: #include "viennacl/linalg/inner_prod.hpp"
 13: #include "viennacl/linalg/norm_1.hpp"
 14: #include "viennacl/linalg/norm_2.hpp"
 15: #include "viennacl/linalg/norm_inf.hpp"

 17: #ifdef VIENNACL_WITH_OPENCL
 18: #include "viennacl/ocl/backend.hpp"
 19: #endif


 22: PETSC_EXTERN PetscErrorCode VecViennaCLGetArray(Vec v, ViennaCLVector **a)
 23: {

 28:   *a   = 0;
 29:   VecViennaCLCopyToGPU(v);
 30:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 31:   ViennaCLWaitForGPU();
 32:   return(0);
 33: }

 35: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArray(Vec v, ViennaCLVector **a)
 36: {

 41:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

 43:   PetscObjectStateIncrease((PetscObject)v);
 44:   return(0);
 45: }

 47: PETSC_EXTERN PetscErrorCode VecViennaCLGetArrayRead(Vec v, const ViennaCLVector **a)
 48: {

 53:   *a   = 0;
 54:   VecViennaCLCopyToGPU(v);
 55:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 56:   ViennaCLWaitForGPU();
 57:   return(0);
 58: }

 60: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArrayRead(Vec v, const ViennaCLVector **a)
 61: {
 64:   return(0);
 65: }

 67: PETSC_EXTERN PetscErrorCode VecViennaCLGetArrayWrite(Vec v, ViennaCLVector **a)
 68: {

 73:   *a   = 0;
 74:   VecViennaCLAllocateCheck(v);
 75:   *a   = ((Vec_ViennaCL*)v->spptr)->GPUarray;
 76:   ViennaCLWaitForGPU();
 77:   return(0);
 78: }

 80: PETSC_EXTERN PetscErrorCode VecViennaCLRestoreArrayWrite(Vec v, ViennaCLVector **a)
 81: {

 86:   v->valid_GPU_array = PETSC_OFFLOAD_GPU;

 88:   PetscObjectStateIncrease((PetscObject)v);
 89:   return(0);
 90: }



 94: PETSC_EXTERN PetscErrorCode PetscViennaCLInit()
 95: {
 96:   PetscErrorCode       ierr;
 97:   char                 string[20];
 98:   PetscBool            flg,flg_cuda,flg_opencl,flg_openmp;

101:   /* ViennaCL backend selection: CUDA, OpenCL, or OpenMP */
102:   PetscOptionsGetString(NULL,NULL,"-viennacl_backend",string,12,&flg);
103:   if (flg) {
104:     try {
105:       PetscStrcasecmp(string,"cuda",&flg_cuda);
106:       PetscStrcasecmp(string,"opencl",&flg_opencl);
107:       PetscStrcasecmp(string,"openmp",&flg_openmp);

109:       /* A default (sequential) CPU backend is always available - even if OpenMP is not enabled. */
110:       if (flg_openmp) viennacl::backend::default_memory_type(viennacl::MAIN_MEMORY);
111: #if defined(PETSC_HAVE_CUDA)
112:       else if (flg_cuda) viennacl::backend::default_memory_type(viennacl::CUDA_MEMORY);
113: #endif
114: #if defined(PETSC_HAVE_OPENCL)
115:       else if (flg_opencl) viennacl::backend::default_memory_type(viennacl::OPENCL_MEMORY);
116: #endif
117:       else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: Backend not recognized or available: %s.\n Pass -viennacl_view to see available backends for ViennaCL.\n", string);
118:     } catch (std::exception const & ex) {
119:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
120:     }
121:   }

123: #if defined(PETSC_HAVE_OPENCL)
124:   /* ViennaCL OpenCL device type configuration */
125:   PetscOptionsGetString(NULL,NULL,"-viennacl_opencl_device_type",string,12,&flg);
126:   if (flg) {
127:     try {
128:       PetscStrcasecmp(string,"cpu",&flg);
129:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_CPU);

131:       PetscStrcasecmp(string,"gpu",&flg);
132:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_GPU);

134:       PetscStrcasecmp(string,"accelerator",&flg);
135:       if (flg) viennacl::ocl::set_context_device_type(0, CL_DEVICE_TYPE_ACCELERATOR);
136:     } catch (std::exception const & ex) {
137:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
138:     }
139:   }
140: #endif

142:   /* Print available backends */
143:   PetscOptionsHasName(NULL,NULL,"-viennacl_view",&flg);
144:   if (flg) {
145:     PetscPrintf(PETSC_COMM_WORLD, "ViennaCL backends available: ");
146: #if defined(PETSC_HAVE_CUDA)
147:     PetscPrintf(PETSC_COMM_WORLD, "CUDA, ");
148: #endif
149: #if defined(PETSC_HAVE_OPENCL)
150:     PetscPrintf(PETSC_COMM_WORLD, "OpenCL, ");
151: #endif
152: #if defined(PETSC_HAVE_OPENMP)
153:     PetscPrintf(PETSC_COMM_WORLD, "OpenMP ");
154: #else
155:     PetscPrintf(PETSC_COMM_WORLD, "OpenMP (1 thread) ");
156: #endif
157:     PetscPrintf(PETSC_COMM_WORLD, "\n");

159:     /* Print selected backends */
160:     PetscPrintf(PETSC_COMM_WORLD, "ViennaCL backend  selected: ");
161: #if defined(PETSC_HAVE_CUDA)
162:     if (viennacl::backend::default_memory_type() == viennacl::CUDA_MEMORY) {
163:       PetscPrintf(PETSC_COMM_WORLD, "CUDA ");
164:     }
165: #endif
166: #if defined(PETSC_HAVE_OPENCL)
167:     if (viennacl::backend::default_memory_type() == viennacl::OPENCL_MEMORY) {
168:       PetscPrintf(PETSC_COMM_WORLD, "OpenCL ");
169:     }
170: #endif
171: #if defined(PETSC_HAVE_OPENMP)
172:     if (viennacl::backend::default_memory_type() == viennacl::MAIN_MEMORY) {
173:       PetscPrintf(PETSC_COMM_WORLD, "OpenMP ");
174:     }
175: #else
176:     if (viennacl::backend::default_memory_type() == viennacl::MAIN_MEMORY) {
177:       PetscPrintf(PETSC_COMM_WORLD, "OpenMP (sequential - consider reconfiguration: --with-openmp=1) ");
178:     }
179: #endif
180:     PetscPrintf(PETSC_COMM_WORLD, "\n");
181:   }
182:   return(0);
183: }

185: /*
186:     Allocates space for the vector array on the Host if it does not exist.
187:     Does NOT change the PetscViennaCLFlag for the vector
188:     Does NOT zero the ViennaCL array
189:  */
190: PETSC_EXTERN PetscErrorCode VecViennaCLAllocateCheckHost(Vec v)
191: {
193:   PetscScalar    *array;
194:   Vec_Seq        *s;
195:   PetscInt       n = v->map->n;

198:   s    = (Vec_Seq*)v->data;
199:   VecViennaCLAllocateCheck(v);
200:   if (s->array == 0) {
201:     PetscMalloc1(n,&array);
202:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
203:     s->array           = array;
204:     s->array_allocated = array;
205:   }
206:   return(0);
207: }


210: /*
211:     Allocates space for the vector array on the GPU if it does not exist.
212:     Does NOT change the PetscViennaCLFlag for the vector
213:     Does NOT zero the ViennaCL array

215:  */
216: PetscErrorCode VecViennaCLAllocateCheck(Vec v)
217: {
219:   int            rank;

222:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
223:   // First allocate memory on the GPU if needed
224:   if (!v->spptr) {
225:     try {
226:       v->spptr                            = new Vec_ViennaCL;
227:       ((Vec_ViennaCL*)v->spptr)->GPUarray = new ViennaCLVector((PetscBLASInt)v->map->n);

229:     } catch(std::exception const & ex) {
230:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
231:     }
232:   }
233:   return(0);
234: }


237: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
238: PetscErrorCode VecViennaCLCopyToGPU(Vec v)
239: {

244:   VecViennaCLAllocateCheck(v);
245:   if (v->map->n > 0) {
246:     if (v->valid_GPU_array == PETSC_OFFLOAD_CPU) {
247:       PetscLogEventBegin(VEC_ViennaCLCopyToGPU,v,0,0,0);
248:       try {
249:         ViennaCLVector *vec = ((Vec_ViennaCL*)v->spptr)->GPUarray;
250:         viennacl::fast_copy(*(PetscScalar**)v->data, *(PetscScalar**)v->data + v->map->n, vec->begin());
251:         ViennaCLWaitForGPU();
252:       } catch(std::exception const & ex) {
253:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
254:       }
255:       PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
256:       PetscLogEventEnd(VEC_ViennaCLCopyToGPU,v,0,0,0);
257:       v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
258:     }
259:   }
260:   return(0);
261: }



265: /*
266:      VecViennaCLCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
267: */
268: PetscErrorCode VecViennaCLCopyFromGPU(Vec v)
269: {

274:   VecViennaCLAllocateCheckHost(v);
275:   if (v->valid_GPU_array == PETSC_OFFLOAD_GPU) {
276:     PetscLogEventBegin(VEC_ViennaCLCopyFromGPU,v,0,0,0);
277:     try {
278:       ViennaCLVector *vec = ((Vec_ViennaCL*)v->spptr)->GPUarray;
279:       viennacl::fast_copy(vec->begin(),vec->end(),*(PetscScalar**)v->data);
280:       ViennaCLWaitForGPU();
281:     } catch(std::exception const & ex) {
282:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
283:     }
284:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
285:     PetscLogEventEnd(VEC_ViennaCLCopyFromGPU,v,0,0,0);
286:     v->valid_GPU_array = PETSC_OFFLOAD_BOTH;
287:   }
288:   return(0);
289: }


292: /* Copy on CPU */
293: static PetscErrorCode VecCopy_SeqViennaCL_Private(Vec xin,Vec yin)
294: {
295:   PetscScalar       *ya;
296:   const PetscScalar *xa;
297:   PetscErrorCode    ierr;

300:   VecViennaCLAllocateCheckHost(xin);
301:   VecViennaCLAllocateCheckHost(yin);
302:   if (xin != yin) {
303:     VecGetArrayRead(xin,&xa);
304:     VecGetArray(yin,&ya);
305:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
306:     VecRestoreArrayRead(xin,&xa);
307:     VecRestoreArray(yin,&ya);
308:   }
309:   return(0);
310: }

312: static PetscErrorCode VecSetRandom_SeqViennaCL_Private(Vec xin,PetscRandom r)
313: {
315:   PetscInt       n = xin->map->n,i;
316:   PetscScalar    *xx;

319:   VecGetArray(xin,&xx);
320:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
321:   VecRestoreArray(xin,&xx);
322:   return(0);
323: }

325: static PetscErrorCode VecDestroy_SeqViennaCL_Private(Vec v)
326: {
327:   Vec_Seq        *vs = (Vec_Seq*)v->data;

331:   PetscObjectSAWsViewOff(v);
332: #if defined(PETSC_USE_LOG)
333:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
334: #endif
335:   if (vs->array_allocated) PetscFree(vs->array_allocated);
336:   PetscFree(vs);
337:   return(0);
338: }

340: static PetscErrorCode VecResetArray_SeqViennaCL_Private(Vec vin)
341: {
342:   Vec_Seq *v = (Vec_Seq*)vin->data;

345:   v->array         = v->unplacedarray;
346:   v->unplacedarray = 0;
347:   return(0);
348: }


351: /*MC
352:    VECSEQVIENNACL - VECSEQVIENNACL = "seqviennacl" - The basic sequential vector, modified to use ViennaCL

354:    Options Database Keys:
355: . -vec_type seqviennacl - sets the vector type to VECSEQVIENNACL during a call to VecSetFromOptions()

357:   Level: beginner

359: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
360: M*/


363: PetscErrorCode VecAYPX_SeqViennaCL(Vec yin, PetscScalar alpha, Vec xin)
364: {
365:   const ViennaCLVector  *xgpu;
366:   ViennaCLVector        *ygpu;
367:   PetscErrorCode        ierr;

370:   VecViennaCLGetArrayRead(xin,&xgpu);
371:   VecViennaCLGetArray(yin,&ygpu);
372:   try {
373:     if (alpha != 0.0 && xin->map->n > 0) {
374:       *ygpu = *xgpu + alpha * *ygpu;
375:       PetscLogFlops(2.0*yin->map->n);
376:     } else {
377:       *ygpu = *xgpu;
378:     }
379:     ViennaCLWaitForGPU();
380:   } catch(std::exception const & ex) {
381:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
382:   }
383:   VecViennaCLRestoreArrayRead(xin,&xgpu);
384:   VecViennaCLRestoreArray(yin,&ygpu);
385:   return(0);
386: }


389: PetscErrorCode VecAXPY_SeqViennaCL(Vec yin,PetscScalar alpha,Vec xin)
390: {
391:   const ViennaCLVector  *xgpu;
392:   ViennaCLVector        *ygpu;
393:   PetscErrorCode        ierr;

396:   if (alpha != 0.0 && xin->map->n > 0) {
397:     VecViennaCLGetArrayRead(xin,&xgpu);
398:     VecViennaCLGetArray(yin,&ygpu);
399:     try {
400:       *ygpu += alpha * *xgpu;
401:       ViennaCLWaitForGPU();
402:     } catch(std::exception const & ex) {
403:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
404:     }
405:     VecViennaCLRestoreArrayRead(xin,&xgpu);
406:     VecViennaCLRestoreArray(yin,&ygpu);
407:     PetscLogFlops(2.0*yin->map->n);
408:   }
409:   return(0);
410: }


413: PetscErrorCode VecPointwiseDivide_SeqViennaCL(Vec win, Vec xin, Vec yin)
414: {
415:   const ViennaCLVector  *xgpu,*ygpu;
416:   ViennaCLVector        *wgpu;
417:   PetscErrorCode        ierr;

420:   if (xin->map->n > 0) {
421:     VecViennaCLGetArrayRead(xin,&xgpu);
422:     VecViennaCLGetArrayRead(yin,&ygpu);
423:     VecViennaCLGetArrayWrite(win,&wgpu);
424:     try {
425:       *wgpu = viennacl::linalg::element_div(*xgpu, *ygpu);
426:       ViennaCLWaitForGPU();
427:     } catch(std::exception const & ex) {
428:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
429:     }
430:     PetscLogFlops(win->map->n);
431:     VecViennaCLRestoreArrayRead(xin,&xgpu);
432:     VecViennaCLRestoreArrayRead(yin,&ygpu);
433:     VecViennaCLRestoreArrayWrite(win,&wgpu);
434:   }
435:   return(0);
436: }


439: PetscErrorCode VecWAXPY_SeqViennaCL(Vec win,PetscScalar alpha,Vec xin, Vec yin)
440: {
441:   const ViennaCLVector  *xgpu,*ygpu;
442:   ViennaCLVector        *wgpu;
443:   PetscErrorCode        ierr;

446:   if (alpha == 0.0 && xin->map->n > 0) {
447:     VecCopy_SeqViennaCL(yin,win);
448:   } else {
449:     VecViennaCLGetArrayRead(xin,&xgpu);
450:     VecViennaCLGetArrayRead(yin,&ygpu);
451:     VecViennaCLGetArrayWrite(win,&wgpu);
452:     if (alpha == 1.0) {
453:       try {
454:         *wgpu = *ygpu + *xgpu;
455:       } catch(std::exception const & ex) {
456:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
457:       }
458:       PetscLogFlops(win->map->n);
459:     } else if (alpha == -1.0) {
460:       try {
461:         *wgpu = *ygpu - *xgpu;
462:       } catch(std::exception const & ex) {
463:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
464:       }
465:       PetscLogFlops(win->map->n);
466:     } else {
467:       try {
468:         *wgpu = *ygpu + alpha * *xgpu;
469:       } catch(std::exception const & ex) {
470:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
471:       }
472:       PetscLogFlops(2*win->map->n);
473:     }
474:     ViennaCLWaitForGPU();
475:     VecViennaCLRestoreArrayRead(xin,&xgpu);
476:     VecViennaCLRestoreArrayRead(yin,&ygpu);
477:     VecViennaCLRestoreArrayWrite(win,&wgpu);
478:   }
479:   return(0);
480: }


483: /*
484:  * Operation x = x + sum_i alpha_i * y_i for vectors x, y_i and scalars alpha_i
485:  *
486:  * ViennaCL supports a fast evaluation of x += alpha * y and x += alpha * y + beta * z,
487:  * hence there is an iterated application of these until the final result is obtained
488:  */
489: PetscErrorCode VecMAXPY_SeqViennaCL(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
490: {
492:   PetscInt       j;

495:   for (j = 0; j < nv; ++j) {
496:     if (j+1 < nv) {
497:       VecAXPBYPCZ_SeqViennaCL(xin,alpha[j],alpha[j+1],1.0,y[j],y[j+1]);
498:       ++j;
499:     } else {
500:       VecAXPY_SeqViennaCL(xin,alpha[j],y[j]);
501:     }
502:   }
503:   ViennaCLWaitForGPU();
504:   return(0);
505: }


508: PetscErrorCode VecDot_SeqViennaCL(Vec xin,Vec yin,PetscScalar *z)
509: {
510:   const ViennaCLVector  *xgpu,*ygpu;
511:   PetscErrorCode        ierr;

514:   if (xin->map->n > 0) {
515:     VecViennaCLGetArrayRead(xin,&xgpu);
516:     VecViennaCLGetArrayRead(yin,&ygpu);
517:     try {
518:       *z = viennacl::linalg::inner_prod(*xgpu,*ygpu);
519:     } catch(std::exception const & ex) {
520:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
521:     }
522:     if (xin->map->n >0) {
523:       PetscLogFlops(2.0*xin->map->n-1);
524:     }
525:     ViennaCLWaitForGPU();
526:     VecViennaCLRestoreArrayRead(xin,&xgpu);
527:     VecViennaCLRestoreArrayRead(yin,&ygpu);
528:   } else *z = 0.0;
529:   return(0);
530: }



534: /*
535:  * Operation z[j] = dot(x, y[j])
536:  *
537:  * We use an iterated application of dot() for each j. For small ranges of j this is still faster than an allocation of extra memory in order to use gemv().
538:  */
539: PetscErrorCode VecMDot_SeqViennaCL(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
540: {
541:   PetscErrorCode       ierr;
542:   PetscInt             n = xin->map->n,i;
543:   const ViennaCLVector *xgpu,*ygpu;
544:   Vec                  *yyin = (Vec*)yin;
545:   std::vector<viennacl::vector_base<PetscScalar> const *> ygpu_array(nv);

548:   if (xin->map->n > 0) {
549:     VecViennaCLGetArrayRead(xin,&xgpu);
550:     for (i=0; i<nv; i++) {
551:       VecViennaCLGetArrayRead(yyin[i],&ygpu);
552:       ygpu_array[i] = ygpu;
553:     }

555:     viennacl::vector_tuple<PetscScalar> y_tuple(ygpu_array);
556:     ViennaCLVector result = viennacl::linalg::inner_prod(*xgpu, y_tuple);

558:     viennacl::copy(result.begin(), result.end(), z);

560:     for (i=0; i<nv; i++) {
561:       VecViennaCLRestoreArrayRead(yyin[i],&ygpu);
562:     }

564:     ViennaCLWaitForGPU();
565:     VecViennaCLRestoreArrayRead(xin,&xgpu);
566:     PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
567:   } else {
568:     for (i=0; i<nv; i++) z[i] = 0.0;
569:   }
570:   return(0);
571: }

573: PetscErrorCode VecMTDot_SeqViennaCL(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
574: {

578:   /* Since complex case is not supported at the moment, this is the same as VecMDot_SeqViennaCL */
579:   VecMDot_SeqViennaCL(xin,nv,yin,z);
580:   ViennaCLWaitForGPU();
581:   return(0);
582: }


585: PetscErrorCode VecSet_SeqViennaCL(Vec xin,PetscScalar alpha)
586: {
587:   ViennaCLVector *xgpu;

591:   if (xin->map->n > 0) {
592:     VecViennaCLGetArrayWrite(xin,&xgpu);
593:     try {
594:       *xgpu = viennacl::scalar_vector<PetscScalar>(xgpu->size(), alpha);
595:       ViennaCLWaitForGPU();
596:     } catch(std::exception const & ex) {
597:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
598:     }
599:     VecViennaCLRestoreArrayWrite(xin,&xgpu);
600:   }
601:   return(0);
602: }

604: PetscErrorCode VecScale_SeqViennaCL(Vec xin, PetscScalar alpha)
605: {
606:   ViennaCLVector *xgpu;

610:   if (alpha == 0.0 && xin->map->n > 0) {
611:     VecSet_SeqViennaCL(xin,alpha);
612:     PetscLogFlops(xin->map->n);
613:   } else if (alpha != 1.0 && xin->map->n > 0) {
614:     VecViennaCLGetArray(xin,&xgpu);
615:     try {
616:       *xgpu *= alpha;
617:       ViennaCLWaitForGPU();
618:     } catch(std::exception const & ex) {
619:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
620:     }
621:     VecViennaCLRestoreArray(xin,&xgpu);
622:     PetscLogFlops(xin->map->n);
623:   }
624:   return(0);
625: }


628: PetscErrorCode VecTDot_SeqViennaCL(Vec xin,Vec yin,PetscScalar *z)
629: {

633:   /* Since complex case is not supported at the moment, this is the same as VecDot_SeqViennaCL */
634:   VecDot_SeqViennaCL(xin, yin, z);
635:   ViennaCLWaitForGPU();
636:   return(0);
637: }


640: PetscErrorCode VecCopy_SeqViennaCL(Vec xin,Vec yin)
641: {
642:   const ViennaCLVector *xgpu;
643:   ViennaCLVector       *ygpu;
644:   PetscErrorCode       ierr;

647:   if (xin != yin && xin->map->n > 0) {
648:     if (xin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
649:       VecViennaCLGetArrayRead(xin,&xgpu);
650:       VecViennaCLGetArrayWrite(yin,&ygpu);
651:       try {
652:         *ygpu = *xgpu;
653:         ViennaCLWaitForGPU();
654:       } catch(std::exception const & ex) {
655:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
656:       }
657:       VecViennaCLRestoreArrayRead(xin,&xgpu);
658:       VecViennaCLRestoreArrayWrite(yin,&ygpu);

660:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
661:       /* copy in CPU if we are on the CPU*/
662:       VecCopy_SeqViennaCL_Private(xin,yin);
663:       ViennaCLWaitForGPU();
664:     } else if (xin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
665:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
666:       if (yin->valid_GPU_array == PETSC_OFFLOAD_CPU) {
667:         /* copy in CPU */
668:         VecCopy_SeqViennaCL_Private(xin,yin);
669:         ViennaCLWaitForGPU();
670:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_GPU) {
671:         /* copy in GPU */
672:         VecViennaCLGetArrayRead(xin,&xgpu);
673:         VecViennaCLGetArrayWrite(yin,&ygpu);
674:         try {
675:           *ygpu = *xgpu;
676:           ViennaCLWaitForGPU();
677:         } catch(std::exception const & ex) {
678:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
679:         }
680:         VecViennaCLRestoreArrayRead(xin,&xgpu);
681:         VecViennaCLRestoreArrayWrite(yin,&ygpu);
682:       } else if (yin->valid_GPU_array == PETSC_OFFLOAD_BOTH) {
683:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
684:            default to copy in GPU (this is an arbitrary choice) */
685:         VecViennaCLGetArrayRead(xin,&xgpu);
686:         VecViennaCLGetArrayWrite(yin,&ygpu);
687:         try {
688:           *ygpu = *xgpu;
689:           ViennaCLWaitForGPU();
690:         } catch(std::exception const & ex) {
691:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
692:         }
693:         VecViennaCLRestoreArrayRead(xin,&xgpu);
694:         VecViennaCLRestoreArrayWrite(yin,&ygpu);
695:       } else {
696:         VecCopy_SeqViennaCL_Private(xin,yin);
697:         ViennaCLWaitForGPU();
698:       }
699:     }
700:   }
701:   return(0);
702: }


705: PetscErrorCode VecSwap_SeqViennaCL(Vec xin,Vec yin)
706: {
708:   ViennaCLVector *xgpu,*ygpu;

711:   if (xin != yin && xin->map->n > 0) {
712:     VecViennaCLGetArray(xin,&xgpu);
713:     VecViennaCLGetArray(yin,&ygpu);

715:     try {
716:       viennacl::swap(*xgpu, *ygpu);
717:       ViennaCLWaitForGPU();
718:     } catch(std::exception const & ex) {
719:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
720:     }
721:     VecViennaCLRestoreArray(xin,&xgpu);
722:     VecViennaCLRestoreArray(yin,&ygpu);
723:   }
724:   return(0);
725: }


728: // y = alpha * x + beta * y
729: PetscErrorCode VecAXPBY_SeqViennaCL(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
730: {
731:   PetscErrorCode       ierr;
732:   PetscScalar          a = alpha,b = beta;
733:   const ViennaCLVector *xgpu;
734:   ViennaCLVector       *ygpu;

737:   if (a == 0.0 && xin->map->n > 0) {
738:     VecScale_SeqViennaCL(yin,beta);
739:   } else if (b == 1.0 && xin->map->n > 0) {
740:     VecAXPY_SeqViennaCL(yin,alpha,xin);
741:   } else if (a == 1.0 && xin->map->n > 0) {
742:     VecAYPX_SeqViennaCL(yin,beta,xin);
743:   } else if (b == 0.0 && xin->map->n > 0) {
744:     VecViennaCLGetArrayRead(xin,&xgpu);
745:     VecViennaCLGetArray(yin,&ygpu);
746:     try {
747:       *ygpu = *xgpu * alpha;
748:       ViennaCLWaitForGPU();
749:     } catch(std::exception const & ex) {
750:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
751:     }
752:     PetscLogFlops(xin->map->n);
753:     VecViennaCLRestoreArrayRead(xin,&xgpu);
754:     VecViennaCLRestoreArray(yin,&ygpu);
755:   } else if (xin->map->n > 0) {
756:     VecViennaCLGetArrayRead(xin,&xgpu);
757:     VecViennaCLGetArray(yin,&ygpu);
758:     try {
759:       *ygpu = *xgpu * alpha + *ygpu * beta;
760:       ViennaCLWaitForGPU();
761:     } catch(std::exception const & ex) {
762:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
763:     }
764:     VecViennaCLRestoreArrayRead(xin,&xgpu);
765:     VecViennaCLRestoreArray(yin,&ygpu);
766:     PetscLogFlops(3.0*xin->map->n);
767:   }
768:   return(0);
769: }


772: /* operation  z = alpha * x + beta *y + gamma *z*/
773: PetscErrorCode VecAXPBYPCZ_SeqViennaCL(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
774: {
775:   PetscErrorCode       ierr;
776:   PetscInt             n = zin->map->n;
777:   const ViennaCLVector *xgpu,*ygpu;
778:   ViennaCLVector       *zgpu;

781:   VecViennaCLGetArrayRead(xin,&xgpu);
782:   VecViennaCLGetArrayRead(yin,&ygpu);
783:   VecViennaCLGetArray(zin,&zgpu);
784:   if (alpha == 0.0 && xin->map->n > 0) {
785:     try {
786:       if (beta == 0.0) {
787:         *zgpu = gamma * *zgpu;
788:         ViennaCLWaitForGPU();
789:         PetscLogFlops(1.0*n);
790:       } else if (gamma == 0.0) {
791:         *zgpu = beta * *ygpu;
792:         ViennaCLWaitForGPU();
793:         PetscLogFlops(1.0*n);
794:       } else {
795:         *zgpu = beta * *ygpu + gamma * *zgpu;
796:         ViennaCLWaitForGPU();
797:         PetscLogFlops(3.0*n);
798:       }
799:     } catch(std::exception const & ex) {
800:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
801:     }
802:     PetscLogFlops(3.0*n);
803:   } else if (beta == 0.0 && xin->map->n > 0) {
804:     try {
805:       if (gamma == 0.0) {
806:         *zgpu = alpha * *xgpu;
807:         ViennaCLWaitForGPU();
808:         PetscLogFlops(1.0*n);
809:       } else {
810:         *zgpu = alpha * *xgpu + gamma * *zgpu;
811:         ViennaCLWaitForGPU();
812:         PetscLogFlops(3.0*n);
813:       }
814:     } catch(std::exception const & ex) {
815:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
816:     }
817:   } else if (gamma == 0.0 && xin->map->n > 0) {
818:     try {
819:       *zgpu = alpha * *xgpu + beta * *ygpu;
820:       ViennaCLWaitForGPU();
821:     } catch(std::exception const & ex) {
822:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
823:     }
824:     PetscLogFlops(3.0*n);
825:   } else if (xin->map->n > 0) {
826:     try {
827:       /* Split operation into two steps. This is not completely ideal, but avoids temporaries (which are far worse) */
828:       if (gamma != 1.0)
829:         *zgpu *= gamma;
830:       *zgpu += alpha * *xgpu + beta * *ygpu;
831:       ViennaCLWaitForGPU();
832:     } catch(std::exception const & ex) {
833:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
834:     }
835:     VecViennaCLRestoreArray(zin,&zgpu);
836:     VecViennaCLRestoreArrayRead(xin,&xgpu);
837:     VecViennaCLRestoreArrayRead(yin,&ygpu);
838:     PetscLogFlops(5.0*n);
839:   }
840:   return(0);
841: }

843: PetscErrorCode VecPointwiseMult_SeqViennaCL(Vec win,Vec xin,Vec yin)
844: {
845:   PetscErrorCode       ierr;
846:   PetscInt             n = win->map->n;
847:   const ViennaCLVector *xgpu,*ygpu;
848:   ViennaCLVector       *wgpu;

851:   if (xin->map->n > 0) {
852:     VecViennaCLGetArrayRead(xin,&xgpu);
853:     VecViennaCLGetArrayRead(yin,&ygpu);
854:     VecViennaCLGetArray(win,&wgpu);
855:     try {
856:       *wgpu = viennacl::linalg::element_prod(*xgpu, *ygpu);
857:       ViennaCLWaitForGPU();
858:     } catch(std::exception const & ex) {
859:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
860:     }
861:     VecViennaCLRestoreArrayRead(xin,&xgpu);
862:     VecViennaCLRestoreArrayRead(yin,&ygpu);
863:     VecViennaCLRestoreArray(win,&wgpu);
864:     PetscLogFlops(n);
865:   }
866:   return(0);
867: }


870: PetscErrorCode VecNorm_SeqViennaCL(Vec xin,NormType type,PetscReal *z)
871: {
872:   PetscErrorCode       ierr;
873:   PetscInt             n = xin->map->n;
874:   PetscBLASInt         bn;
875:   const ViennaCLVector *xgpu;

878:   if (xin->map->n > 0) {
879:     PetscBLASIntCast(n,&bn);
880:     VecViennaCLGetArrayRead(xin,&xgpu);
881:     if (type == NORM_2 || type == NORM_FROBENIUS) {
882:       try {
883:         *z = viennacl::linalg::norm_2(*xgpu);
884:         ViennaCLWaitForGPU();
885:       } catch(std::exception const & ex) {
886:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
887:       }
888:       PetscLogFlops(PetscMax(2.0*n-1,0.0));
889:     } else if (type == NORM_INFINITY) {
890:       VecViennaCLGetArrayRead(xin,&xgpu);
891:       try {
892:         *z = viennacl::linalg::norm_inf(*xgpu);
893:         ViennaCLWaitForGPU();
894:       } catch(std::exception const & ex) {
895:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
896:       }
897:       VecViennaCLRestoreArrayRead(xin,&xgpu);
898:     } else if (type == NORM_1) {
899:       try {
900:         *z = viennacl::linalg::norm_1(*xgpu);
901:         ViennaCLWaitForGPU();
902:       } catch(std::exception const & ex) {
903:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
904:       }
905:       PetscLogFlops(PetscMax(n-1.0,0.0));
906:     } else if (type == NORM_1_AND_2) {
907:       try {
908:         *z     = viennacl::linalg::norm_1(*xgpu);
909:         *(z+1) = viennacl::linalg::norm_2(*xgpu);
910:         ViennaCLWaitForGPU();
911:       } catch(std::exception const & ex) {
912:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
913:       }
914:       PetscLogFlops(PetscMax(2.0*n-1,0.0));
915:       PetscLogFlops(PetscMax(n-1.0,0.0));
916:     }
917:     VecViennaCLRestoreArrayRead(xin,&xgpu);
918:   } else if (type == NORM_1_AND_2) {
919:     *z      = 0.0;
920:     *(z+1)  = 0.0;
921:   } else *z = 0.0;
922:   return(0);
923: }


926: PetscErrorCode VecSetRandom_SeqViennaCL(Vec xin,PetscRandom r)
927: {

931:   VecSetRandom_SeqViennaCL_Private(xin,r);
932:   xin->valid_GPU_array = PETSC_OFFLOAD_CPU;
933:   return(0);
934: }

936: PetscErrorCode VecResetArray_SeqViennaCL(Vec vin)
937: {

942:   VecViennaCLCopyFromGPU(vin);
943:   VecResetArray_SeqViennaCL_Private(vin);
944:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
945:   return(0);
946: }

948: PetscErrorCode VecPlaceArray_SeqViennaCL(Vec vin,const PetscScalar *a)
949: {

954:   VecViennaCLCopyFromGPU(vin);
955:   VecPlaceArray_Seq(vin,a);
956:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
957:   return(0);
958: }

960: PetscErrorCode VecReplaceArray_SeqViennaCL(Vec vin,const PetscScalar *a)
961: {

966:   VecViennaCLCopyFromGPU(vin);
967:   VecReplaceArray_Seq(vin,a);
968:   vin->valid_GPU_array = PETSC_OFFLOAD_CPU;
969:   return(0);
970: }


973: /*@
974:    VecCreateSeqViennaCL - Creates a standard, sequential array-style vector.

976:    Collective

978:    Input Parameter:
979: +  comm - the communicator, should be PETSC_COMM_SELF
980: -  n - the vector length

982:    Output Parameter:
983: .  V - the vector

985:    Notes:
986:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
987:    same type as an existing vector.

989:    Level: intermediate

991: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
992: @*/
993: PetscErrorCode  VecCreateSeqViennaCL(MPI_Comm comm,PetscInt n,Vec *v)
994: {

998:   VecCreate(comm,v);
999:   VecSetSizes(*v,n,n);
1000:   VecSetType(*v,VECSEQVIENNACL);
1001:   return(0);
1002: }


1005: /*  VecDotNorm2 - computes the inner product of two vectors and the 2-norm squared of the second vector
1006:  *
1007:  *  Simply reuses VecDot() and VecNorm(). Performance improvement through custom kernel (kernel generator) possible.
1008:  */
1009: PetscErrorCode VecDotNorm2_SeqViennaCL(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1010: {
1011:   PetscErrorCode                         ierr;

1014:   VecDot_SeqViennaCL(s,t,dp);
1015:   VecNorm_SeqViennaCL(t,NORM_2,nm);
1016:   *nm *= *nm; //squared norm required
1017:   return(0);
1018: }

1020: PetscErrorCode VecDuplicate_SeqViennaCL(Vec win,Vec *V)
1021: {

1025:   VecCreateSeqViennaCL(PetscObjectComm((PetscObject)win),win->map->n,V);
1026:   PetscLayoutReference(win->map,&(*V)->map);
1027:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1028:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1029:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1030:   return(0);
1031: }

1033: PetscErrorCode VecDestroy_SeqViennaCL(Vec v)
1034: {

1038:   try {
1039:     if (v->spptr) {
1040:       delete ((Vec_ViennaCL*)v->spptr)->GPUarray;
1041:       delete (Vec_ViennaCL*) v->spptr;
1042:     }
1043:   } catch(char *ex) {
1044:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex);
1045:   }
1046:   VecDestroy_SeqViennaCL_Private(v);
1047:   return(0);
1048: }

1050: static PetscErrorCode VecPinToCPU_SeqAIJViennaCL(Vec V,PetscBool flg)
1051: {

1055:   V->pinnedtocpu = flg;
1056:   if (flg) {
1057:     VecViennaCLCopyFromGPU(V);
1058:     V->valid_GPU_array = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
1059:     V->ops->dot             = VecDot_Seq;
1060:     V->ops->norm            = VecNorm_Seq;
1061:     V->ops->tdot            = VecTDot_Seq;
1062:     V->ops->scale           = VecScale_Seq;
1063:     V->ops->copy            = VecCopy_Seq;
1064:     V->ops->set             = VecSet_Seq;
1065:     V->ops->swap            = VecSwap_Seq;
1066:     V->ops->axpy            = VecAXPY_Seq;
1067:     V->ops->axpby           = VecAXPBY_Seq;
1068:     V->ops->axpbypcz        = VecAXPBYPCZ_Seq;
1069:     V->ops->pointwisemult   = VecPointwiseMult_Seq;
1070:     V->ops->pointwisedivide = VecPointwiseDivide_Seq;
1071:     V->ops->setrandom       = VecSetRandom_Seq;
1072:     V->ops->dot_local       = VecDot_Seq;
1073:     V->ops->tdot_local      = VecTDot_Seq;
1074:     V->ops->norm_local      = VecNorm_Seq;
1075:     V->ops->mdot_local      = VecMDot_Seq;
1076:     V->ops->mtdot_local     = VecMTDot_Seq;
1077:     V->ops->maxpy           = VecMAXPY_Seq;
1078:     V->ops->mdot            = VecMDot_Seq;
1079:     V->ops->mtdot           = VecMTDot_Seq;
1080:     V->ops->aypx            = VecAYPX_Seq;
1081:     V->ops->waxpy           = VecWAXPY_Seq;
1082:     V->ops->dotnorm2        = NULL;
1083:     V->ops->placearray      = VecPlaceArray_Seq;
1084:     V->ops->replacearray    = VecReplaceArray_Seq;
1085:     V->ops->resetarray      = VecResetArray_Seq;
1086:     V->ops->destroy         = VecDestroy_Seq;
1087:     V->ops->duplicate       = VecDuplicate_Seq;
1088:   } else {
1089:     V->ops->dot             = VecDot_SeqViennaCL;
1090:     V->ops->norm            = VecNorm_SeqViennaCL;
1091:     V->ops->tdot            = VecTDot_SeqViennaCL;
1092:     V->ops->scale           = VecScale_SeqViennaCL;
1093:     V->ops->copy            = VecCopy_SeqViennaCL;
1094:     V->ops->set             = VecSet_SeqViennaCL;
1095:     V->ops->swap            = VecSwap_SeqViennaCL;
1096:     V->ops->axpy            = VecAXPY_SeqViennaCL;
1097:     V->ops->axpby           = VecAXPBY_SeqViennaCL;
1098:     V->ops->axpbypcz        = VecAXPBYPCZ_SeqViennaCL;
1099:     V->ops->pointwisemult   = VecPointwiseMult_SeqViennaCL;
1100:     V->ops->pointwisedivide = VecPointwiseDivide_SeqViennaCL;
1101:     V->ops->setrandom       = VecSetRandom_SeqViennaCL;
1102:     V->ops->dot_local       = VecDot_SeqViennaCL;
1103:     V->ops->tdot_local      = VecTDot_SeqViennaCL;
1104:     V->ops->norm_local      = VecNorm_SeqViennaCL;
1105:     V->ops->mdot_local      = VecMDot_SeqViennaCL;
1106:     V->ops->mtdot_local     = VecMTDot_SeqViennaCL;
1107:     V->ops->maxpy           = VecMAXPY_SeqViennaCL;
1108:     V->ops->mdot            = VecMDot_SeqViennaCL;
1109:     V->ops->mtdot           = VecMTDot_SeqViennaCL;
1110:     V->ops->aypx            = VecAYPX_SeqViennaCL;
1111:     V->ops->waxpy           = VecWAXPY_SeqViennaCL;
1112:     V->ops->dotnorm2        = VecDotNorm2_SeqViennaCL;
1113:     V->ops->placearray      = VecPlaceArray_SeqViennaCL;
1114:     V->ops->replacearray    = VecReplaceArray_SeqViennaCL;
1115:     V->ops->resetarray      = VecResetArray_SeqViennaCL;
1116:     V->ops->destroy         = VecDestroy_SeqViennaCL;
1117:     V->ops->duplicate       = VecDuplicate_SeqViennaCL;
1118:   }
1119:   return(0);
1120: }

1122: PETSC_EXTERN PetscErrorCode VecCreate_SeqViennaCL(Vec V)
1123: {
1125:   PetscMPIInt    size;

1128:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1129:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQVIENNACL on more than one process");
1130:   VecCreate_Seq_Private(V,0);
1131:   PetscObjectChangeTypeName((PetscObject)V,VECSEQVIENNACL);

1133:   VecPinToCPU_SeqAIJViennaCL(V,PETSC_FALSE);
1134:   V->ops->pintocpu = VecPinToCPU_SeqAIJViennaCL;

1136:   VecViennaCLAllocateCheck(V);
1137:   V->valid_GPU_array      = PETSC_OFFLOAD_GPU;
1138:   VecSet(V,0.0);
1139:   return(0);
1140: }