Actual source code: aijviennacl.cxx

petsc-master 2019-08-18


/*
    Defines the basic matrix operations for the AIJ (compressed row)
    matrix storage format.
*/
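/*
    This file implements the MATSEQAIJVIENNACL type: MatMult() and
    MatMultAdd() run on the GPU through ViennaCL, and a device-side copy of
    the CSR data is kept in sync with the host by MatViennaCLCopyToGPU()
    and MatViennaCLCopyFromGPU() below.
*/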

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <petscbt.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>

#include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>


#include <algorithm>
#include <vector>
#include <string>

#include "viennacl/linalg/prod.hpp"

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat);
PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);


PetscErrorCode MatViennaCLCopyToGPU(Mat A)
{

  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;

  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { // some OpenCL SDKs have issues with buffers of size 0
    if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
      PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);

      try {
        if (a->compressedrow.use) {
          if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();

          // Since PetscInt is different from cl_uint, we have to convert:
          viennacl::backend::mem_handle dummy;

          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
          for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
            row_buffer.set(i, (a->compressedrow.i)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
          for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
            row_indices.set(i, (a->compressedrow.rindex)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
          for (PetscInt i=0; i<a->nz; ++i)
            col_buffer.set(i, (a->j)[i]);

          viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
          PetscLogCpuToGpu(((2*a->compressedrow.nrows)+1+a->nz)*sizeof(PetscInt) + (a->nz)*sizeof(PetscScalar));
        } else {
          if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();

          // Since PetscInt is in general different from cl_uint, we have to convert:
          viennacl::backend::mem_handle dummy;

          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
          for (PetscInt i=0; i<=A->rmap->n; ++i)
            row_buffer.set(i, (a->i)[i]);

          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
          for (PetscInt i=0; i<a->nz; ++i)
            col_buffer.set(i, (a->j)[i]);

          viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
          PetscLogCpuToGpu(((A->rmap->n+1)+a->nz)*sizeof(PetscInt)+(a->nz)*sizeof(PetscScalar));
        }
        ViennaCLWaitForGPU();
      } catch(std::exception const & ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
      }

      // Create temporary vector for v += A*x:
      if (viennaclstruct->tempvec) {
        if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) {
          delete (ViennaCLVector*)viennaclstruct->tempvec;
          viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
        } else {
          viennaclstruct->tempvec->clear();
        }
      } else {
        viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n);
      }

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

      PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);
    }
  }
  return(0);
}
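/*
   Typical usage (sketch): MatViennaCLCopyToGPU() is not called by user code;
   it is triggered from MatAssemblyEnd_SeqAIJViennaCL() below, so assembling
   the matrix is sufficient (n is a placeholder for the problem size):

     Mat A;
     MatCreateSeqAIJViennaCL(PETSC_COMM_SELF,n,n,3,NULL,&A);
     // ... MatSetValues(A,...) ...
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);   // copies the CSR arrays to the GPU
*/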

PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
{
  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
  PetscInt           m               = A->rmap->n;
  PetscErrorCode     ierr;

  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED) {
    try {
      if (a->compressedrow.use) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
      else {

        if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
        a->nz           = Agpu->nnz();
        a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
        A->preallocated = PETSC_TRUE;
        if (a->singlemalloc) {
          if (a->a) {PetscFree3(a->a,a->j,a->i);}
        } else {
          if (a->i) {PetscFree(a->i);}
          if (a->j) {PetscFree(a->j);}
          if (a->a) {PetscFree(a->a);}
        }
        PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);
        PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));

        a->singlemalloc = PETSC_TRUE;

        /* Setup row lengths */
        PetscFree(a->imax);
        PetscFree(a->ilen);
        PetscMalloc1(m,&a->imax);
        PetscMalloc1(m,&a->ilen);
        PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));

        /* Copy data back from GPU */
        viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);

        // copy row array
        viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
        (a->i)[0] = row_buffer[0];
        for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
          (a->i)[i+1] = row_buffer[i+1];
          a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  // set imax[] and ilen[] at the same time as i[] for better cache reuse
        }

        // copy column indices
        viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
        viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
        for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
          (a->j)[i] = col_buffer[i];

        // copy nonzero entries directly to destination (no conversion required)
        viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);

        PetscLogGpuToCpu(row_buffer.raw_size()+col_buffer.raw_size()+(Agpu->nnz()*sizeof(PetscScalar)));
        ViennaCLWaitForGPU();
        /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
      }
    } catch(std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
    }

    /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);

    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
  return(0);
}
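/*
   Note: MatViennaCLCopyFromGPU() rebuilds the host CSR arrays from the device
   matrix, so it is only valid while the host side is still
   PETSC_OFFLOAD_UNALLOCATED, i.e. for matrices assembled entirely on the GPU.
*/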

PetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode       ierr;
  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  const ViennaCLVector *xgpu=NULL;
  ViennaCLVector       *ygpu=NULL;

  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
    VecViennaCLGetArrayRead(xx,&xgpu);
    VecViennaCLGetArrayWrite(yy,&ygpu);
    PetscLogGpuTimeBegin();
    try {
      if (a->compressedrow.use) {
        *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
      } else {
        *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
      }
      ViennaCLWaitForGPU();
    } catch (std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
    }
    PetscLogGpuTimeEnd();
    VecViennaCLRestoreArrayRead(xx,&xgpu);
    VecViennaCLRestoreArrayWrite(yy,&ygpu);
    PetscLogGpuFlops(2.0*a->nz - a->nonzerorowcnt); // one multiply per stored entry plus one add per entry beyond the first in each nonempty row
  } else {
    VecSet(yy,0);
  }
  return(0);
}

PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode       ierr;
  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
  const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
  ViennaCLVector       *zgpu=NULL;

  if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) {
    try {
      VecViennaCLGetArrayRead(xx,&xgpu);
      VecViennaCLGetArrayRead(yy,&ygpu);
      VecViennaCLGetArrayWrite(zz,&zgpu);
      PetscLogGpuTimeBegin();
      if (a->compressedrow.use) {
        ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
        *zgpu = *ygpu + temp;
        ViennaCLWaitForGPU();
      } else {
        if (zz == xx || zz == yy) { // temporary required
          ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
          *zgpu = *ygpu;
          *zgpu += temp;
          ViennaCLWaitForGPU();
        } else {
          *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
          *zgpu = *ygpu + *viennaclstruct->tempvec;
          ViennaCLWaitForGPU();
        }
      }
      PetscLogGpuTimeEnd();
      VecViennaCLRestoreArrayRead(xx,&xgpu);
      VecViennaCLRestoreArrayRead(yy,&ygpu);
      VecViennaCLRestoreArrayWrite(zz,&zgpu);

    } catch(std::exception const & ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
    }
    PetscLogGpuFlops(2.0*a->nz);
  } else {
    VecCopy(yy,zz);
  }
  return(0);
}
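/*
   Note on the aliasing checks above: assigning a viennacl::linalg::prod()
   expression directly to a vector that is also an input could let the kernel
   read entries it has already overwritten, hence the explicit temporary when
   zz aliases xx or yy.
*/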

PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
{

  MatAssemblyEnd_SeqAIJ(A,mode);
  if (mode == MAT_FLUSH_ASSEMBLY) return(0);
  if (!A->pinnedtocpu) {
    MatViennaCLCopyToGPU(A);
  }
  return(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
   (the default sequential PETSc format). This matrix will ultimately be pushed down
   to GPUs and use the ViennaCL library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased substantially.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If nnz is given then nz is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSPARSE(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

@*/
PetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQAIJVIENNACL);
  MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
  return(0);
}
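/*
   Example (sketch): create a 100x100 matrix of this type with three
   nonzeros preallocated per row, e.g. for a tridiagonal operator:

     Mat A;
     MatCreateSeqAIJViennaCL(PETSC_COMM_SELF,100,100,3,NULL,&A);
*/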


PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
{
  Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;

  try {
    if (viennaclcontainer) {
      delete viennaclcontainer->tempvec;
      delete viennaclcontainer->mat;
      delete viennaclcontainer->compressed_mat;
      delete viennaclcontainer;
    }
    A->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
  } catch(std::exception const & ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
  }

  PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaij_seqaijviennacl_C",NULL);

  /* Zero spptr before calling MatDestroy_SeqAIJ(): MatDestroy() calls PetscFree() on a nonzero spptr, but PetscFree() is only valid for memory obtained with PetscNew()/PetscMalloc(), which do not run the C++ constructor */
  A->spptr = 0;
  MatDestroy_SeqAIJ(A);
  return(0);
}


PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
{

  MatCreate_SeqAIJ(B);
  MatConvert_SeqAIJ_SeqAIJViennaCL(B,MATSEQAIJVIENNACL,MAT_INPLACE_MATRIX,&B);
  return(0);
}

static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  Mat            C;

  MatDuplicate_SeqAIJ(A,cpvalues,B);
  C = *B;

  MatPinToCPU_SeqAIJViennaCL(C,PETSC_FALSE);  /* install the GPU ops on the duplicate, not on the source matrix */
  C->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;

  C->spptr        = new Mat_SeqAIJViennaCL();
  ((Mat_SeqAIJViennaCL*)C->spptr)->tempvec        = NULL;
  ((Mat_SeqAIJViennaCL*)C->spptr)->mat            = NULL;
  ((Mat_SeqAIJViennaCL*)C->spptr)->compressed_mat = NULL;

  PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJVIENNACL);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  /* If the source matrix is already assembled, copy the destination matrix to the GPU */
  if (C->assembled) {
    MatViennaCLCopyToGPU(C);
  }

  return(0);
}

static PetscErrorCode MatPinToCPU_SeqAIJViennaCL(Mat A,PetscBool flg)
{
  A->pinnedtocpu = flg;
  if (flg) {
    A->ops->mult        = MatMult_SeqAIJ;
    A->ops->multadd     = MatMultAdd_SeqAIJ;
    A->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate   = MatDuplicate_SeqAIJ;
  } else {
    A->ops->mult        = MatMult_SeqAIJViennaCL;
    A->ops->multadd     = MatMultAdd_SeqAIJViennaCL;
    A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL;
    A->ops->destroy     = MatDestroy_SeqAIJViennaCL;
    A->ops->duplicate   = MatDuplicate_SeqAIJViennaCL;
  }
  return(0);
}
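/*
   Note: pinning only swaps the function pointers in A->ops between the
   SeqAIJ (CPU) and ViennaCL (GPU) implementations; it does not move or free
   any matrix data.
*/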

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat            B;
  Mat_SeqAIJ     *aij;


  if (reuse == MAT_REUSE_MATRIX) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead");

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(A,MAT_COPY_VALUES,newmat);
  }

  B = *newmat;

  aij             = (Mat_SeqAIJ*)B->data;
  aij->inode.use  = PETSC_FALSE;

  B->spptr        = new Mat_SeqAIJViennaCL();

  ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
  ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
  ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;

  MatPinToCPU_SeqAIJViennaCL(B,PETSC_FALSE);  /* install the GPU ops on the converted matrix, not on the source matrix */
  B->ops->pintocpu = MatPinToCPU_SeqAIJViennaCL;

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECVIENNACL,&B->defaultvectype);

  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaij_seqaijviennacl_C",MatConvert_SeqAIJ_SeqAIJViennaCL);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  /* If the source matrix is already assembled, copy the destination matrix to the GPU */
  if (B->assembled) {
    MatViennaCLCopyToGPU(B);
  }

  return(0);
}
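/*
   Note: the conversion also switches B->defaultvectype to VECVIENNACL, so
   vectors created from this matrix (for example with MatCreateVecs()) are
   ViennaCL vectors by default.
*/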


/*MC
   MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on GPUs. These matrices are in CSR format by
   default. All matrix calculations are performed using the ViennaCL library.

   Options Database Keys:
+  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
.  -mat_viennacl_storage_format csr - sets the storage format of matrices during a call to MatSetFromOptions()
-  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices used in MatMult() during a call to MatSetFromOptions()

  Level: beginner

.seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
M*/
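/*
   Example (sketch): select this type at runtime from the options database,

     ./app -mat_type aijviennacl

   or programmatically before MatSetFromOptions():

     MatSetType(A,MATSEQAIJVIENNACL);
*/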


PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void)
{

  MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU,      MatGetFactor_seqaij_petsc);
  MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY,MatGetFactor_seqaij_petsc);
  MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU,     MatGetFactor_seqaij_petsc);
  MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC,     MatGetFactor_seqaij_petsc);
  return(0);
}
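/*
   Note: no GPU factorizations are provided here; LU, Cholesky, ILU, and ICC
   for MATSEQAIJVIENNACL fall back to the standard PETSc CPU factorization
   routines via MatGetFactor_seqaij_petsc().
*/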