Actual source code: mpiaijcusparse.cu

petsc-master 2020-09-19
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices: b->A holds the diagonal block, b->B the off-diagonal block */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatBindToCPU(b->A,B->boundtocpu);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatBindToCPU(b->B,B->boundtocpu);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSPARSE);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  /* both blocks share the cusparse handle and stream stored on the MPI matrix */
  MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
  MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
  MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
  MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
  MatCUSPARSESetStream(b->A,cusparseStruct->stream);
  MatCUSPARSESetStream(b->B,cusparseStruct->stream);

  B->preallocated = PETSC_TRUE;
  return(0);
}
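
/*
   A minimal usage sketch of the preallocation path above (illustrative; the
   helper name and the tridiagonal-style counts of 3 diagonal-block and 1
   off-diagonal-block nonzeros per row are assumptions for the example only):
*/
static PetscErrorCode ExamplePreallocateMPIAIJCUSPARSE(MPI_Comm comm,PetscInt nlocal,Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);
  MatSetType(*A,MATMPIAIJCUSPARSE);
  /* dispatches to MatMPIAIJSetPreallocation_MPIAIJCUSPARSE() above */
  MatMPIAIJSetPreallocation(*A,3,NULL,1,NULL);
  return(0);
}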

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  /* overlap the off-process scatter with the diagonal-block product */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->A->ops->multadd)(a->A,xx,yy,zz);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,a->lvec);
  /* the reverse scatter accumulates the off-diagonal-block contribution into yy */
  (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
  (*a->A->ops->multtranspose)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
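
/*
   Sketch of applying the matrix from user code: MatMult() dispatches to
   MatMult_MPIAIJCUSPARSE() above, and vectors obtained from MatCreateVecs()
   pick up the matrix's VECCUDA default vector type, so the product stays on
   the GPU. The helper name is an assumption for illustration.
*/
static PetscErrorCode ExampleApplyMPIAIJCUSPARSE(Mat A)
{
  Vec x,y;

  MatCreateVecs(A,&x,&y);
  VecSet(x,1.0);
  MatMult(A,x,y);
  VecDestroy(&x);
  VecDestroy(&y);
  return(0);
}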

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
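
/*
   Sketch of selecting a storage format programmatically; MAT_CUSPARSE_ELL is
   one of the MatCUSPARSEStorageFormat values (csr, ell, hyb) handled by the
   routine above, and the helper name is an assumption for illustration.
*/
static PetscErrorCode ExampleSetELLFormat(Mat A)
{
  MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);
  return(0);
}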

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
    }
  }
  PetscOptionsTail();
  return(0);
}
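
/*
   Sketch of driving the same options from code rather than the command line
   (equivalent to passing -mat_cusparse_mult_diag_storage_format hyb); the
   helper name is an assumption for illustration.
*/
static PetscErrorCode ExampleSetFormatViaOptions(Mat A)
{
  PetscOptionsSetValue(NULL,"-mat_cusparse_mult_diag_storage_format","hyb");
  MatSetFromOptions(A);
  return(0);
}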

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij;

  mpiaij = (Mat_MPIAIJ*)A->data;
  MatAssemblyEnd_MPIAIJ(A,mode);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* on first final assembly, move the local scatter work vector to the GPU */
    VecSetType(mpiaij->lvec,VECSEQCUDA);
  }
  return(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  try {
    if (a->A) { MatCUSPARSEClearHandle(a->A); }
    if (a->B) { MatCUSPARSEClearHandle(a->B); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
  MatDestroy_MPIAIJ(A);
  return(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(B,MAT_COPY_VALUES,newmat);
  } else if (reuse == MAT_REUSE_MATRIX) {
    MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
  }
  A = *newmat;

  PetscFree(A->defaultvectype);
  PetscStrallocpy(VECCUDA,&A->defaultvectype);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
  return(0);
}
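
/*
   Sketch of reaching the conversion above from user code: MatConvert() with
   MAT_INITIAL_MATRIX duplicates B and retypes the copy. The helper name is
   an assumption for illustration.
*/
static PetscErrorCode ExampleConvertToCUSPARSE(Mat B,Mat *A)
{
  MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,A);
  return(0);
}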

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscCUDAInitializeCheck();
  MatCreate_MPIAIJ(A);
  MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
  return(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the scalar counts or the nnz
   arrays (not both). Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL,
   and o_nnz=NULL for PETSc to control dynamic memory allocation. For large
   problems you MUST preallocate memory or you will get TERRIBLE performance,
   see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSPARSE);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSPARSE);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
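
/*
   Sketch of the one-call convenience path documented above; the helper name,
   the local size 10, and the nonzero counts are assumptions for illustration.
*/
static PetscErrorCode ExampleCreateAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  MatCreateAIJCUSPARSE(comm,10,10,PETSC_DETERMINE,PETSC_DETERMINE,3,NULL,1,NULL,A);
  return(0);
}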

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
#else

/* The following stub is only provided to satisfy the linker */

PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
#endif