Actual source code: aijcusparse.cu

petsc-main 2021-04-20
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 16: #include <thrust/async/for_each.h>

 18: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 19: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 20:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
 21:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

 23:   typedef enum {
 24:       CUSPARSE_MV_ALG_DEFAULT = 0,
 25:       CUSPARSE_COOMV_ALG      = 1,
 26:       CUSPARSE_CSRMV_ALG1     = 2,
 27:       CUSPARSE_CSRMV_ALG2     = 3
 28:   } cusparseSpMVAlg_t;

 30:   typedef enum {
 31:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 32:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 33:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 34:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 35:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 36:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 37:       CUSPARSE_SPMM_COO_ALG1    = 1,
 38:       CUSPARSE_SPMM_COO_ALG2    = 2,
 39:       CUSPARSE_SPMM_COO_ALG3    = 3,
 40:       CUSPARSE_SPMM_COO_ALG4    = 5,
 41:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 42:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 43:   } cusparseSpMMAlg_t;

 45:   typedef enum {
 46:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 47:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 48:   } cusparseCsr2CscAlg_t;
 49:   */
 50:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 51:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 52:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
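/* A sketch of how these arrays are used: PetscOptionsEnum() matches the user's string by its
   position in the array, so e.g. "-mat_cusparse_spmv_alg csrmv_alg1" yields the integer 2,
   which must equal CUSPARSE_CSRMV_ALG1 above. The consistency checks in
   MatSetFromOptions_SeqAIJCUSPARSE() below guard against the cuSPARSE enum values drifting. */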
 53: #endif

 55: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 57: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 59: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 61: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 66: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 67: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 68: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
 69: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
 70: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 71: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 72: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 73: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 74: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 75: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 76: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 78: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 79: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 80: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 81: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 82: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 84: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
 85: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 86: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

 88: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 89: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 91: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

 93: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 94: {
 95:   cusparseStatus_t   stat;
 96:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 99:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
100:   cusparsestruct->stream = stream;
101:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
102:   return(0);
103: }

105: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
106: {
107:   cusparseStatus_t   stat;
108:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

111:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
112:   if (cusparsestruct->handle != handle) {
113:     if (cusparsestruct->handle) {
114:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
115:     }
116:     cusparsestruct->handle = handle;
117:   }
118:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
119:   return(0);
120: }

122: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
123: {
124:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125:   PetscBool          flg;
126:   PetscErrorCode     ierr;

129:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
130:   if (!flg || !cusparsestruct) return(0);
131:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
132:   return(0);
133: }

135: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
136: {
138:   *type = MATSOLVERCUSPARSE;
139:   return(0);
140: }

142: /*MC
143:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
144:   on a single GPU, of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. The currently supported
145:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
146:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
147:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
148:   algorithms are not recommended. This class does NOT support direct solver operations.
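
  Sample usage (a sketch; assumes a KSP/PC linear solve has already been set up):
.vb
  -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
.ve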

150:   Level: beginner

152: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153: M*/

155: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
156: {
158:   PetscInt       n = A->rmap->n;

161:   MatCreate(PetscObjectComm((PetscObject)A),B);
162:   MatSetSizes(*B,n,n,n,n);
163:   (*B)->factortype = ftype;
164:   MatSetType(*B,MATSEQAIJCUSPARSE);

166:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
167:     MatSetBlockSizesFromMats(*B,A,A);
168:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
169:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
170:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
171:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
172:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
173:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
174:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
175:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
176:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
177:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
178:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

180:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
181:   (*B)->canuseordering = PETSC_TRUE;
182:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
183:   return(0);
184: }

186: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187: {
188:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

191:   switch (op) {
192:   case MAT_CUSPARSE_MULT:
193:     cusparsestruct->format = format;
194:     break;
195:   case MAT_CUSPARSE_ALL:
196:     cusparsestruct->format = format;
197:     break;
198:   default:
199:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200:   }
201:   return(0);
202: }

204: /*@
205:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206:    operation. Only the MatMult operation can use different GPU storage formats
207:    for MPIAIJCUSPARSE matrices.

208:    Not Collective

210:    Input Parameters:
211: +  A - Matrix of type SEQAIJCUSPARSE
212: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
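
   Sample usage (a sketch, assuming A is a SEQAIJCUSPARSE matrix):
.vb
   MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);
.ve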

217:    Level: intermediate

219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222: {

227:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
228:   return(0);
229: }

231: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232: {

236:   switch (op) {
237:     case MAT_FORM_EXPLICIT_TRANSPOSE:
238:       /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
239:       if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
240:       A->form_explicit_transpose = flg;
241:       break;
242:     default:
243:       MatSetOption_SeqAIJ(A,op,flg);
244:       break;
245:   }
246:   return(0);
247: }

249: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

251: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252: {
253:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
254:   IS             isrow = b->row,iscol = b->col;
255:   PetscBool      row_identity,col_identity;

259:   MatSeqAIJCUSPARSECopyFromGPU(A);
260:   MatLUFactorNumeric_SeqAIJ(B,A,info);
261:   B->offloadmask = PETSC_OFFLOAD_CPU;
262:   /* determine which version of MatSolve needs to be used. */
263:   ISIdentity(isrow,&row_identity);
264:   ISIdentity(iscol,&col_identity);
265:   if (row_identity && col_identity) {
266:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268:     B->ops->matsolve = NULL;
269:     B->ops->matsolvetranspose = NULL;
270:   } else {
271:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273:     B->ops->matsolve = NULL;
274:     B->ops->matsolvetranspose = NULL;
275:   }

277:   /* get the triangular factors */
278:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
279:   return(0);
280: }

282: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283: {
284:   PetscErrorCode           ierr;
285:   MatCUSPARSEStorageFormat format;
286:   PetscBool                flg;
287:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

290:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
291:   if (A->factortype == MAT_FACTOR_NONE) {
292:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
294:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

296:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
298:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
299:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302:     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

305:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
307:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

309:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
311:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312:    #endif
313:   }
314:   PetscOptionsTail();
315:   return(0);
316: }
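/* Example (a sketch): the options above are consumed when MatSetFromOptions() is called, e.g.
     -mat_cusparse_storage_format ell -mat_cusparse_spmv_alg csrmv_alg2
   selects ELL storage for both SpMV and TriSolve and the CSRMV_ALG2 SpMV algorithm. */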

318: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319: {
320:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321:   PetscErrorCode               ierr;

324:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
325:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
326:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327:   return(0);
328: }

330: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331: {
332:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333:   PetscErrorCode               ierr;

336:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
337:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
338:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339:   return(0);
340: }

342: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343: {
344:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345:   PetscErrorCode               ierr;

348:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
349:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
350:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351:   return(0);
352: }

354: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355: {
356:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357:   PetscErrorCode               ierr;

360:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
361:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
362:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363:   return(0);
364: }

366: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367: {
368:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
369:   PetscInt                          n = A->rmap->n;
370:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372:   cusparseStatus_t                  stat;
373:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
374:   const MatScalar                   *aa = a->a,*v;
375:   PetscInt                          *AiLo, *AjLo;
376:   PetscInt                          i,nz, nzLower, offset, rowOffset;
377:   PetscErrorCode                    ierr;
378:   cudaError_t                       cerr;

381:   if (!n) return(0);
382:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383:     try {
384:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385:       nzLower=n+ai[n]-ai[1];
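      /* rows 1..n-1 contribute ai[n]-ai[1] strictly lower entries (row 0 of L has none);
         the additional n counts the unit diagonal entries inserted below */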
386:       if (!loTriFactor) {
387:         PetscScalar                       *AALo;

389:         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

391:         /* Allocate Space for the lower triangular matrix */
392:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

395:         /* Fill the lower triangular matrix */
396:         AiLo[0]  = (PetscInt) 0;
397:         AiLo[n]  = nzLower;
398:         AjLo[0]  = (PetscInt) 0;
399:         AALo[0]  = (MatScalar) 1.0;
400:         v        = aa;
401:         vi       = aj;
402:         offset   = 1;
403:         rowOffset= 1;
404:         for (i=1; i<n; i++) {
405:           nz = ai[i+1] - ai[i];
406:           /* additional 1 for the term on the diagonal */
407:           AiLo[i]    = rowOffset;
408:           rowOffset += nz+1;

410:           PetscArraycpy(&(AjLo[offset]), vi, nz);
411:           PetscArraycpy(&(AALo[offset]), v, nz);

413:           offset      += nz;
414:           AjLo[offset] = (PetscInt) i;
415:           AALo[offset] = (MatScalar) 1.0;
416:           offset      += 1;

418:           v  += nz;
419:           vi += nz;
420:         }

422:         /* allocate space for the triangular factor information */
423:         PetscNew(&loTriFactor);
424:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425:         /* Create the matrix description */
426:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430:        #else
431:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432:        #endif
433:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

436:         /* set the operation */
437:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

439:         /* set the matrix */
440:         loTriFactor->csrMat = new CsrMatrix;
441:         loTriFactor->csrMat->num_rows = n;
442:         loTriFactor->csrMat->num_cols = n;
443:         loTriFactor->csrMat->num_entries = nzLower;

445:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

448:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

451:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

454:         /* Create the solve analysis information */
455:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
456:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464:       #endif

466:         /* perform the solve analysis */
467:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473:                                #endif
474: );CHKERRCUSPARSE(stat);
475:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

478:         /* assign the pointer */
479:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480:         loTriFactor->AA_h = AALo;
481:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
484:       } else { /* update values only */
485:         if (!loTriFactor->AA_h) {
486:           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487:         }
488:         /* Fill the lower triangular matrix */
489:         loTriFactor->AA_h[0]  = 1.0;
490:         v        = aa;
491:         vi       = aj;
492:         offset   = 1;
493:         for (i=1; i<n; i++) {
494:           nz = ai[i+1] - ai[i];
495:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
496:           offset      += nz;
497:           loTriFactor->AA_h[offset] = 1.0;
498:           offset      += 1;
499:           v  += nz;
500:         }
501:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
503:       }
504:     } catch(char *ex) {
505:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506:     }
507:   }
508:   return(0);
509: }

511: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512: {
513:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514:   PetscInt                          n = A->rmap->n;
515:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517:   cusparseStatus_t                  stat;
518:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519:   const MatScalar                   *aa = a->a,*v;
520:   PetscInt                          *AiUp, *AjUp;
521:   PetscInt                          i,nz, nzUpper, offset;
522:   PetscErrorCode                    ierr;
523:   cudaError_t                       cerr;

526:   if (!n) return(0);
527:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528:     try {
529:       /* first, figure out the number of nonzeros in the upper triangular matrix. */
530:       nzUpper = adiag[0]-adiag[n];
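      /* the factored AIJ format stores U (diagonal included) between adiag[n] and adiag[0]
         in reverse row order, hence this difference; see the backward loop below */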
531:       if (!upTriFactor) {
532:         PetscScalar *AAUp;

534:         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

536:         /* Allocate Space for the upper triangular matrix */
537:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

540:         /* Fill the upper triangular matrix */
541:         AiUp[0]=(PetscInt) 0;
542:         AiUp[n]=nzUpper;
543:         offset = nzUpper;
544:         for (i=n-1; i>=0; i--) {
545:           v  = aa + adiag[i+1] + 1;
546:           vi = aj + adiag[i+1] + 1;

548:           /* number of elements NOT on the diagonal */
549:           nz = adiag[i] - adiag[i+1]-1;

551:           /* decrement the offset */
552:           offset -= (nz+1);

554:           /* first, set the diagonal elements */
555:           AjUp[offset] = (PetscInt) i;
556:           AAUp[offset] = (MatScalar)1./v[nz];
557:           AiUp[i]      = AiUp[i+1] - (nz+1);

559:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
560:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
561:         }

563:         /* allocate space for the triangular factor information */
564:         PetscNew(&upTriFactor);
565:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

567:         /* Create the matrix description */
568:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572:        #else
573:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574:        #endif
575:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

578:         /* set the operation */
579:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

581:         /* set the matrix */
582:         upTriFactor->csrMat = new CsrMatrix;
583:         upTriFactor->csrMat->num_rows = n;
584:         upTriFactor->csrMat->num_cols = n;
585:         upTriFactor->csrMat->num_entries = nzUpper;

587:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

590:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

593:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

596:         /* Create the solve analysis information */
597:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
598:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606:       #endif

608:         /* perform the solve analysis */
609:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615:                                #endif
616: );CHKERRCUSPARSE(stat);
617:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

620:         /* assign the pointer */
621:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622:         upTriFactor->AA_h = AAUp;
623:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
626:       } else {
627:         if (!upTriFactor->AA_h) {
628:           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629:         }
630:         /* Fill the upper triangular matrix */
631:         offset = nzUpper;
632:         for (i=n-1; i>=0; i--) {
633:           v  = aa + adiag[i+1] + 1;

635:           /* number of elements NOT on the diagonal */
636:           nz = adiag[i] - adiag[i+1]-1;

638:           /* decrement the offset */
639:           offset -= (nz+1);

641:           /* first, set the diagonal elements */
642:           upTriFactor->AA_h[offset] = 1./v[nz];
643:           PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
644:         }
645:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
647:       }
648:     } catch(char *ex) {
649:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650:     }
651:   }
652:   return(0);
653: }

655: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656: {
657:   PetscErrorCode               ierr;
658:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
659:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660:   IS                           isrow = a->row,iscol = a->icol;
661:   PetscBool                    row_identity,col_identity;
662:   PetscInt                     n = A->rmap->n;

665:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
667:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

669:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670:   cusparseTriFactors->nnz=a->nz;

672:   A->offloadmask = PETSC_OFFLOAD_BOTH;
673:   /* row permutation: applied to the right-hand side before the lower triangular solve */
674:   ISIdentity(isrow,&row_identity);
675:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
676:     const PetscInt *r;

678:     ISGetIndices(isrow,&r);
679:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680:     cusparseTriFactors->rpermIndices->assign(r, r+n);
681:     ISRestoreIndices(isrow,&r);
682:     PetscLogCpuToGpu(n*sizeof(PetscInt));
683:   }

685:   /* column permutation: applied to the solution after the upper triangular solve */
686:   ISIdentity(iscol,&col_identity);
687:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
688:     const PetscInt *c;

690:     ISGetIndices(iscol,&c);
691:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692:     cusparseTriFactors->cpermIndices->assign(c, c+n);
693:     ISRestoreIndices(iscol,&c);
694:     PetscLogCpuToGpu(n*sizeof(PetscInt));
695:   }
696:   return(0);
697: }

699: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700: {
701:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705:   cusparseStatus_t                  stat;
706:   PetscErrorCode                    ierr;
707:   cudaError_t                       cerr;
708:   PetscInt                          *AiUp, *AjUp;
709:   PetscScalar                       *AAUp;
710:   PetscScalar                       *AALo;
711:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714:   const MatScalar                   *aa = b->a,*v;

717:   if (!n) return(0);
718:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719:     try {
720:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722:       if (!upTriFactor && !loTriFactor) {
723:         /* Allocate Space for the upper triangular matrix */
724:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

727:         /* Fill the upper triangular matrix */
728:         AiUp[0]=(PetscInt) 0;
729:         AiUp[n]=nzUpper;
730:         offset = 0;
731:         for (i=0; i<n; i++) {
732:           /* set the pointers */
733:           v  = aa + ai[i];
734:           vj = aj + ai[i];
735:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

737:           /* first, set the diagonal elements */
738:           AjUp[offset] = (PetscInt) i;
739:           AAUp[offset] = (MatScalar)1.0/v[nz];
740:           AiUp[i]      = offset;
741:           AALo[offset] = (MatScalar)1.0/v[nz];

743:           offset+=1;
744:           if (nz>0) {
745:             PetscArraycpy(&(AjUp[offset]), vj, nz);
746:             PetscArraycpy(&(AAUp[offset]), v, nz);
747:             for (j=offset; j<offset+nz; j++) {
748:               AAUp[j] = -AAUp[j];
749:               AALo[j] = AAUp[j]/v[nz];
750:             }
751:             offset+=nz;
752:           }
753:         }

755:         /* allocate space for the triangular factor information */
756:         PetscNew(&upTriFactor);
757:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

759:         /* Create the matrix description */
760:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764:        #else
765:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766:        #endif
767:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

770:         /* set the matrix */
771:         upTriFactor->csrMat = new CsrMatrix;
772:         upTriFactor->csrMat->num_rows = A->rmap->n;
773:         upTriFactor->csrMat->num_cols = A->cmap->n;
774:         upTriFactor->csrMat->num_entries = a->nz;

776:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

779:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

782:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

785:         /* set the operation */
786:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

788:         /* Create the solve analysis information */
789:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
790:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798:       #endif

800:         /* perform the solve analysis */
801:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807:                                 #endif
808: );CHKERRCUSPARSE(stat);
809:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

812:         /* assign the pointer */
813:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

815:         /* allocate space for the triangular factor information */
816:         PetscNew(&loTriFactor);
817:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

819:         /* Create the matrix description */
820:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824:        #else
825:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826:        #endif
827:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

830:         /* set the operation */
831:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
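        /* L is never formed explicitly: the solve reuses U's sparsity pattern (AiUp/AjUp)
           with the scaled values AALo and applies the transpose operation instead */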

833:         /* set the matrix */
834:         loTriFactor->csrMat = new CsrMatrix;
835:         loTriFactor->csrMat->num_rows = A->rmap->n;
836:         loTriFactor->csrMat->num_cols = A->cmap->n;
837:         loTriFactor->csrMat->num_entries = a->nz;

839:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

842:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

845:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

848:         /* Create the solve analysis information */
849:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
850:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858:       #endif

860:         /* perform the solve analysis */
861:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867:                                 #endif
868: );CHKERRCUSPARSE(stat);
869:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

872:         /* assign the pointer */
873:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

875:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
876:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878:       } else {
879:         /* Fill the upper triangular matrix */
880:         offset = 0;
881:         for (i=0; i<n; i++) {
882:           /* set the pointers */
883:           v  = aa + ai[i];
884:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

886:           /* first, set the diagonal elements */
887:           AAUp[offset] = 1.0/v[nz];
888:           AALo[offset] = 1.0/v[nz];

890:           offset+=1;
891:           if (nz>0) {
892:             PetscArraycpy(&(AAUp[offset]), v, nz);
893:             for (j=offset; j<offset+nz; j++) {
894:               AAUp[j] = -AAUp[j];
895:               AALo[j] = AAUp[j]/v[nz];
896:             }
897:             offset+=nz;
898:           }
899:         }
900:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
901:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
902:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
905:       }
906:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908:     } catch(char *ex) {
909:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910:     }
911:   }
912:   return(0);
913: }

915: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916: {
917:   PetscErrorCode               ierr;
918:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
919:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920:   IS                           ip = a->row;
921:   PetscBool                    perm_identity;
922:   PetscInt                     n = A->rmap->n;

925:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
927:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
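  /* each factor holds a->nz entries and the n diagonal entries are shared, giving 2*a->nz - n */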

930:   A->offloadmask = PETSC_OFFLOAD_BOTH;

932:   /* row and column permutations (a permutation and its inverse) used in the solves */
933:   ISIdentity(ip,&perm_identity);
934:   if (!perm_identity) {
935:     IS             iip;
936:     const PetscInt *irip,*rip;

938:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
939:     ISGetIndices(iip,&irip);
940:     ISGetIndices(ip,&rip);
941:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945:     ISRestoreIndices(iip,&irip);
946:     ISDestroy(&iip);
947:     ISRestoreIndices(ip,&rip);
948:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
949:   }
950:   return(0);
951: }

953: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
954: {
955:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
956:   IS             ip = b->row;
957:   PetscBool      perm_identity;

961:   MatSeqAIJCUSPARSECopyFromGPU(A);
962:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
963:   B->offloadmask = PETSC_OFFLOAD_CPU;
964:   /* determine which version of MatSolve needs to be used. */
965:   ISIdentity(ip,&perm_identity);
966:   if (perm_identity) {
967:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
968:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
969:     B->ops->matsolve = NULL;
970:     B->ops->matsolvetranspose = NULL;
971:   } else {
972:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
973:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
974:     B->ops->matsolve = NULL;
975:     B->ops->matsolvetranspose = NULL;
976:   }

978:   /* get the triangular factors */
979:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
980:   return(0);
981: }

983: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
984: {
985:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
986:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
987:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
988:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
989:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
990:   cusparseStatus_t                  stat;
991:   cusparseIndexBase_t               indexBase;
992:   cusparseMatrixType_t              matrixType;
993:   cusparseFillMode_t                fillMode;
994:   cusparseDiagType_t                diagType;
995:   cudaError_t                       cerr;
996:   PetscErrorCode                    ierr;

999:   /* allocate space for the transpose of the lower triangular factor */
1000:   PetscNew(&loTriFactorT);
1001:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1003:   /* set the matrix descriptors of the lower triangular factor */
1004:   matrixType = cusparseGetMatType(loTriFactor->descr);
1005:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1006:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1007:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1008:   diagType = cusparseGetMatDiagType(loTriFactor->descr);
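  /* transposing a triangular factor flips its fill mode; matrix type, index base, and diagonal type carry over */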

1010:   /* Create the matrix description */
1011:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1012:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1013:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1014:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1015:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1017:   /* set the operation */
1018:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1020:   /* allocate GPU space for the CSC of the lower triangular factor*/
1021:   loTriFactorT->csrMat = new CsrMatrix;
1022:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1023:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1024:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1025:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1026:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1027:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1029:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1030: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1031:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1032:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1033:                                        loTriFactor->csrMat->values->data().get(),
1034:                                        loTriFactor->csrMat->row_offsets->data().get(),
1035:                                        loTriFactor->csrMat->column_indices->data().get(),
1036:                                        loTriFactorT->csrMat->values->data().get(),
1037:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1038:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1039:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1040:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1041: #endif

1043:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1044:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1045:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1046:                           loTriFactor->csrMat->values->data().get(),
1047:                           loTriFactor->csrMat->row_offsets->data().get(),
1048:                           loTriFactor->csrMat->column_indices->data().get(),
1049:                           loTriFactorT->csrMat->values->data().get(),
1050:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1051:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1053:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1054:                         #else
1055:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1056:                           CUSPARSE_ACTION_NUMERIC, indexBase
1057:                         #endif
1058: );CHKERRCUSPARSE(stat);
1059:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1060:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1062:   /* Create the solve analysis information */
1063:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1064:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1065: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1066:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1067:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1068:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1069:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1070:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1071:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1072: #endif

1074:   /* perform the solve analysis */
1075:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1079:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1081:                           #endif
1082: );CHKERRCUSPARSE(stat);
1083:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1084:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1086:   /* assign the pointer */
1087:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1089:   /*********************************************/
1090:   /* Now the Transpose of the Upper Tri Factor */
1091:   /*********************************************/

1093:   /* allocate space for the transpose of the upper triangular factor */
1094:   PetscNew(&upTriFactorT);
1095:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1097:   /* set the matrix descriptors of the upper triangular factor */
1098:   matrixType = cusparseGetMatType(upTriFactor->descr);
1099:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1100:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1101:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1102:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1104:   /* Create the matrix description */
1105:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1106:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1107:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1108:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1109:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1111:   /* set the operation */
1112:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1114:   /* allocate GPU space for the CSC of the upper triangular factor */
1115:   upTriFactorT->csrMat = new CsrMatrix;
1116:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1117:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1118:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1119:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1120:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1121:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1123:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1124: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1126:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1127:                                 upTriFactor->csrMat->values->data().get(),
1128:                                 upTriFactor->csrMat->row_offsets->data().get(),
1129:                                 upTriFactor->csrMat->column_indices->data().get(),
1130:                                 upTriFactorT->csrMat->values->data().get(),
1131:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1132:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1133:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1135: #endif

1137:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1138:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1139:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1140:                           upTriFactor->csrMat->values->data().get(),
1141:                           upTriFactor->csrMat->row_offsets->data().get(),
1142:                           upTriFactor->csrMat->column_indices->data().get(),
1143:                           upTriFactorT->csrMat->values->data().get(),
1144:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1145:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1147:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1148:                         #else
1149:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1150:                           CUSPARSE_ACTION_NUMERIC, indexBase
1151:                         #endif
1152: );CHKERRCUSPARSE(stat);
1153:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1154:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1156:   /* Create the solve analysis information */
1157:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1158:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1159:   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1160:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1161:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1162:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1163:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1164:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1165:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1166:   #endif

1168:   /* perform the solve analysis */
1169:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1170:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1171:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1172:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1173:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1175:                           #endif
1176: );CHKERRCUSPARSE(stat);
1177:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1178:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1180:   /* assign the pointer */
1181:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1182:   return(0);
1183: }
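/* Editor's note: a hedged illustration of why csr2csc yields the factors' transposes above.
   The CSC arrays of a matrix are, verbatim, the CSR arrays of its transpose, so converting the
   cached CSR factors with csr2csc produces loTriFactorPtrTranspose/upTriFactorPtrTranspose that
   can then be solved with CUSPARSE_OPERATION_NON_TRANSPOSE. Worked 2x3 example (illustrative only):

     A (CSR):    row_offsets = {0,2,3}, column_indices = {0,2,1}, values = {a,b,c}
                   [a 0 b]
                   [0 c 0]
     csr2csc ->  CSR of A^T (3x2): row_offsets = {0,1,2,3}, column_indices = {0,1,0}, values = {a,c,b}
*/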

1185: struct PetscScalarToPetscInt
1186: {
1187:   __host__ __device__
1188:   PetscInt operator()(PetscScalar s)
1189:   {
1190:     return (PetscInt)PetscRealPart(s);
1191:   }
1192: };
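/* Editor's note: this functor truncates the real part of a PetscScalar back to a PetscInt. It
   exists because csr2csc permutes only *scalar* arrays, so integer permutation indices are
   smuggled through the conversion as scalars (see the thrust::sequence/thrust::transform pair
   in MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below) and recovered afterwards. */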

1194: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1195: {
1196:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1197:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1198:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1199:   cusparseStatus_t             stat;
1200:   cusparseIndexBase_t          indexBase;
1201:   cudaError_t                  err;
1202:   PetscErrorCode               ierr;

1205:   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1206:   MatSeqAIJCUSPARSECopyToGPU(A);
1207:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1208:   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1209:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1210:   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1211:   if (A->transupdated) return(0);
1212:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1213:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1214:     MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1215:   }
1216:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1217:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1218:     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1219:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1220:     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1221:     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1223:     /* set alpha and beta */
1224:     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1225:     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1226:     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1227:     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1228:     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1229:     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

1231:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1232:       CsrMatrix *matrixT = new CsrMatrix;
1233:       matstructT->mat = matrixT;
1234:       matrixT->num_rows = A->cmap->n;
1235:       matrixT->num_cols = A->rmap->n;
1236:       matrixT->num_entries = a->nz;
1237:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1238:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1239:       matrixT->values = new THRUSTARRAY(a->nz);

1241:       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1242:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1244:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1245:       stat = cusparseCreateCsr(&matstructT->matDescr,
1246:                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1247:                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1248:                                matrixT->values->data().get(),
1249:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1250:                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1251:      #endif
1252:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1253:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1254:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1255:    #else
1256:       CsrMatrix *temp  = new CsrMatrix;
1257:       CsrMatrix *tempT = new CsrMatrix;
1258:       /* First convert HYB to CSR */
1259:       temp->num_rows = A->rmap->n;
1260:       temp->num_cols = A->cmap->n;
1261:       temp->num_entries = a->nz;
1262:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1263:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1264:       temp->values = new THRUSTARRAY(a->nz);

1266:       stat = cusparse_hyb2csr(cusparsestruct->handle,
1267:                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1268:                               temp->values->data().get(),
1269:                               temp->row_offsets->data().get(),
1270:                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1272:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1273:       tempT->num_rows = A->cmap->n; /* the transpose has the flipped dimensions */
1274:       tempT->num_cols = A->rmap->n;
1275:       tempT->num_entries = a->nz;
1276:       tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
1277:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1278:       tempT->values = new THRUSTARRAY(a->nz);

1280:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1281:                               temp->num_cols, temp->num_entries,
1282:                               temp->values->data().get(),
1283:                               temp->row_offsets->data().get(),
1284:                               temp->column_indices->data().get(),
1285:                               tempT->values->data().get(),
1286:                               tempT->column_indices->data().get(),
1287:                               tempT->row_offsets->data().get(),
1288:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1290:       /* Last, convert CSC to HYB */
1291:       cusparseHybMat_t hybMat;
1292:       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1293:       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1294:         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1295:       stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
1296:                               matstructT->descr, tempT->values->data().get(),
1297:                               tempT->row_offsets->data().get(),
1298:                               tempT->column_indices->data().get(),
1299:                               hybMat, 0, partition);CHKERRCUSPARSE(stat);

1301:       /* assign the pointer */
1302:       matstructT->mat = hybMat;
1303:       A->transupdated = PETSC_TRUE;
1304:       /* delete temporaries */
1305:       if (tempT) {
1306:         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1307:         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1308:         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1309:         delete (CsrMatrix*) tempT;
1310:       }
1311:       if (temp) {
1312:         if (temp->values) delete (THRUSTARRAY*) temp->values;
1313:         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1314:         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1315:         delete (CsrMatrix*) temp;
1316:       }
1317:      #endif
1318:     }
1319:   }
1320:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1321:     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1322:     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1323:     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1324:     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1325:     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1326:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1327:     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1328:     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1329:     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1330:     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1331:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1332:       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1333:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1334:       PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1335:     }
1336:     if (!cusparsestruct->csr2csc_i) {
1337:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1338:       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1340:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1341:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1342:       void   *csr2cscBuffer;
1343:       size_t csr2cscBufferSize;
1344:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1345:                                            A->cmap->n, matrix->num_entries,
1346:                                            matrix->values->data().get(),
1347:                                            cusparsestruct->rowoffsets_gpu->data().get(),
1348:                                            matrix->column_indices->data().get(),
1349:                                            matrixT->values->data().get(),
1350:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1351:                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1352:                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1353:       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1354:      #endif

1356:       if (matrix->num_entries) {
1357:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1358:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1359:            I checked every parameter and they were all fine. I have no clue why cusparse complains.

1361:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1362:            should be filled with indexBase. So I just take a shortcut here.
1363:         */
1364:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1365:                               A->cmap->n,matrix->num_entries,
1366:                               csr2csc_a.data().get(),
1367:                               cusparsestruct->rowoffsets_gpu->data().get(),
1368:                               matrix->column_indices->data().get(),
1369:                               matrixT->values->data().get(),
1370:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1371:                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1372:                               CUSPARSE_ACTION_NUMERIC,indexBase,
1373:                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1374:                              #else
1375:                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1376:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1377:                              #endif
1378:       } else {
1379:         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1380:       }

1382:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1383:       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1384:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385:       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1386:      #endif
1387:     }
1388:     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1389:                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1390:                                                      matrixT->values->begin()));
1391:   }
1392:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1393:   /* the compressed row indices are not used for matTranspose */
1394:   matstructT->cprowIndices = NULL;
1395:   /* assign the pointer */
1396:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1397:   A->transupdated = PETSC_TRUE;
1398:   return(0);
1399: }
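/* Editor's note: the csr2csc_i trick above is worth spelling out. csr2csc is run once with
   values = 0,1,2,..., so matrixT->values returns, in transposed order, the position each entry
   had in A; PetscScalarToPetscInt turns that back into the integer permutation csr2csc_i. Every
   later transpose update is then a single device gather instead of a full csr2csc. A minimal
   sketch of that gather, assuming device vectors Avals (CSR values), perm (csr2csc_i) and ATvals
   of matching length (illustrative, not compiled): */
#if 0
  thrust::copy(thrust::device,
               thrust::make_permutation_iterator(Avals.begin(), perm.begin()),
               thrust::make_permutation_iterator(Avals.begin(), perm.end()),
               ATvals.begin()); /* ATvals[k] = Avals[perm[k]] */
#endif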

1401: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1402: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1403: {
1404:   PetscInt                              n = xx->map->n;
1405:   const PetscScalar                     *barray;
1406:   PetscScalar                           *xarray;
1407:   thrust::device_ptr<const PetscScalar> bGPU;
1408:   thrust::device_ptr<PetscScalar>       xGPU;
1409:   cusparseStatus_t                      stat;
1410:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1411:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1412:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1413:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1414:   PetscErrorCode                        ierr;
1415:   cudaError_t                           cerr;

1418:   /* Analyze the matrix and create the transpose ... on the fly */
1419:   if (!loTriFactorT && !upTriFactorT) {
1420:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1421:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1422:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1423:   }

1425:   /* Get the GPU pointers */
1426:   VecCUDAGetArrayWrite(xx,&xarray);
1427:   VecCUDAGetArrayRead(bb,&barray);
1428:   xGPU = thrust::device_pointer_cast(xarray);
1429:   bGPU = thrust::device_pointer_cast(barray);

1431:   PetscLogGpuTimeBegin();
1432:   /* First, reorder with the row permutation */
1433:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1434:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1435:                xGPU);

1437:   /* Next, solve U */
1438:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1439:                         upTriFactorT->csrMat->num_rows,
1440:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1441:                         upTriFactorT->csrMat->num_entries,
1442:                       #endif
1443:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1444:                         upTriFactorT->csrMat->values->data().get(),
1445:                         upTriFactorT->csrMat->row_offsets->data().get(),
1446:                         upTriFactorT->csrMat->column_indices->data().get(),
1447:                         upTriFactorT->solveInfo,
1448:                         xarray, tempGPU->data().get()
1449:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1450:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1451:                       #endif
1452: );CHKERRCUSPARSE(stat);

1454:   /* Then, solve L */
1455:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1456:                         loTriFactorT->csrMat->num_rows,
1457:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1458:                         loTriFactorT->csrMat->num_entries,
1459:                       #endif
1460:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1461:                         loTriFactorT->csrMat->values->data().get(),
1462:                         loTriFactorT->csrMat->row_offsets->data().get(),
1463:                         loTriFactorT->csrMat->column_indices->data().get(),
1464:                         loTriFactorT->solveInfo,
1465:                         tempGPU->data().get(), xarray
1466:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1467:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1468:                       #endif
1469: );CHKERRCUSPARSE(stat);

1471:   /* Last, copy the solution xGPU into a temporary with the column permutation; this cannot be done in place. */
1472:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1473:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1474:                tempGPU->begin());

1476:   /* Copy the temporary to the full solution. */
1477:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

1479:   /* restore */
1480:   VecCUDARestoreArrayRead(bb,&barray);
1481:   VecCUDARestoreArrayWrite(xx,&xarray);
1482:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1483:   PetscLogGpuTimeEnd();
1484:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1485:   return(0);
1486: }
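/* Editor's note: the factor order above is reversed with respect to MatSolve. For A = L*U (up
   to the row/column permutations), A^T = U^T * L^T, so the transposed-U solve runs first and
   the transposed-L solve second; and since the transposes are stored explicitly by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(), both calls keep CUSPARSE_OPERATION_NON_TRANSPOSE. */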

1488: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1489: {
1490:   const PetscScalar                 *barray;
1491:   PetscScalar                       *xarray;
1492:   cusparseStatus_t                  stat;
1493:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1494:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1495:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1496:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1497:   PetscErrorCode                    ierr;
1498:   cudaError_t                       cerr;

1501:   /* Analyze the matrix and create the transpose ... on the fly */
1502:   if (!loTriFactorT && !upTriFactorT) {
1503:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1504:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1505:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1506:   }

1508:   /* Get the GPU pointers */
1509:   VecCUDAGetArrayWrite(xx,&xarray);
1510:   VecCUDAGetArrayRead(bb,&barray);

1512:   PetscLogGpuTimeBegin();
1513:   /* First, solve U */
1514:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1515:                         upTriFactorT->csrMat->num_rows,
1516:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1517:                         upTriFactorT->csrMat->num_entries,
1518:                       #endif
1519:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1520:                         upTriFactorT->csrMat->values->data().get(),
1521:                         upTriFactorT->csrMat->row_offsets->data().get(),
1522:                         upTriFactorT->csrMat->column_indices->data().get(),
1523:                         upTriFactorT->solveInfo,
1524:                         barray, tempGPU->data().get()
1525:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1526:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1527:                       #endif
1528: );CHKERRCUSPARSE(stat);

1530:   /* Then, solve L */
1531:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1532:                         loTriFactorT->csrMat->num_rows,
1533:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1534:                         loTriFactorT->csrMat->num_entries,
1535:                       #endif
1536:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1537:                         loTriFactorT->csrMat->values->data().get(),
1538:                         loTriFactorT->csrMat->row_offsets->data().get(),
1539:                         loTriFactorT->csrMat->column_indices->data().get(),
1540:                         loTriFactorT->solveInfo,
1541:                         tempGPU->data().get(), xarray
1542:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1543:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1544:                       #endif
1545: );CHKERRCUSPARSE(stat);

1547:   /* restore */
1548:   VecCUDARestoreArrayRead(bb,&barray);
1549:   VecCUDARestoreArrayWrite(xx,&xarray);
1550:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1551:   PetscLogGpuTimeEnd();
1552:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1553:   return(0);
1554: }

1556: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1557: {
1558:   const PetscScalar                     *barray;
1559:   PetscScalar                           *xarray;
1560:   thrust::device_ptr<const PetscScalar> bGPU;
1561:   thrust::device_ptr<PetscScalar>       xGPU;
1562:   cusparseStatus_t                      stat;
1563:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1564:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1565:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1566:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1567:   PetscErrorCode                        ierr;
1568:   cudaError_t                           cerr;


1572:   /* Get the GPU pointers */
1573:   VecCUDAGetArrayWrite(xx,&xarray);
1574:   VecCUDAGetArrayRead(bb,&barray);
1575:   xGPU = thrust::device_pointer_cast(xarray);
1576:   bGPU = thrust::device_pointer_cast(barray);

1578:   PetscLogGpuTimeBegin();
1579:   /* First, reorder with the row permutation */
1580:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1581:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1582:                tempGPU->begin());

1584:   /* Next, solve L */
1585:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1586:                         loTriFactor->csrMat->num_rows,
1587:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1588:                         loTriFactor->csrMat->num_entries,
1589:                       #endif
1590:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1591:                         loTriFactor->csrMat->values->data().get(),
1592:                         loTriFactor->csrMat->row_offsets->data().get(),
1593:                         loTriFactor->csrMat->column_indices->data().get(),
1594:                         loTriFactor->solveInfo,
1595:                         tempGPU->data().get(), xarray
1596:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1597:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1598:                       #endif
1599: );CHKERRCUSPARSE(stat);

1601:   /* Then, solve U */
1602:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1603:                         upTriFactor->csrMat->num_rows,
1604:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1605:                         upTriFactor->csrMat->num_entries,
1606:                       #endif
1607:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1608:                         upTriFactor->csrMat->values->data().get(),
1609:                         upTriFactor->csrMat->row_offsets->data().get(),
1610:                         upTriFactor->csrMat->column_indices->data().get(),
1611:                         upTriFactor->solveInfo,
1612:                         xarray, tempGPU->data().get()
1613:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1614:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1615:                       #endif
1616: );CHKERRCUSPARSE(stat);

1618:   /* Last, reorder with the column permutation */
1619:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1620:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1621:                xGPU);

1623:   VecCUDARestoreArrayRead(bb,&barray);
1624:   VecCUDARestoreArrayWrite(xx,&xarray);
1625:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1626:   PetscLogGpuTimeEnd();
1627:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1628:   return(0);
1629: }
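/* Editor's note: the net action of MatSolve above is, loosely, x = Pc * U^{-1} * L^{-1} * Pr * b,
   with the permutations Pr/Pc applied through thrust permutation iterators and the two sparse
   triangular solves done by cusparse. A hedged usage sketch from the options database (option
   names follow PETSc's usual conventions; consult the manual pages):

     ./app -mat_type seqaijcusparse -vec_type cuda -pc_type ilu
*/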

1631: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1632: {
1633:   const PetscScalar                 *barray;
1634:   PetscScalar                       *xarray;
1635:   cusparseStatus_t                  stat;
1636:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1637:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1638:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1639:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1640:   PetscErrorCode                    ierr;
1641:   cudaError_t                       cerr;

1644:   /* Get the GPU pointers */
1645:   VecCUDAGetArrayWrite(xx,&xarray);
1646:   VecCUDAGetArrayRead(bb,&barray);

1648:   PetscLogGpuTimeBegin();
1649:   /* First, solve L */
1650:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1651:                         loTriFactor->csrMat->num_rows,
1652:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1653:                         loTriFactor->csrMat->num_entries,
1654:                       #endif
1655:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1656:                         loTriFactor->csrMat->values->data().get(),
1657:                         loTriFactor->csrMat->row_offsets->data().get(),
1658:                         loTriFactor->csrMat->column_indices->data().get(),
1659:                         loTriFactor->solveInfo,
1660:                         barray, tempGPU->data().get()
1661:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1662:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1663:                       #endif
1664: );CHKERRCUSPARSE(stat);

1666:   /* Next, solve U */
1667:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1668:                         upTriFactor->csrMat->num_rows,
1669:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1670:                         upTriFactor->csrMat->num_entries,
1671:                       #endif
1672:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1673:                         upTriFactor->csrMat->values->data().get(),
1674:                         upTriFactor->csrMat->row_offsets->data().get(),
1675:                         upTriFactor->csrMat->column_indices->data().get(),
1676:                         upTriFactor->solveInfo,
1677:                         tempGPU->data().get(), xarray
1678:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1679:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1680:                       #endif
1681: );CHKERRCUSPARSE(stat);

1683:   VecCUDARestoreArrayRead(bb,&barray);
1684:   VecCUDARestoreArrayWrite(xx,&xarray);
1685:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1686:   PetscLogGpuTimeEnd();
1687:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1688:   return(0);
1689: }
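/* Editor's note: the _NaturalOrdering variants are used when the factorization permutations are
   the identity; they skip the two permutation gathers and solve straight from barray into
   xarray through the tempGPU work vector, saving two device copies per solve. */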

1691: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1692: {
1693:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1694:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1695:   cudaError_t        cerr;
1696:   PetscErrorCode     ierr;

1699:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1700:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1702:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1703:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1704:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1705:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1706:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1707:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1708:   }
1709:   return(0);
1710: }
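/* Editor's note: offloadmask tracks where the current values live. PETSC_OFFLOAD_GPU means the
   device copy is newer, so the memcpy above runs and the state becomes PETSC_OFFLOAD_BOTH;
   MatSeqAIJGetArray below hands the host pointer out for writing and therefore downgrades the
   state to PETSC_OFFLOAD_CPU. */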

1712: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1713: {
1714:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1718:   MatSeqAIJCUSPARSECopyFromGPU(A);
1719:   *array = a->a;
1720:   A->offloadmask = PETSC_OFFLOAD_CPU;
1721:   return(0);
1722: }

1724: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1725: {
1726:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1727:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1728:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1729:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1730:   PetscErrorCode               ierr;
1731:   cusparseStatus_t             stat;
1732:   PetscBool                    both = PETSC_TRUE;
1733:   cudaError_t                  err;

1736:   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1737:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1738:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1739:       CsrMatrix *matrix;
1740:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1742:       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1743:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1744:       matrix->values->assign(a->a, a->a+a->nz);
1745:       err  = WaitForCUDA();CHKERRCUDA(err);
1746:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1747:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1748:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1749:     } else {
1750:       PetscInt nnz;
1751:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1752:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1753:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1754:       delete cusparsestruct->workVector;
1755:       delete cusparsestruct->rowoffsets_gpu;
1756:       cusparsestruct->workVector = NULL;
1757:       cusparsestruct->rowoffsets_gpu = NULL;
1758:       try {
1759:         if (a->compressedrow.use) {
1760:           m    = a->compressedrow.nrows;
1761:           ii   = a->compressedrow.i;
1762:           ridx = a->compressedrow.rindex;
1763:         } else {
1764:           m    = A->rmap->n;
1765:           ii   = a->i;
1766:           ridx = NULL;
1767:         }
1768:         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1769:         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1770:         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1771:         else nnz = a->nz;

1773:         /* create cusparse matrix */
1774:         cusparsestruct->nrows = m;
1775:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1776:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1777:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1778:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1780:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1781:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1782:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1783:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1784:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1785:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1786:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1788:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1789:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1790:           /* set the matrix */
1791:           CsrMatrix *mat = new CsrMatrix;
1792:           mat->num_rows = m;
1793:           mat->num_cols = A->cmap->n;
1794:           mat->num_entries = nnz;
1795:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1796:           mat->row_offsets->assign(ii, ii + m+1);

1798:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1799:           mat->column_indices->assign(a->j, a->j+nnz);

1801:           mat->values = new THRUSTARRAY(nnz);
1802:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1804:           /* assign the pointer */
1805:           matstruct->mat = mat;
1806:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1807:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1808:             stat = cusparseCreateCsr(&matstruct->matDescr,
1809:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1810:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1811:                                     mat->values->data().get(),
1812:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1813:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1814:           }
1815:          #endif
1816:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1817:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1818:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1819:          #else
1820:           CsrMatrix *mat = new CsrMatrix;
1821:           mat->num_rows = m;
1822:           mat->num_cols = A->cmap->n;
1823:           mat->num_entries = nnz;
1824:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1825:           mat->row_offsets->assign(ii, ii + m+1);

1827:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1828:           mat->column_indices->assign(a->j, a->j+nnz);

1830:           mat->values = new THRUSTARRAY(nnz);
1831:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1833:           cusparseHybMat_t hybMat;
1834:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1835:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1836:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1837:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1838:               matstruct->descr, mat->values->data().get(),
1839:               mat->row_offsets->data().get(),
1840:               mat->column_indices->data().get(),
1841:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1842:           /* assign the pointer */
1843:           matstruct->mat = hybMat;

1845:           if (mat) {
1846:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1847:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1848:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1849:             delete (CsrMatrix*)mat;
1850:           }
1851:          #endif
1852:         }

1854:         /* assign the compressed row indices */
1855:         if (a->compressedrow.use) {
1856:           cusparsestruct->workVector = new THRUSTARRAY(m);
1857:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1858:           matstruct->cprowIndices->assign(ridx,ridx+m);
1859:           tmp = m;
1860:         } else {
1861:           cusparsestruct->workVector = NULL;
1862:           matstruct->cprowIndices    = NULL;
1863:           tmp = 0;
1864:         }
1865:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1867:         /* assign the pointer */
1868:         cusparsestruct->mat = matstruct;
1869:       } catch(char *ex) {
1870:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1871:       }
1872:       err  = WaitForCUDA();CHKERRCUDA(err);
1873:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1874:       cusparsestruct->nonzerostate = A->nonzerostate;
1875:     }
1876:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1877:   }
1878:   return(0);
1879: }
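/* Editor's note: MatSeqAIJCUSPARSECopyToGPU has two paths. If the nonzero pattern is unchanged
   (matching nonzerostate) and the format is CSR, only the values array is re-assigned and the
   cached transpose values are invalidated; otherwise the whole Mat_SeqAIJCUSPARSEMultStruct is
   torn down and rebuilt, including the compressed-row (cprowIndices) handling used for matrices
   with many empty rows. */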

1881: struct VecCUDAPlusEquals
1882: {
1883:   template <typename Tuple>
1884:   __host__ __device__
1885:   void operator()(Tuple t)
1886:   {
1887:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1888:   }
1889: };

1891: struct VecCUDAEquals
1892: {
1893:   template <typename Tuple>
1894:   __host__ __device__
1895:   void operator()(Tuple t)
1896:   {
1897:     thrust::get<1>(t) = thrust::get<0>(t);
1898:   }
1899: };

1901: struct VecCUDAEqualsReverse
1902: {
1903:   template <typename Tuple>
1904:   __host__ __device__
1905:   void operator()(Tuple t)
1906:   {
1907:     thrust::get<0>(t) = thrust::get<1>(t);
1908:   }
1909: };
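/* Editor's note: these tuple functors are meant for thrust::for_each over a zip iterator, which
   walks two sequences in lockstep. A minimal sketch, assuming device vectors x and y of equal
   length (illustrative, not compiled): */
#if 0
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(x.end(),   y.end())),
                   VecCUDAPlusEquals()); /* y[i] += x[i] */
#endif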

1911: struct MatMatCusparse {
1912:   PetscBool             cisdense;
1913:   PetscScalar           *Bt;
1914:   Mat                   X;
1915:   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1916:   PetscLogDouble        flops;
1917:   CsrMatrix             *Bcsr;
1918: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1919:   cusparseSpMatDescr_t  matSpBDescr;
1920:   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1921:   cusparseDnMatDescr_t  matBDescr;
1922:   cusparseDnMatDescr_t  matCDescr;
1923:   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1924:   size_t                mmBufferSize;
1925:   void                  *mmBuffer;
1926:   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1927:   cusparseSpGEMMDescr_t spgemmDesc;
1928: #endif
1929: };

1931: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1932: {
1933:   PetscErrorCode   ierr;
1934:   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1935:   cudaError_t      cerr;
1936:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1937:   cusparseStatus_t stat;
1938:  #endif

1941:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1942:   delete mmdata->Bcsr;
1943:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1944:   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1945:   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1946:   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1947:   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1948:   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1949:   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1950:  #endif
1951:   MatDestroy(&mmdata->X);
1952:   PetscFree(data);
1953:   return(0);
1954: }

1956: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1958: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1959: {
1960:   Mat_Product                  *product = C->product;
1961:   Mat                          A,B;
1962:   PetscInt                     m,n,blda,clda;
1963:   PetscBool                    flg,biscuda;
1964:   Mat_SeqAIJCUSPARSE           *cusp;
1965:   cusparseStatus_t             stat;
1966:   cusparseOperation_t          opA;
1967:   const PetscScalar            *barray;
1968:   PetscScalar                  *carray;
1969:   PetscErrorCode               ierr;
1970:   MatMatCusparse               *mmdata;
1971:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1972:   CsrMatrix                    *csrmat;
1973:   cudaError_t                  cerr;

1976:   MatCheckProduct(C,1);
1977:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
1978:   mmdata = (MatMatCusparse*)product->data;
1979:   A    = product->A;
1980:   B    = product->B;
1981:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1982:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
1983:   /* currently MatSeqAIJCUSPARSECopyToGPU does not copy if the matrix is bound to the CPU;
1984:      instead of silently accepting a wrong answer, I prefer to raise an error */
1985:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1986:   MatSeqAIJCUSPARSECopyToGPU(A);
1987:   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
1988:   switch (product->type) {
1989:   case MATPRODUCT_AB:
1990:   case MATPRODUCT_PtAP:
1991:     mat = cusp->mat;
1992:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1993:     m   = A->rmap->n;
1994:     n   = B->cmap->n;
1995:     break;
1996:   case MATPRODUCT_AtB:
1997:     if (!A->form_explicit_transpose) {
1998:       mat = cusp->mat;
1999:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2000:     } else {
2001:       MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2002:       mat  = cusp->matTranspose;
2003:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2004:     }
2005:     m = A->cmap->n;
2006:     n = B->cmap->n;
2007:     break;
2008:   case MATPRODUCT_ABt:
2009:   case MATPRODUCT_RARt:
2010:     mat = cusp->mat;
2011:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2012:     m   = A->rmap->n;
2013:     n   = B->rmap->n;
2014:     break;
2015:   default:
2016:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2017:   }
2018:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2019:   csrmat = (CsrMatrix*)mat->mat;
2020:   /* if the user passed a CPU matrix, copy the data to the GPU */
2021:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2022:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2023:   MatDenseCUDAGetArrayRead(B,&barray);

2025:   MatDenseGetLDA(B,&blda);
2026:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2027:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2028:     MatDenseGetLDA(mmdata->X,&clda);
2029:   } else {
2030:     MatDenseCUDAGetArrayWrite(C,&carray);
2031:     MatDenseGetLDA(C,&clda);
2032:   }

2034:   PetscLogGpuTimeBegin();
2035:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2036:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2037:   /* (re)allocate mmBuffer if not initialized or if the LDAs have changed */
2038:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2039:     size_t mmBufferSize;
2040:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2041:     if (!mmdata->matBDescr) {
2042:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2043:       mmdata->Blda = blda;
2044:     }

2046:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2047:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2048:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2049:       mmdata->Clda = clda;
2050:     }

2052:     if (!mat->matDescr) {
2053:       stat = cusparseCreateCsr(&mat->matDescr,
2054:                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2055:                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2056:                                csrmat->values->data().get(),
2057:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2058:                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2059:     }
2060:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2061:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2062:                                    mmdata->matCDescr,cusparse_scalartype,
2063:                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2064:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2065:       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2066:       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2067:       mmdata->mmBufferSize = mmBufferSize;
2068:     }
2069:     mmdata->initialized = PETSC_TRUE;
2070:   } else {
2071:     /* to be safe, always update pointers of the mats */
2072:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2073:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2074:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2075:   }

2077:   /* do cusparseSpMM, which supports transpose on B */
2078:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2079:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2080:                       mmdata->matCDescr,cusparse_scalartype,
2081:                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2082:  #else
2083:   PetscInt k;
2084:   /* cusparseXcsrmm does not support transpose on B */
2085:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2086:     cublasHandle_t cublasv2handle;
2087:     cublasStatus_t cerr;

2089:     PetscCUBLASGetHandle(&cublasv2handle);
2090:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2091:                        B->cmap->n,B->rmap->n,
2092:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2093:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2094:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2095:     blda = B->cmap->n;
2096:     k    = B->cmap->n;
2097:   } else {
2098:     k    = B->rmap->n;
2099:   }

2101:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2102:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2103:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2104:                            csrmat->values->data().get(),
2105:                            csrmat->row_offsets->data().get(),
2106:                            csrmat->column_indices->data().get(),
2107:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2108:                            carray,clda);CHKERRCUSPARSE(stat);
2109:  #endif
2110:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2111:   PetscLogGpuTimeEnd();
2112:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2113:   MatDenseCUDARestoreArrayRead(B,&barray);
2114:   if (product->type == MATPRODUCT_RARt) {
2115:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2116:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2117:   } else if (product->type == MATPRODUCT_PtAP) {
2118:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2119:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2120:   } else {
2121:     MatDenseCUDARestoreArrayWrite(C,&carray);
2122:   }
2123:   if (mmdata->cisdense) {
2124:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2125:   }
2126:   if (!biscuda) {
2127:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2128:   }
2129:   return(0);
2130: }
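/* Editor's note: with CUDA >= 11 the numeric phase above uses the generic SpMM API. The required
   sequence is: create a cusparseSpMatDescr_t for A and cusparseDnMatDescr_t's for B and C, query
   the workspace with cusparseSpMM_bufferSize, allocate it, then call cusparseSpMM; descriptors
   and buffer are cached in MatMatCusparse and only the value pointers are refreshed on reuse.
   Condensed sketch (illustrative, not compiled; alpha_one/beta_zero are device-resident because
   the pointer mode is CUSPARSE_POINTER_MODE_DEVICE): */
#if 0
  size_t bufsz; void *buf;
  stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,mat->matDescr,
                                 mmdata->matBDescr,mat->beta_zero,mmdata->matCDescr,
                                 cusparse_scalartype,cusp->spmmAlg,&bufsz);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&buf,bufsz);CHKERRCUDA(cerr);
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,mat->matDescr,
                      mmdata->matBDescr,mat->beta_zero,mmdata->matCDescr,
                      cusparse_scalartype,cusp->spmmAlg,buf);CHKERRCUSPARSE(stat);
#endif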

2132: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2133: {
2134:   Mat_Product        *product = C->product;
2135:   Mat                A,B;
2136:   PetscInt           m,n;
2137:   PetscBool          cisdense,flg;
2138:   PetscErrorCode     ierr;
2139:   MatMatCusparse     *mmdata;
2140:   Mat_SeqAIJCUSPARSE *cusp;

2143:   MatCheckProduct(C,1);
2144:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2145:   A    = product->A;
2146:   B    = product->B;
2147:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2148:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2149:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2150:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2151:   switch (product->type) {
2152:   case MATPRODUCT_AB:
2153:     m = A->rmap->n;
2154:     n = B->cmap->n;
2155:     break;
2156:   case MATPRODUCT_AtB:
2157:     m = A->cmap->n;
2158:     n = B->cmap->n;
2159:     break;
2160:   case MATPRODUCT_ABt:
2161:     m = A->rmap->n;
2162:     n = B->rmap->n;
2163:     break;
2164:   case MATPRODUCT_PtAP:
2165:     m = B->cmap->n;
2166:     n = B->cmap->n;
2167:     break;
2168:   case MATPRODUCT_RARt:
2169:     m = B->rmap->n;
2170:     n = B->rmap->n;
2171:     break;
2172:   default:
2173:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2174:   }
2175:   MatSetSizes(C,m,n,m,n);
2176:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2177:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2178:   MatSetType(C,MATSEQDENSECUDA);

2180:   /* product data */
2181:   PetscNew(&mmdata);
2182:   mmdata->cisdense = cisdense;
2183:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2184:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2185:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2186:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2187:   }
2188:  #endif
2189:   /* for these products we need intermediate storage */
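  /* (for RARt, X holds A*R^T and the numeric phase computes C = R*X; for PtAP, X holds A*P
     and the numeric phase computes C = P^T*X; see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) */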
2190:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2191:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2192:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2193:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2194:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2195:     } else {
2196:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2197:     }
2198:   }
2199:   C->product->data    = mmdata;
2200:   C->product->destroy = MatDestroy_MatMatCusparse;

2202:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2203:   return(0);
2204: }

2206: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2207: {
2208:   Mat_Product                  *product = C->product;
2209:   Mat                          A,B;
2210:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2211:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2212:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2213:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2214:   PetscBool                    flg;
2215:   PetscErrorCode               ierr;
2216:   cusparseStatus_t             stat;
2217:   cudaError_t                  cerr;
2218:   MatProductType               ptype;
2219:   MatMatCusparse               *mmdata;
2220: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2221:   cusparseSpMatDescr_t         BmatSpDescr;
2222: #endif

2225:   MatCheckProduct(C,1);
2226:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2227:   PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2228:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2229:   mmdata = (MatMatCusparse*)C->product->data;
2230:   A = product->A;
2231:   B = product->B;
2232:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2233:     mmdata->reusesym = PETSC_FALSE;
2234:     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2235:     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2236:     Cmat = Ccusp->mat;
2237:     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2238:     Ccsr = (CsrMatrix*)Cmat->mat;
2239:     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2240:     goto finalize;
2241:   }
2242:   if (!c->nz) goto finalize;
2243:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2244:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2245:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2246:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2247:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2248:   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2249:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2250:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2251:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2252:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2253:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2254:   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2255:   MatSeqAIJCUSPARSECopyToGPU(A);
2256:   MatSeqAIJCUSPARSECopyToGPU(B);

2258:   ptype = product->type;
2259:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2260:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2261:   switch (ptype) {
2262:   case MATPRODUCT_AB:
2263:     Amat = Acusp->mat;
2264:     Bmat = Bcusp->mat;
2265:     break;
2266:   case MATPRODUCT_AtB:
2267:     Amat = Acusp->matTranspose;
2268:     Bmat = Bcusp->mat;
2269:     break;
2270:   case MATPRODUCT_ABt:
2271:     Amat = Acusp->mat;
2272:     Bmat = Bcusp->matTranspose;
2273:     break;
2274:   default:
2275:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2276:   }
2277:   Cmat = Ccusp->mat;
2278:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2279:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2280:   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2281:   Acsr = (CsrMatrix*)Amat->mat;
2282:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2283:   Ccsr = (CsrMatrix*)Cmat->mat;
2284:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2285:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2286:   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2287:   PetscLogGpuTimeBegin();
2288: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2289:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2290:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2291:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2292:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2293:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2294:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2295:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2296:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2297: #else
2298:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2299:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2300:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2301:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2302:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2303: #endif
2304:   PetscLogGpuFlops(mmdata->flops);
2305:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2306:   PetscLogGpuTimeEnd();
2307:   C->offloadmask = PETSC_OFFLOAD_GPU;
2308: finalize:
2309:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2310:   PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2311:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2312:   PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2313:   c->reallocs         = 0;
2314:   C->info.mallocs    += 0;
2315:   C->info.nz_unneeded = 0;
2316:   C->assembled = C->was_assembled = PETSC_TRUE;
2317:   C->num_ass++;
2318:   return(0);
2319: }

2321: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2322: {
2323:   Mat_Product                  *product = C->product;
2324:   Mat                          A,B;
2325:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2326:   Mat_SeqAIJ                   *a,*b,*c;
2327:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2328:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2329:   PetscInt                     i,j,m,n,k;
2330:   PetscBool                    flg;
2331:   PetscErrorCode               ierr;
2332:   cusparseStatus_t             stat;
2333:   cudaError_t                  cerr;
2334:   MatProductType               ptype;
2335:   MatMatCusparse               *mmdata;
2336:   PetscLogDouble               flops;
2337:   PetscBool                    biscompressed,ciscompressed;
2338: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2339:   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2340:   size_t                       bufSize2;
2341:   cusparseSpMatDescr_t         BmatSpDescr;
2342: #else
2343:   int                          cnz;
2344: #endif

2347:   MatCheckProduct(C,1);
2348:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2349:   A    = product->A;
2350:   B    = product->B;
2351:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2352:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2353:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2354:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2355:   a = (Mat_SeqAIJ*)A->data;
2356:   b = (Mat_SeqAIJ*)B->data;
2357:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2358:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2359:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2360:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

2362:   /* product data */
2363:   PetscNew(&mmdata);
2364:   C->product->data    = mmdata;
2365:   C->product->destroy = MatDestroy_MatMatCusparse;

2367:   MatSeqAIJCUSPARSECopyToGPU(A);
2368:   MatSeqAIJCUSPARSECopyToGPU(B);
2369:   ptype = product->type;
2370:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2371:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2372:   biscompressed = PETSC_FALSE;
2373:   ciscompressed = PETSC_FALSE;
2374:   switch (ptype) {
2375:   case MATPRODUCT_AB:
2376:     m = A->rmap->n;
2377:     n = B->cmap->n;
2378:     k = A->cmap->n;
2379:     Amat = Acusp->mat;
2380:     Bmat = Bcusp->mat;
2381:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2382:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2383:     break;
2384:   case MATPRODUCT_AtB:
2385:     m = A->cmap->n;
2386:     n = B->cmap->n;
2387:     k = A->rmap->n;
2388:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2389:     Amat = Acusp->matTranspose;
2390:     Bmat = Bcusp->mat;
2391:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2392:     break;
2393:   case MATPRODUCT_ABt:
2394:     m = A->rmap->n;
2395:     n = B->rmap->n;
2396:     k = A->cmap->n;
2397:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2398:     Amat = Acusp->mat;
2399:     Bmat = Bcusp->matTranspose;
2400:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2401:     break;
2402:   default:
2403:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2404:   }

2406:   /* create cusparse matrix */
2407:   MatSetSizes(C,m,n,m,n);
2408:   MatSetType(C,MATSEQAIJCUSPARSE);
2409:   c     = (Mat_SeqAIJ*)C->data;
2410:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2411:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2412:   Ccsr  = new CsrMatrix;

2414:   c->compressedrow.use = ciscompressed;
2415:   if (c->compressedrow.use) { /* if a uses compressed row storage, then c will use it too */
2416:     c->compressedrow.nrows = a->compressedrow.nrows;
2417:     PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2418:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2419:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2420:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2421:     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2422:   } else {
2423:     c->compressedrow.nrows  = 0;
2424:     c->compressedrow.i      = NULL;
2425:     c->compressedrow.rindex = NULL;
2426:     Ccusp->workVector       = NULL;
2427:     Cmat->cprowIndices      = NULL;
2428:   }
2429:   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2430:   Ccusp->mat      = Cmat;
2431:   Ccusp->mat->mat = Ccsr;
2432:   Ccsr->num_rows    = Ccusp->nrows;
2433:   Ccsr->num_cols    = n;
2434:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2435:   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2436:   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2437:   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2438:   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2439:   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2440:   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2441:   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2442:   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2443:   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2444:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2445:     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2446:     c->nz = 0;
2447:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2448:     Ccsr->values = new THRUSTARRAY(c->nz);
2449:     goto finalizesym;
2450:   }

2452:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2453:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2454:   Acsr = (CsrMatrix*)Amat->mat;
2455:   if (!biscompressed) {
2456:     Bcsr = (CsrMatrix*)Bmat->mat;
2457: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2458:     BmatSpDescr = Bmat->matDescr;
2459: #endif
2460:   } else { /* we need to use row offsets for the full matrix */
2461:     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2462:     Bcsr = new CsrMatrix;
2463:     Bcsr->num_rows       = B->rmap->n;
2464:     Bcsr->num_cols       = cBcsr->num_cols;
2465:     Bcsr->num_entries    = cBcsr->num_entries;
2466:     Bcsr->column_indices = cBcsr->column_indices;
2467:     Bcsr->values         = cBcsr->values;
2468:     if (!Bcusp->rowoffsets_gpu) {
2469:       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2470:       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2471:       PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2472:     }
2473:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2474:     mmdata->Bcsr = Bcsr;
2475: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2476:     if (Bcsr->num_rows && Bcsr->num_cols) {
2477:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2478:                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2479:                                Bcsr->values->data().get(),
2480:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2481:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2482:     }
2483:     BmatSpDescr = mmdata->matSpBDescr;
2484: #endif
2485:   }
2486:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2487:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2488:   /* precompute the flop count */
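  /* for C = A*B, each nonzero A(i,j) scales row j of B, contributing one multiply and one
     add per entry of that row, i.e. 2*nnz(B(j,:)) flops */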
2489:   if (ptype == MATPRODUCT_AB) {
2490:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2491:       const PetscInt st = a->i[i];
2492:       const PetscInt en = a->i[i+1];
2493:       for (j=st; j<en; j++) {
2494:         const PetscInt brow = a->j[j];
2495:         flops += 2.*(b->i[brow+1] - b->i[brow]);
2496:       }
2497:     }
2498:   } else if (ptype == MATPRODUCT_AtB) {
2499:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2500:       const PetscInt anzi = a->i[i+1] - a->i[i];
2501:       const PetscInt bnzi = b->i[i+1] - b->i[i];
2502:       flops += (2.*anzi)*bnzi;
2503:     }
2504:   } else { /* TODO */
2505:     flops = 0.;
2506:   }

2508:   mmdata->flops = flops;
2509:   PetscLogGpuTimeBegin();
2510: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2511:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2512:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2513:                            NULL, NULL, NULL,
2514:                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2515:                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2516:   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
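  /* The cusparseSpGEMM sequence used below: workEstimation (first to query the buffer size,
     then again with the allocated buffer), compute (again query-then-fill), cusparseSpMatGetSize()
     to obtain nnz(C), allocation of the C arrays, cusparseCsrSetPointers(), and finally
     cusparseSpGEMM_copy() to write the result */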
2517:   /* query the number of bytes of external memory needed (bufSize2) */
2518:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2519:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2520:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2521:                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2522:   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2523:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2524:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2525:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2526:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2527:                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2528:   /* query the buffer size needed by the compute step */
2529:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2530:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2531:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2532:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2533:   /* Neither the CUSPARSE documentation nor the API is clear here:
2534:      we need both buffers to perform the operations properly!
2535:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2536:      it only appears in the workEstimation calls, but it seems to be needed in compute,
2537:      so probably its address is stored in the descriptor! What a messy API... */
2538:   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2539:   /* compute the intermediate product of A * B */
2540:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2541:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2542:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2543:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2544:   /* get matrix C non-zero entries C_nnz1 */
2545:   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2546:   c->nz = (PetscInt) C_nnz1;
2547:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2548:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2549:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2550:   Ccsr->values = new THRUSTARRAY(c->nz);
2551:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2552:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2553:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2554:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2557: #else
2558:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2559:   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2560:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2561:                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2562:                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2563:                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2564:   c->nz = cnz;
2565:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2566:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2567:   Ccsr->values = new THRUSTARRAY(c->nz);
2568:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2570:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2571:   /* with the old gemm interface (removed in CUDA 11.0) we cannot compute the symbolic factorization only.
2572:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy
2573:      when D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2574:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2575:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2576:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2577:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2578:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2579: #endif
2580:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2581:   PetscLogGpuFlops(mmdata->flops);
2582:   PetscLogGpuTimeEnd();
2583: finalizesym:
2584:   c->singlemalloc = PETSC_FALSE;
2585:   c->free_a       = PETSC_TRUE;
2586:   c->free_ij      = PETSC_TRUE;
2587:   PetscMalloc1(m+1,&c->i);
2588:   PetscMalloc1(c->nz,&c->j);
2589:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
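    /* assigning a THRUSTINTARRAY32 to a THRUSTINTARRAY performs the elementwise 32-to-64 bit
       conversion on the device; the converted arrays are then copied to the host below */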
2590:     PetscInt *d_i = c->i;
2591:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2592:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2593:     ii   = *Ccsr->row_offsets;
2594:     jj   = *Ccsr->column_indices;
2595:     if (ciscompressed) d_i = c->compressedrow.i;
2596:     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2597:     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2598:   } else {
2599:     PetscInt *d_i = c->i;
2600:     if (ciscompressed) d_i = c->compressedrow.i;
2601:     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2602:     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2603:   }
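  /* A small worked example of the expansion below (values chosen for illustration): with m = 4,
     nonzero rows rindex = {1,3} and compressed offsets compressedrow.i = {0,2,5}, the expanded
     host row offsets become c->i = {0,0,2,2,5}; empty rows simply repeat the previous offset */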
2604:   if (ciscompressed) { /* need to expand host row offsets */
2605:     PetscInt r = 0;
2606:     c->i[0] = 0;
2607:     for (k = 0; k < c->compressedrow.nrows; k++) {
2608:       const PetscInt next = c->compressedrow.rindex[k];
2609:       const PetscInt old = c->compressedrow.i[k];
2610:       for (; r < next; r++) c->i[r+1] = old;
2611:     }
2612:     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2613:   }
2614:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2615:   PetscMalloc1(m,&c->ilen);
2616:   PetscMalloc1(m,&c->imax);
2617:   c->maxnz = c->nz;
2618:   c->nonzerorowcnt = 0;
2619:   c->rmax = 0;
2620:   for (k = 0; k < m; k++) {
2621:     const PetscInt nn = c->i[k+1] - c->i[k];
2622:     c->ilen[k] = c->imax[k] = nn;
2623:     c->nonzerorowcnt += (PetscInt)!!nn;
2624:     c->rmax = PetscMax(c->rmax,nn);
2625:   }
2626:   MatMarkDiagonal_SeqAIJ(C);
2627:   PetscMalloc1(c->nz,&c->a);
2628:   Ccsr->num_entries = c->nz;

2630:   C->nonzerostate++;
2631:   PetscLayoutSetUp(C->rmap);
2632:   PetscLayoutSetUp(C->cmap);
2633:   Ccusp->nonzerostate = C->nonzerostate;
2634:   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2635:   C->preallocated  = PETSC_TRUE;
2636:   C->assembled     = PETSC_FALSE;
2637:   C->was_assembled = PETSC_FALSE;
2638:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2639:     mmdata->reusesym = PETSC_TRUE;
2640:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2641:   }
2642:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2643:   return(0);
2644: }

2646: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2648: /* handles sparse or dense B */
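/* Dispatch, roughly: a dense B goes to the SeqDENSECUDA kernels above (or back to the CPU
   SeqAIJ-SeqDense path when A is bound to the CPU), sparse CUSPARSE B and C go to the
   SpGEMM-based kernels, and everything else falls back to the CPU SeqAIJ implementation */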
2649: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2650: {
2651:   Mat_Product    *product = mat->product;
2653:   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

2656:   MatCheckProduct(mat,1);
2657:   PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2658:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2659:     PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2660:   }
2661:   if (product->type == MATPRODUCT_ABC) {
2662:     Ciscusp = PETSC_FALSE;
2663:     if (!product->C->boundtocpu) {
2664:       PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2665:     }
2666:   }
2667:   if (isdense) {
2668:     switch (product->type) {
2669:     case MATPRODUCT_AB:
2670:     case MATPRODUCT_AtB:
2671:     case MATPRODUCT_ABt:
2672:     case MATPRODUCT_PtAP:
2673:     case MATPRODUCT_RARt:
2674:      if (product->A->boundtocpu) {
2675:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2676:       } else {
2677:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2678:       }
2679:       break;
2680:     case MATPRODUCT_ABC:
2681:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2682:       break;
2683:     default:
2684:       break;
2685:     }
2686:   } else if (Biscusp && Ciscusp) {
2687:     switch (product->type) {
2688:     case MATPRODUCT_AB:
2689:     case MATPRODUCT_AtB:
2690:     case MATPRODUCT_ABt:
2691:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2692:       break;
2693:     case MATPRODUCT_PtAP:
2694:     case MATPRODUCT_RARt:
2695:     case MATPRODUCT_ABC:
2696:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2697:       break;
2698:     default:
2699:       break;
2700:     }
2701:   } else { /* fallback for AIJ */
2702:     MatProductSetFromOptions_SeqAIJ(mat);
2703:   }
2704:   return(0);
2705: }

2707: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2708: {

2712:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2713:   return(0);
2714: }

2716: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2717: {

2721:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2722:   return(0);
2723: }

2725: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2726: {

2730:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2731:   return(0);
2732: }

2734: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2735: {

2739:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2740:   return(0);
2741: }

2743: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2744: {

2748:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2749:   return(0);
2750: }

2752: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2753: {
2754:   int i = blockIdx.x*blockDim.x + threadIdx.x;
2755:   if (i < n) y[idx[i]] += x[i];
2756: }
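/* ScatterAdd is launched below with a 1-D grid of (n+255)/256 blocks of 256 threads, i.e. at
   least one thread per entry; the i < n guard in the kernel masks off the excess threads */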

2758: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2759: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2760: {
2761:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2762:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2763:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2764:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2765:   PetscErrorCode               ierr;
2766:   cudaError_t                  cerr;
2767:   cusparseStatus_t             stat;
2768:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2769:   PetscBool                    compressed;
2770: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2771:   PetscInt                     nx,ny;
2772: #endif

2775:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2776:   if (!a->nonzerorowcnt) {
2777:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2778:     else {VecCopy_SeqCUDA(yy,zz);}
2779:     return(0);
2780:   }
2781:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2782:   MatSeqAIJCUSPARSECopyToGPU(A);
2783:   if (!trans) {
2784:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2785:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2786:   } else {
2787:     if (herm || !A->form_explicit_transpose) {
2788:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2789:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2790:     } else {
2791:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2792:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2793:     }
2794:   }
2795:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2796:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2798:   try {
2799:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2800:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2801:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

2803:     PetscLogGpuTimeBegin();
2804:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2805:       /* z = A x + beta y.
2806:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2807:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2808:       */
2809:       xptr = xarray;
2810:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2811:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2812:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2813:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2814:           allocated to accommodate different uses. So we get the length info directly from mat.
2815:        */
2816:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2817:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2818:         nx = mat->num_cols;
2819:         ny = mat->num_rows;
2820:       }
2821:      #endif
2822:     } else {
2823:       /* z = A^T x + beta y
2824:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2825:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2826:        */
2827:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2828:       dptr = zarray;
2829:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2830:       if (compressed) { /* Scatter x to work vector */
2831:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2832:         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2833:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2834:                          VecCUDAEqualsReverse());
2835:       }
2836:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2837:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2838:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2839:         nx = mat->num_rows;
2840:         ny = mat->num_cols;
2841:       }
2842:      #endif
2843:     }

2845:     /* csr_spmv does y = alpha op(A) x + beta y */
2846:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2847:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2848:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2849:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2850:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2851:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2852:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2853:                                 matstruct->matDescr,
2854:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2855:                                 matstruct->cuSpMV[opA].vecYDescr,
2856:                                 cusparse_scalartype,
2857:                                 cusparsestruct->spmvAlg,
2858:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2859:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2861:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2862:       } else {
2863:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2864:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2865:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2866:       }

2868:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2869:                                matstruct->alpha_one,
2870:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2871:                                matstruct->cuSpMV[opA].vecXDescr,
2872:                                beta,
2873:                                matstruct->cuSpMV[opA].vecYDescr,
2874:                                cusparse_scalartype,
2875:                                cusparsestruct->spmvAlg,
2876:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2877:      #else
2878:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2879:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2880:                                mat->num_rows, mat->num_cols,
2881:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2882:                                mat->values->data().get(), mat->row_offsets->data().get(),
2883:                                mat->column_indices->data().get(), xptr, beta,
2884:                                dptr);CHKERRCUSPARSE(stat);
2885:      #endif
2886:     } else {
2887:       if (cusparsestruct->nrows) {
2888:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2889:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2890:        #else
2891:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2892:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2893:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2894:                                  xptr, beta,
2895:                                  dptr);CHKERRCUSPARSE(stat);
2896:        #endif
2897:       }
2898:     }
2899:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2900:     PetscLogGpuTimeEnd();

2902:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2903:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2904:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2905:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2906:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2907:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2908:         }
2909:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2910:         VecSet_SeqCUDA(zz,0);
2911:       }

2913:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2914:       if (compressed) {
2915:         PetscLogGpuTimeBegin();
2916:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
2917:            and in the destructor of its scope it will call cudaStreamSynchronize() on this stream. One would have to store all the events to
2918:            prevent that. So I just added a ScatterAdd kernel instead.
2919:          */
2920:        #if 0
2921:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2922:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2923:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2924:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2925:                          VecCUDAPlusEquals());
2926:        #else
2927:         PetscInt n = matstruct->cprowIndices->size();
2928:         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2929:        #endif
2930:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2931:         PetscLogGpuTimeEnd();
2932:       }
2933:     } else {
2934:       if (yy && yy != zz) {
2935:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2936:       }
2937:     }
2938:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2939:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2940:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
2941:   } catch(char *ex) {
2942:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2943:   }
2944:   if (yy) {
2945:     PetscLogGpuFlops(2.0*a->nz);
2946:   } else {
2947:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2948:   }
2949:   return(0);
2950: }

2952: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2953: {

2957:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2958:   return(0);
2959: }

2961: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2962: {
2963:   PetscErrorCode              ierr;
2964:   PetscSplitCSRDataStructure  *d_mat = NULL;
2966:   if (A->factortype == MAT_FACTOR_NONE) {
2967:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2968:   }
2969:   MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if the matrix was assembled on the GPU - do we need to call it?
2970:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2971:   if (d_mat) {
2972:     A->offloadmask = PETSC_OFFLOAD_GPU;
2973:   }

2975:   return(0);
2976: }

2978: /* --------------------------------------------------------------------------------*/
2979: /*@
2980:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2981:    (the default parallel PETSc format). This matrix will ultimately be pushed down
2982:    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
2983:    assembly performance the user should preallocate the matrix storage by setting
2984:    the parameter nz (or the array nnz).  By setting these parameters accurately,
2985:    performance during matrix assembly can be increased by more than a factor of 50.

2987:    Collective

2989:    Input Parameters:
2990: +  comm - MPI communicator, set to PETSC_COMM_SELF
2991: .  m - number of rows
2992: .  n - number of columns
2993: .  nz - number of nonzeros per row (same for all rows)
2994: -  nnz - array containing the number of nonzeros in the various rows
2995:          (possibly different for each row) or NULL

2997:    Output Parameter:
2998: .  A - the matrix

3000:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3001:    MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
3002:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

3004:    Notes:
3005:    If nnz is given then nz is ignored

3007:    The AIJ format (also called the Yale sparse matrix format or
3008:    compressed row storage), is fully compatible with standard Fortran 77
3009:    storage.  That is, the stored row and column indices can begin at
3010:    either one (as in Fortran) or zero.  See the users' manual for details.

3012:    Specify the preallocated storage with either nz or nnz (not both).
3013:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3014:    allocation.  For large problems you MUST preallocate memory or you
3015:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3017:    By default, this format uses inodes (identical nodes) when possible, to
3018:    improve numerical efficiency of matrix-vector products and solves. We
3019:    search for consecutive rows with the same nonzero structure, thereby
3020:    reusing matrix information to achieve increased efficiency.

3022:    Level: intermediate

3024: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3025: @*/
3026: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3027: {

3031:   MatCreate(comm,A);
3032:   MatSetSizes(*A,m,n,m,n);
3033:   MatSetType(*A,MATSEQAIJCUSPARSE);
3034:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3035:   return(0);
3036: }
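/* A minimal sketch of the recommended MatCreate()/MatSetType()/MatSeqAIJSetPreallocation()
   paradigm mentioned in the man page above (m, n, nz are placeholders):

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,nz,NULL);

   followed by the usual MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() calls */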

3038: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3039: {
3040:   PetscErrorCode              ierr;
3041:   PetscSplitCSRDataStructure  *d_mat = NULL;

3044:   if (A->factortype == MAT_FACTOR_NONE) {
3045:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3046:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3047:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3048:   } else {
3049:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3050:   }
3051:   if (d_mat) {
3052:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3053:     cudaError_t                err;
3054:     PetscSplitCSRDataStructure h_mat;
3055:     PetscInfo(A,"Have device matrix\n");
3056:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3057:     if (a->compressedrow.use) {
3058:       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3059:     }
3060:     err = cudaFree(d_mat);CHKERRCUDA(err);
3061:   }
3062:   PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3063:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3064:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3065:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3066:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3067:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3068:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3069:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3070:   MatDestroy_SeqAIJ(A);
3071:   return(0);
3072: }

3074: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3075: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3076: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3077: {

3081:   MatDuplicate_SeqAIJ(A,cpvalues,B);
3082:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3083:   return(0);
3084: }

3086: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3087: {
3088:   PetscErrorCode     ierr;
3089:   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3090:   Mat_SeqAIJCUSPARSE *cy;
3091:   Mat_SeqAIJCUSPARSE *cx;
3092:   PetscScalar        *ay;
3093:   const PetscScalar  *ax;
3094:   CsrMatrix          *csry,*csrx;
3095:   cudaError_t        cerr;

3098:   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3099:   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3100:   if (X->ops->axpy != Y->ops->axpy) {
3101:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3102:     MatAXPY_SeqAIJ(Y,a,X,str);
3103:     return(0);
3104:   }
3105:   /* if we are here, it means both matrices are bound to GPU */
3106:   MatSeqAIJCUSPARSECopyToGPU(Y);
3107:   MatSeqAIJCUSPARSECopyToGPU(X);
3108:   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3109:   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3110:   csry = (CsrMatrix*)cy->mat->mat;
3111:   csrx = (CsrMatrix*)cx->mat->mat;
3112:   /* see if we can turn this into a cublas axpy */
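  /* if X and Y have identical row offsets and column indices they share the same nonzero
     pattern, so Y += a*X reduces to an axpy on the value arrays (handled further below) */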
3113:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3114:     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3115:     if (eq) {
3116:       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3117:     }
3118:     if (eq) str = SAME_NONZERO_PATTERN;
3119:   }
3120:   /* spgeam is buggy with one column */
3121:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3123:   if (str == SUBSET_NONZERO_PATTERN) {
3124:     cusparseStatus_t stat;
3125:     PetscScalar      b = 1.0;
3126: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3127:     size_t           bufferSize;
3128:     void             *buffer;
3129: #endif

3131:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3132:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3133:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3134: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3135:     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3136:                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3137:                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3138:                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3139:     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3140:     PetscLogGpuTimeBegin();
3141:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3142:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3143:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3144:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3145:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3146:     PetscLogGpuFlops(x->nz + y->nz);
3147:     PetscLogGpuTimeEnd();
3148:     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3149: #else
3150:     PetscLogGpuTimeBegin();
3151:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3152:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3153:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3154:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3155:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3156:     PetscLogGpuFlops(x->nz + y->nz);
3157:     PetscLogGpuTimeEnd();
3158: #endif
3159:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3160:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3161:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3162:     MatSeqAIJInvalidateDiagonal(Y);
3163:   } else if (str == SAME_NONZERO_PATTERN) {
3164:     cublasHandle_t cublasv2handle;
3165:     cublasStatus_t berr;
3166:     PetscBLASInt   one = 1, bnz = 1;

3168:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3169:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3170:     PetscCUBLASGetHandle(&cublasv2handle);
3171:     PetscBLASIntCast(x->nz,&bnz);
3172:     PetscLogGpuTimeBegin();
3173:     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3174:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3175:     PetscLogGpuFlops(2.0*bnz);
3176:     PetscLogGpuTimeEnd();
3177:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3178:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3179:     MatSeqAIJInvalidateDiagonal(Y);
3180:   } else {
3181:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3182:     MatAXPY_SeqAIJ(Y,a,X,str);
3183:   }
3184:   return(0);
3185: }

3187: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3188: {
3190:   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3191:   PetscScalar    *ay;
3192:   cudaError_t    cerr;
3193:   cublasHandle_t cublasv2handle;
3194:   cublasStatus_t berr;
3195:   PetscBLASInt   one = 1, bnz = 1;

3198:   MatSeqAIJCUSPARSEGetArray(Y,&ay);
3199:   PetscCUBLASGetHandle(&cublasv2handle);
3200:   PetscBLASIntCast(y->nz,&bnz);
3201:   PetscLogGpuTimeBegin();
3202:   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3203:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3204:   PetscLogGpuFlops(bnz);
3205:   PetscLogGpuTimeEnd();
3206:   MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3207:   MatSeqAIJInvalidateDiagonal(Y);
3208:   return(0);
3209: }

3211: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3212: {
3213:   PetscErrorCode             ierr;
3214:   PetscBool                  both = PETSC_FALSE;
3215:   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

3218:   if (A->factortype == MAT_FACTOR_NONE) {
3219:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3220:     if (spptr->mat) {
3221:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3222:       if (matrix->values) {
3223:         both = PETSC_TRUE;
3224:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3225:       }
3226:     }
3227:     if (spptr->matTranspose) {
3228:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3229:       if (matrix->values) {
3230:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3231:       }
3232:     }
3233:   }
3234:   //MatZeroEntries_SeqAIJ(A);
3235:   PetscArrayzero(a->a,a->i[A->rmap->n]);
3236:   MatSeqAIJInvalidateDiagonal(A);
3237:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3238:   else A->offloadmask = PETSC_OFFLOAD_CPU;

3240:   return(0);
3241: }

3243: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3244: {
3245:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3249:   if (A->factortype != MAT_FACTOR_NONE) return(0);
3250:   if (flg) {
3251:     MatSeqAIJCUSPARSECopyFromGPU(A);

3253:     A->ops->scale                     = MatScale_SeqAIJ;
3254:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3255:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3256:     A->ops->mult                      = MatMult_SeqAIJ;
3257:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3258:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3259:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3260:     A->ops->multhermitiantranspose    = NULL;
3261:     A->ops->multhermitiantransposeadd = NULL;
3262:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3263:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3264:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3265:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3266:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3267:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3268:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3269:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3270:   } else {
3271:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3272:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3273:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3274:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3275:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3276:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3277:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3278:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3279:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3280:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3281:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3282:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3283:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3284:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3285:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3286:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3287:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3288:   }
3289:   A->boundtocpu = flg;
3290:   a->inode.use = flg;
3291:   return(0);
3292: }
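/* Usage sketch (illustrative only): MatBindToCPU(A,PETSC_TRUE) invokes the
   routine above to swap the operation table back to the host SeqAIJ kernels,
   while MatBindToCPU(A,PETSC_FALSE) restores the CUSPARSE implementations. */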

3294: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3295: {
3296:   PetscErrorCode   ierr;
3297:   cusparseStatus_t stat;
3298:   Mat              B;

3301:   PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3302:   if (reuse == MAT_INITIAL_MATRIX) {
3303:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
3304:   } else if (reuse == MAT_REUSE_MATRIX) {
3305:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3306:   }
3307:   B = *newmat;

3309:   PetscFree(B->defaultvectype);
3310:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

3312:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3313:     if (B->factortype == MAT_FACTOR_NONE) {
3314:       Mat_SeqAIJCUSPARSE *spptr;
3315:       PetscNew(&spptr);
3316:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3317:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3318:       spptr->format     = MAT_CUSPARSE_CSR;
3319:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3320:       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3321:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default; only a column-major dense matrix B is supported */
3322:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3323:      #endif
3324:       B->spptr = spptr;
3325:     } else {
3326:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3328:       PetscNew(&spptr);
3329:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3330:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3331:       B->spptr = spptr;
3332:     }
3333:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3334:   }
3335:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3336:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3337:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3338:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3339:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3340:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3342:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3343:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3344:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3345:   return(0);
3346: }

3348: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3349: {

3353:   MatCreate_SeqAIJ(B);
3354:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3355:   return(0);
3356: }

3358: /*MC
3359:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3361:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3362:    CSR, ELL, or hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported since CUDA 11.0.
3363:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

3365:    Options Database Keys:
3366: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3367: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3368: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

3370:   Level: beginner

3372: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3373: M*/
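/* Usage sketch (illustrative only; m and n are placeholder sizes):

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);                             // or -mat_type aijcusparse via MatSetFromOptions()
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_CSR);  // or -mat_cusparse_mult_storage_format csr
*/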

3375: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

3377: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3378: {

3382:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3383:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3384:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3385:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3386:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);

3388:   return(0);
3389: }
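/* Usage sketch (illustrative only): the registered factorizations are selected
   like any other solver package, e.g. from the command line

     -pc_type ilu -pc_factor_mat_solver_type cusparse

   or in code via PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). */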

3391: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3392: {
3393:   PetscErrorCode   ierr;
3394:   cusparseStatus_t stat;

3397:   if (*cusparsestruct) {
3398:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3399:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3400:     delete (*cusparsestruct)->workVector;
3401:     delete (*cusparsestruct)->rowoffsets_gpu;
3402:     delete (*cusparsestruct)->cooPerm;
3403:     delete (*cusparsestruct)->cooPerm_a;
3404:     delete (*cusparsestruct)->csr2csc_i;
3405:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3406:     PetscFree(*cusparsestruct);
3407:   }
3408:   return(0);
3409: }

3411: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3412: {
3414:   if (*mat) {
3415:     delete (*mat)->values;
3416:     delete (*mat)->column_indices;
3417:     delete (*mat)->row_offsets;
3418:     delete *mat;
3419:     *mat = NULL;
3420:   }
3421:   return(0);
3422: }

3424: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3425: {
3426:   cusparseStatus_t stat;
3427:   PetscErrorCode   ierr;

3430:   if (*trifactor) {
3431:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3432:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3433:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3434:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3435:     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3436:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3437:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3438:    #endif
3439:     PetscFree(*trifactor);
3440:   }
3441:   return(0);
3442: }

3444: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3445: {
3446:   CsrMatrix        *mat;
3447:   cusparseStatus_t stat;
3448:   cudaError_t      err;

3451:   if (*matstruct) {
3452:     if ((*matstruct)->mat) {
3453:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3454:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3455:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3456:        #else
3457:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3458:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3459:        #endif
3460:       } else {
3461:         mat = (CsrMatrix*)(*matstruct)->mat;
3462:         CsrMatrix_Destroy(&mat);
3463:       }
3464:     }
3465:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3466:     delete (*matstruct)->cprowIndices;
3467:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3468:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3469:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

3471:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3472:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3473:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3474:     for (int i=0; i<3; i++) {
3475:       if (mdata->cuSpMV[i].initialized) {
3476:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3477:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3478:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3479:       }
3480:     }
3481:    #endif
3482:     delete *matstruct;
3483:     *matstruct = NULL;
3484:   }
3485:   return(0);
3486: }

3488: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3489: {

3493:   if (*trifactors) {
3494:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3495:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3496:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3497:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3498:     delete (*trifactors)->rpermIndices;
3499:     delete (*trifactors)->cpermIndices;
3500:     delete (*trifactors)->workVector;
3501:     (*trifactors)->rpermIndices = NULL;
3502:     (*trifactors)->cpermIndices = NULL;
3503:     (*trifactors)->workVector = NULL;
3504:     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3505:     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3506:     (*trifactors)->init_dev_prop = PETSC_FALSE;
3507:   }
3508:   return(0);
3509: }

3511: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3512: {
3513:   PetscErrorCode   ierr;
3514:   cusparseHandle_t handle;
3515:   cusparseStatus_t stat;

3518:   if (*trifactors) {
3519:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3520:     if ((handle = (*trifactors)->handle)) { /* assignment intended */
3521:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3522:     }
3523:     PetscFree(*trifactors);
3524:   }
3525:   return(0);
3526: }

3528: struct IJCompare
3529: {
3530:   __host__ __device__
3531:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3532:   {
3533:     if (t1.get<0>() < t2.get<0>()) return true;
3534:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3535:     return false;
3536:   }
3537: };

3539: struct IJEqual
3540: {
3541:   __host__ __device__
3542:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3543:   {
3544:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3545:     return true;
3546:   }
3547: };

3549: struct IJDiff
3550: {
3551:   __host__ __device__
3552:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3553:   {
3554:     return t1 == t2 ? 0 : 1;
3555:   }
3556: };

3558: struct IJSum
3559: {
3560:   __host__ __device__
3561:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3562:   {
3563:     return t1||t2;
3564:   }
3565: };

3567: #include <thrust/iterator/discard_iterator.h>
3568: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3569: {
3570:   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3571:   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3572:   THRUSTARRAY                           *cooPerm_v = NULL;
3573:   thrust::device_ptr<const PetscScalar> d_v;
3574:   CsrMatrix                             *matrix;
3575:   PetscErrorCode                        ierr;
3576:   cudaError_t                           cerr;
3577:   PetscInt                              n;

3580:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3581:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3582:   if (!cusp->cooPerm) {
3583:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3584:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3585:     return(0);
3586:   }
3587:   matrix = (CsrMatrix*)cusp->mat->mat;
3588:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3589:   if (!v) {
3590:     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3591:     goto finalize;
3592:   }
3593:   n = cusp->cooPerm->size();
3594:   if (isCudaMem(v)) {
3595:     d_v = thrust::device_pointer_cast(v);
3596:   } else {
3597:     cooPerm_v = new THRUSTARRAY(n);
3598:     cooPerm_v->assign(v,v+n);
3599:     d_v = cooPerm_v->data();
3600:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
3601:   }
3602:   PetscLogGpuTimeBegin();
3603:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3604:     if (cusp->cooPerm_a) {
3605:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3606:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3607:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3608:       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3609:       delete cooPerm_w;
3610:     } else {
3611:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3612:                                                                 matrix->values->begin()));
3613:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3614:                                                                 matrix->values->end()));
3615:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3616:     }
3617:   } else {
3618:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3619:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3620:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3621:     } else {
3622:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3623:                                                                 matrix->values->begin()));
3624:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3625:                                                                 matrix->values->end()));
3626:       thrust::for_each(zibit,zieit,VecCUDAEquals());
3627:     }
3628:   }
3629:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3630:   PetscLogGpuTimeEnd();
3631: finalize:
3632:   delete cooPerm_v;
3633:   A->offloadmask = PETSC_OFFLOAD_GPU;
3634:   PetscObjectStateIncrease((PetscObject)A);
3635:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3636:   PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3637:   PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3638:   PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3639:   a->reallocs         = 0;
3640:   A->info.mallocs    += 0;
3641:   A->info.nz_unneeded = 0;
3642:   A->assembled = A->was_assembled = PETSC_TRUE;
3643:   A->num_ass++;
3644:   return(0);
3645: }

3647: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3648: {
3649:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3650:   PetscErrorCode     ierr;

3654:   if (!cusp) return(0);
3655:   if (destroy) {
3656:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3657:     delete cusp->csr2csc_i;
3658:     cusp->csr2csc_i = NULL;
3659:   }
3660:   A->transupdated = PETSC_FALSE;
3661:   return(0);
3662: }

3664: #include <thrust/binary_search.h>
3665: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3666: {
3667:   PetscErrorCode     ierr;
3668:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3669:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3670:   PetscInt           cooPerm_n, nzr = 0;
3671:   cudaError_t        cerr;

3674:   PetscLayoutSetUp(A->rmap);
3675:   PetscLayoutSetUp(A->cmap);
3676:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3677:   if (n != cooPerm_n) {
3678:     delete cusp->cooPerm;
3679:     delete cusp->cooPerm_a;
3680:     cusp->cooPerm = NULL;
3681:     cusp->cooPerm_a = NULL;
3682:   }
3683:   if (n) {
3684:     THRUSTINTARRAY d_i(n);
3685:     THRUSTINTARRAY d_j(n);
3686:     THRUSTINTARRAY ii(A->rmap->n);

3688:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3689:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

3691:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3692:     d_i.assign(coo_i,coo_i+n);
3693:     d_j.assign(coo_j,coo_j+n);
3694:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3695:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3697:     PetscLogGpuTimeBegin();
3698:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3699:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3700:     *cusp->cooPerm_a = d_i;
3701:     THRUSTINTARRAY w = d_j;

3703:     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3704:     if (nekey == ekey) { /* all entries are unique */
3705:       delete cusp->cooPerm_a;
3706:       cusp->cooPerm_a = NULL;
3707:     } else { /* duplicate (i,j) entries exist: build reduction keys; see the worked example below */
3708:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3709:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3710:       (*cusp->cooPerm_a)[0] = 0;
3711:       w[0] = 0;
3712:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3713:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3714:     }
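    /* Worked example of the reduction-key construction above (illustrative): for
       sorted entries (i,j) = (0,0),(0,1),(0,1),(2,3)

         i = 0,0,0,2  --adjacent_difference/IJDiff-->  0,0,0,1  (first entry reset to 0)
         j = 0,1,1,3  --adjacent_difference/IJDiff-->  0,1,0,1
         IJSum (logical or):                           0,1,0,1
         inclusive_scan:                               0,1,1,2

       Equal keys identify duplicate (i,j) pairs; MatSetValuesCOO_SeqAIJCUSPARSE
       passes cooPerm_a to thrust::reduce_by_key to sum the repeated values. */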
3715:     thrust::counting_iterator<PetscInt> search_begin(0);
3716:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3717:                         search_begin, search_begin + A->rmap->n,
3718:                         ii.begin());
3719:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3720:     PetscLogGpuTimeEnd();

3722:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3723:     a->singlemalloc = PETSC_FALSE;
3724:     a->free_a       = PETSC_TRUE;
3725:     a->free_ij      = PETSC_TRUE;
3726:     PetscMalloc1(A->rmap->n+1,&a->i);
3727:     a->i[0] = 0;
3728:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3729:     a->nz = a->maxnz = a->i[A->rmap->n];
3730:     a->rmax = 0;
3731:     PetscMalloc1(a->nz,&a->a);
3732:     PetscMalloc1(a->nz,&a->j);
3733:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3734:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3735:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3736:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3737:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3738:       nzr += (PetscInt)!!(nnzr);
3739:       a->ilen[i] = a->imax[i] = nnzr;
3740:       a->rmax = PetscMax(a->rmax,nnzr);
3741:     }
3742:     a->nonzerorowcnt = nzr;
3743:     A->preallocated = PETSC_TRUE;
3744:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3745:     MatMarkDiagonal_SeqAIJ(A);
3746:   } else {
3747:     MatSeqAIJSetPreallocation(A,0,NULL);
3748:   }
3749:   MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);

3751:   /* We want to allocate the CUSPARSE struct for matvec now.
3752:      The code is so convoluted now that I prefer to copy zeros */
3753:   PetscArrayzero(a->a,a->nz);
3754:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3755:   A->offloadmask = PETSC_OFFLOAD_CPU;
3756:   A->nonzerostate++;
3757:   MatSeqAIJCUSPARSECopyToGPU(A);
3758:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);

3760:   A->assembled = PETSC_FALSE;
3761:   A->was_assembled = PETSC_FALSE;
3762:   return(0);
3763: }
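/* Usage sketch for the COO assembly path (illustrative; the arrays are
   placeholders): repeated (i,j) pairs are legal and are summed on the device.

     PetscInt    coo_i[] = {0,0,1};
     PetscInt    coo_j[] = {0,0,2};
     PetscScalar v[]     = {1.0,2.0,3.0};
     MatSetPreallocationCOO(A,3,coo_i,coo_j);  // builds the pattern and cooPerm above
     MatSetValuesCOO(A,v,INSERT_VALUES);       // entry (0,0) receives 1.0+2.0 = 3.0
*/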

3765: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3766: {
3767:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3768:   CsrMatrix          *csr;
3769:   PetscErrorCode     ierr;

3775:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3776:   MatSeqAIJCUSPARSECopyToGPU(A);
3777:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3778:   csr = (CsrMatrix*)cusp->mat->mat;
3779:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3780:   *a = csr->values->data().get();
3781:   return(0);
3782: }

3784: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3785: {
3790:   *a = NULL;
3791:   return(0);
3792: }

3794: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3795: {
3796:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3797:   CsrMatrix          *csr;
3798:   PetscErrorCode     ierr;

3804:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3805:   MatSeqAIJCUSPARSECopyToGPU(A);
3806:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3807:   csr = (CsrMatrix*)cusp->mat->mat;
3808:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3809:   *a = csr->values->data().get();
3810:   A->offloadmask = PETSC_OFFLOAD_GPU;
3811:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3812:   return(0);
3813: }

3815: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3816: {

3823:   PetscObjectStateIncrease((PetscObject)A);
3824:   *a = NULL;
3825:   return(0);
3826: }

3828: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3829: {
3830:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3831:   CsrMatrix          *csr;
3832:   PetscErrorCode     ierr;

3838:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3839:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3840:   csr = (CsrMatrix*)cusp->mat->mat;
3841:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3842:   *a = csr->values->data().get();
3843:   A->offloadmask = PETSC_OFFLOAD_GPU;
3844:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3845:   return(0);
3846: }

3848: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3849: {

3856:   PetscObjectStateIncrease((PetscObject)A);
3857:   *a = NULL;
3858:   return(0);
3859: }
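/* Usage sketch (illustrative only): the three access modes follow the usual
   PETSc get/restore pattern for the device value array.

     const PetscScalar *ro;
     PetscScalar       *rw;
     MatSeqAIJCUSPARSEGetArrayRead(A,&ro);     // read-only: no state change
     MatSeqAIJCUSPARSERestoreArrayRead(A,&ro);
     MatSeqAIJCUSPARSEGetArray(A,&rw);         // read-write: invalidates the cached transpose
     MatSeqAIJCUSPARSERestoreArray(A,&rw);     // bumps the object state

   GetArrayWrite additionally skips the host-to-device copy, since the values
   are about to be overwritten. */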

3861: struct IJCompare4
3862: {
3863:   __host__ __device__
3864:   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3865:   {
3866:     if (t1.get<0>() < t2.get<0>()) return true;
3867:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3868:     return false;
3869:   }
3870: };

3872: struct Shift
3873: {
3874:   int _shift;

3876:   Shift(int shift) : _shift(shift) {}
3877:   __host__ __device__
3878:   inline int operator() (const int &c)
3879:   {
3880:     return c + _shift;
3881:   }
3882: };

3884: /* merges two SeqAIJCUSPARSE matrices side by side, C = [A,B]: the [A';B']' operation in MATLAB notation */
3885: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3886: {
3887:   PetscErrorCode               ierr;
3888:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3889:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3890:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3891:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3892:   PetscInt                     Annz,Bnnz;
3893:   cusparseStatus_t             stat;
3894:   PetscInt                     i,m,n,zero = 0;
3895:   cudaError_t                  cerr;

3903:   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3904:   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3905:   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3906:   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3907:   if (reuse == MAT_INITIAL_MATRIX) {
3908:     m     = A->rmap->n;
3909:     n     = A->cmap->n + B->cmap->n;
3910:     MatCreate(PETSC_COMM_SELF,C);
3911:     MatSetSizes(*C,m,n,m,n);
3912:     MatSetType(*C,MATSEQAIJCUSPARSE);
3913:     c     = (Mat_SeqAIJ*)(*C)->data;
3914:     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3915:     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3916:     Ccsr  = new CsrMatrix;
3917:     Cmat->cprowIndices      = NULL;
3918:     c->compressedrow.use    = PETSC_FALSE;
3919:     c->compressedrow.nrows  = 0;
3920:     c->compressedrow.i      = NULL;
3921:     c->compressedrow.rindex = NULL;
3922:     Ccusp->workVector       = NULL;
3923:     Ccusp->nrows    = m;
3924:     Ccusp->mat      = Cmat;
3925:     Ccusp->mat->mat = Ccsr;
3926:     Ccsr->num_rows  = m;
3927:     Ccsr->num_cols  = n;
3928:     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3929:     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3930:     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3931:     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3932:     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3933:     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3934:     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3935:     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936:     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3937:     MatSeqAIJCUSPARSECopyToGPU(A);
3938:     MatSeqAIJCUSPARSECopyToGPU(B);
3939:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
3940:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
3941:     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3942:     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

3944:     Acsr = (CsrMatrix*)Acusp->mat->mat;
3945:     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3946:     Annz = (PetscInt)Acsr->column_indices->size();
3947:     Bnnz = (PetscInt)Bcsr->column_indices->size();
3948:     c->nz = Annz + Bnnz;
3949:     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3950:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3951:     Ccsr->values = new THRUSTARRAY(c->nz);
3952:     Ccsr->num_entries = c->nz;
3953:     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3954:     if (c->nz) {
3955:       auto Acoo = new THRUSTINTARRAY32(Annz);
3956:       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3957:       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3958:       THRUSTINTARRAY32 *Aroff,*Broff;

3960:       if (a->compressedrow.use) { /* need full row offset */
3961:         if (!Acusp->rowoffsets_gpu) {
3962:           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3963:           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3964:           PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
3965:         }
3966:         Aroff = Acusp->rowoffsets_gpu;
3967:       } else Aroff = Acsr->row_offsets;
3968:       if (b->compressedrow.use) { /* need full row offset */
3969:         if (!Bcusp->rowoffsets_gpu) {
3970:           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3971:           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3972:           PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
3973:         }
3974:         Broff = Bcusp->rowoffsets_gpu;
3975:       } else Broff = Bcsr->row_offsets;
3976:       PetscLogGpuTimeBegin();
3977:       stat = cusparseXcsr2coo(Acusp->handle,
3978:                               Aroff->data().get(),
3979:                               Annz,
3980:                               m,
3981:                               Acoo->data().get(),
3982:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3983:       stat = cusparseXcsr2coo(Bcusp->handle,
3984:                               Broff->data().get(),
3985:                               Bnnz,
3986:                               m,
3987:                               Bcoo->data().get(),
3988:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3989:       /* use int (1/0) markers: there were issues using bool with large matrices on SUMMIT with CUDA 10.2.89 */
3990:       auto Aperm = thrust::make_constant_iterator(1);
3991:       auto Bperm = thrust::make_constant_iterator(0);
3992: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3993:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3994:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3995: #else
3996:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3997:       auto Bcib = Bcsr->column_indices->begin();
3998:       auto Bcie = Bcsr->column_indices->end();
3999:       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4000: #endif
4001:       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4002:       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4003:       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4004:       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4005:       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4006:       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4007:       auto p1 = Ccusp->cooPerm->begin();
4008:       auto p2 = Ccusp->cooPerm->begin();
4009:       thrust::advance(p2,Annz);
4010:       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4011: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4012:       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4013: #endif
4014:       auto cci = thrust::make_counting_iterator(zero);
4015:       auto cce = thrust::make_counting_iterator(c->nz);
4016: #if 0 /* thrust::partition_copy errors on SUMMIT with CUDA 11.1.0; use the copy_if/remove_copy_if fallback below */
4017:       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4018: #else
4019:       auto pred = thrust::identity<int>();
4020:       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4021:       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4022: #endif
4023:       stat = cusparseXcoo2csr(Ccusp->handle,
4024:                               Ccoo->data().get(),
4025:                               c->nz,
4026:                               m,
4027:                               Ccsr->row_offsets->data().get(),
4028:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4029:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4030:       PetscLogGpuTimeEnd();
4031:       delete wPerm;
4032:       delete Acoo;
4033:       delete Bcoo;
4034:       delete Ccoo;
4035: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4036:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4037:                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4038:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4039:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4040: #endif
4041:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4042:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4043:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4044:         CsrMatrix *CcsrT = new CsrMatrix;
4045:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4046:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

4048:         (*C)->form_explicit_transpose = PETSC_TRUE;
4049:         (*C)->transupdated = PETSC_TRUE;
4050:         Ccusp->rowoffsets_gpu = NULL;
4051:         CmatT->cprowIndices = NULL;
4052:         CmatT->mat = CcsrT;
4053:         CcsrT->num_rows = n;
4054:         CcsrT->num_cols = m;
4055:         CcsrT->num_entries = c->nz;

4057:         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4058:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4059:         CcsrT->values = new THRUSTARRAY(c->nz);

4061:         PetscLogGpuTimeBegin();
4062:         auto rT = CcsrT->row_offsets->begin();
4063:         if (AT) {
4064:           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4065:           thrust::advance(rT,-1);
4066:         }
4067:         if (BT) {
4068:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4069:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4070:           thrust::copy(titb,tite,rT);
4071:         }
4072:         auto cT = CcsrT->column_indices->begin();
4073:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4074:         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4075:         auto vT = CcsrT->values->begin();
4076:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4077:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4078:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4079:         PetscLogGpuTimeEnd();

4081:         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4082:         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4083:         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4084:         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4085:         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4086:         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4087:         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4088:         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089:         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4090: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4091:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4092:                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4093:                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4094:                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4095: #endif
4096:         Ccusp->matTranspose = CmatT;
4097:       }
4098:     }

4100:     c->singlemalloc = PETSC_FALSE;
4101:     c->free_a       = PETSC_TRUE;
4102:     c->free_ij      = PETSC_TRUE;
4103:     PetscMalloc1(m+1,&c->i);
4104:     PetscMalloc1(c->nz,&c->j);
4105:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4106:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4107:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4108:       ii   = *Ccsr->row_offsets;
4109:       jj   = *Ccsr->column_indices;
4110:       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4111:       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4112:     } else {
4113:       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4114:       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4115:     }
4116:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4117:     PetscMalloc1(m,&c->ilen);
4118:     PetscMalloc1(m,&c->imax);
4119:     c->maxnz = c->nz;
4120:     c->nonzerorowcnt = 0;
4121:     c->rmax = 0;
4122:     for (i = 0; i < m; i++) {
4123:       const PetscInt nn = c->i[i+1] - c->i[i];
4124:       c->ilen[i] = c->imax[i] = nn;
4125:       c->nonzerorowcnt += (PetscInt)!!nn;
4126:       c->rmax = PetscMax(c->rmax,nn);
4127:     }
4128:     MatMarkDiagonal_SeqAIJ(*C);
4129:     PetscMalloc1(c->nz,&c->a);
4130:     (*C)->nonzerostate++;
4131:     PetscLayoutSetUp((*C)->rmap);
4132:     PetscLayoutSetUp((*C)->cmap);
4133:     Ccusp->nonzerostate = (*C)->nonzerostate;
4134:     (*C)->preallocated  = PETSC_TRUE;
4135:   } else {
4136:     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4137:     c = (Mat_SeqAIJ*)(*C)->data;
4138:     if (c->nz) {
4139:       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4140:       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4141:       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4142:       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4143:       MatSeqAIJCUSPARSECopyToGPU(A);
4144:       MatSeqAIJCUSPARSECopyToGPU(B);
4145:       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4146:       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4147:       Acsr = (CsrMatrix*)Acusp->mat->mat;
4148:       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4149:       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4150:       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4151:       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4152:       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4153:       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4154:       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4155:       auto pmid = Ccusp->cooPerm->begin();
4156:       thrust::advance(pmid,Acsr->num_entries);
4157:       PetscLogGpuTimeBegin();
4158:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4159:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4160:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4161:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4162:       thrust::for_each(zibait,zieait,VecCUDAEquals());
4163:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4164:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4165:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4166:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4167:       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4168:       MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4169:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4170:         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4171:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4172:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4173:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4174:         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4175:         auto vT = CcsrT->values->begin();
4176:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4177:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4178:         (*C)->transupdated = PETSC_TRUE;
4179:       }
4180:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4181:       PetscLogGpuTimeEnd();
4182:     }
4183:   }
4184:   PetscObjectStateIncrease((PetscObject)*C);
4185:   (*C)->assembled     = PETSC_TRUE;
4186:   (*C)->was_assembled = PETSC_FALSE;
4187:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4188:   return(0);
4189: }

4191: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4192: {
4193:   PetscErrorCode    ierr;
4194:   bool              dmem;
4195:   const PetscScalar *av;
4196:   cudaError_t       cerr;

4199:   dmem = isCudaMem(v);
4200:   MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4201:   if (n && idx) {
4202:     THRUSTINTARRAY widx(n);
4203:     widx.assign(idx,idx+n);
4204:     PetscLogCpuToGpu(n*sizeof(PetscInt));

4206:     THRUSTARRAY *w = NULL;
4207:     thrust::device_ptr<PetscScalar> dv;
4208:     if (dmem) {
4209:       dv = thrust::device_pointer_cast(v);
4210:     } else {
4211:       w = new THRUSTARRAY(n);
4212:       dv = w->data();
4213:     }
4214:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4216:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4217:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4218:     thrust::for_each(zibit,zieit,VecCUDAEquals());
4219:     if (w) {
4220:       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4221:     }
4222:     delete w;
4223:   } else {
4224:     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4225:   }
4226:   if (!dmem) { PetscLogCpuToGpu(n*sizeof(PetscScalar)); }
4227:   MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4228:   return(0);
4229: }