Actual source code: aijcusparse.cu

petsc-3.15.0 2021-04-05
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
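/*
  For orientation, a minimal sketch (hypothetical 3x3 matrix) of the CSR layout
  that the AIJ format stores and that this file hands to CUSPARSE:

      | 1 0 2 |        row offsets    i[] = {0, 2, 3, 5}    (length n+1)
      | 0 3 0 |  <-->  column indices j[] = {0, 2, 1, 0, 2}
      | 4 0 5 |        values         a[] = {1, 2, 3, 4, 5}

  Row r occupies positions i[r] .. i[r+1]-1 of j[] and a[].
*/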
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 16: #include <thrust/async/for_each.h>
 17: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 18: #include <cooperative_groups.h>
 19: #endif
 20: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 21: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 22:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
 23:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

 25:   typedef enum {
 26:       CUSPARSE_MV_ALG_DEFAULT = 0,
 27:       CUSPARSE_COOMV_ALG      = 1,
 28:       CUSPARSE_CSRMV_ALG1     = 2,
 29:       CUSPARSE_CSRMV_ALG2     = 3
 30:   } cusparseSpMVAlg_t;

 32:   typedef enum {
 33:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 34:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 35:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 36:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 37:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 38:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 39:       CUSPARSE_SPMM_COO_ALG1    = 1,
 40:       CUSPARSE_SPMM_COO_ALG2    = 2,
 41:       CUSPARSE_SPMM_COO_ALG3    = 3,
 42:       CUSPARSE_SPMM_COO_ALG4    = 5,
 43:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 44:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 45:   } cusparseSpMMAlg_t;

 47:   typedef enum {
 48:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 49:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 50:   } cusparseCsr2CscAlg_t;
 51:   */
 52:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 53:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 54:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
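  /* A usage sketch of the correspondence these arrays encode: PetscOptionsEnum()
     maps a user string to its 0-based position in the array, so running with, e.g.,

         -mat_cusparse_spmv_alg csrmv_alg1

     picks position 2 of MatCUSPARSESpMVAlgorithms[], which must coincide with the
     integer value of CUSPARSE_CSRMV_ALG1 (= 2). The checks in
     MatSetFromOptions_SeqAIJCUSPARSE() below guard exactly this correspondence. */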
 55: #endif

 57: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 58: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 61: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
 62: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
 63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
 73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
 74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 89: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
 90: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 91: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

 93: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 94: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 96: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

 98: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 99: {
100:   cusparseStatus_t   stat;
101:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

104:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105:   cusparsestruct->stream = stream;
106:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107:   return(0);
108: }

110: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111: {
112:   cusparseStatus_t   stat;
113:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

116:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117:   if (cusparsestruct->handle != handle) {
118:     if (cusparsestruct->handle) {
119:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120:     }
121:     cusparsestruct->handle = handle;
122:   }
123:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124:   return(0);
125: }

127: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128: {
129:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130:   PetscBool          flg;
131:   PetscErrorCode     ierr;

134:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
135:   if (!flg || !cusparsestruct) return(0);
136:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
137:   return(0);
138: }

140: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141: {
143:   *type = MATSOLVERCUSPARSE;
144:   return(0);
145: }

147: /*MC
148:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
149:   of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. The currently supported
150:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
152:   CUSPARSE triangular solve algorithm. However, their performance can be quite poor, so these
153:   algorithms are not recommended. This class does NOT support direct solver operations.

155:   Level: beginner

157: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: M*/
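/*
  A typical usage sketch (options assumed from standard PETSc usage): select this
  solver type from the command line with

      -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

  or programmatically with PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE).
*/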

160: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161: {
163:   PetscInt       n = A->rmap->n;

166:   MatCreate(PetscObjectComm((PetscObject)A),B);
167:   MatSetSizes(*B,n,n,n,n);
168:   (*B)->factortype = ftype;
169:   (*B)->useordering = PETSC_TRUE;
170:   MatSetType(*B,MATSEQAIJCUSPARSE);

172:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
173:     MatSetBlockSizesFromMats(*B,A,A);
174:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
175:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
176:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
177:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
178:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
179:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

181:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
182:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
183:   return(0);
184: }

186: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187: {
188:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

191:   switch (op) {
192:   case MAT_CUSPARSE_MULT:
193:     cusparsestruct->format = format;
194:     break;
195:   case MAT_CUSPARSE_ALL:
196:     cusparsestruct->format = format;
197:     break;
198:   default:
199:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200:   }
201:   return(0);
202: }

204: /*@
205:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206:    operation. Only the MatMult operation can use different GPU storage formats
207:    for MPIAIJCUSPARSE matrices.
208:    Not Collective

210:    Input Parameters:
211: +  A - Matrix of type SEQAIJCUSPARSE
212: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

217:    Level: intermediate

219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222: {

227:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
228:   return(0);
229: }
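/*
  A minimal call sketch, assuming a Mat A of type MATSEQAIJCUSPARSE has already been
  created and assembled:

      MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

  after which MatMult() on A uses ELL storage; the equivalent command-line option is
  -mat_cusparse_mult_storage_format ell.
*/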

231: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232: {

236:   switch (op) {
237:     case MAT_FORM_EXPLICIT_TRANSPOSE:
238:       /* destroy the transpose matrix if present, to prevent logic errors if flg is set to true again later */
239:       if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
240:       A->form_explicit_transpose = flg;
241:       break;
242:     default:
243:       MatSetOption_SeqAIJ(A,op,flg);
244:       break;
245:   }
246:   return(0);
247: }

249: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

251: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252: {
253:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
254:   IS             isrow = b->row,iscol = b->col;
255:   PetscBool      row_identity,col_identity;

259:   MatSeqAIJCUSPARSECopyFromGPU(A);
260:   MatLUFactorNumeric_SeqAIJ(B,A,info);
261:   B->offloadmask = PETSC_OFFLOAD_CPU;
262:   /* determine which version of MatSolve needs to be used. */
263:   ISIdentity(isrow,&row_identity);
264:   ISIdentity(iscol,&col_identity);
265:   if (row_identity && col_identity) {
266:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268:     B->ops->matsolve = NULL;
269:     B->ops->matsolvetranspose = NULL;
270:   } else {
271:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273:     B->ops->matsolve = NULL;
274:     B->ops->matsolvetranspose = NULL;
275:   }

277:   /* get the triangular factors */
278:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
279:   return(0);
280: }

282: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283: {
284:   PetscErrorCode           ierr;
285:   MatCUSPARSEStorageFormat format;
286:   PetscBool                flg;
287:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

290:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
291:   if (A->factortype == MAT_FACTOR_NONE) {
292:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
294:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

296:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
298:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
299:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

305:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
307:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

309:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
311:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312:    #endif
313:   }
314:   PetscOptionsTail();
315:   return(0);
316: }

318: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319: {
320:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321:   PetscErrorCode               ierr;

324:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
325:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
326:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327:   return(0);
328: }

330: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331: {
332:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333:   PetscErrorCode               ierr;

336:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
337:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
338:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339:   return(0);
340: }

342: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343: {
344:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345:   PetscErrorCode               ierr;

348:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
349:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
350:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351:   return(0);
352: }

354: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355: {
356:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357:   PetscErrorCode               ierr;

360:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
361:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
362:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363:   return(0);
364: }

366: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367: {
368:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
369:   PetscInt                          n = A->rmap->n;
370:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372:   cusparseStatus_t                  stat;
373:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
374:   const MatScalar                   *aa = a->a,*v;
375:   PetscInt                          *AiLo, *AjLo;
376:   PetscInt                          i,nz, nzLower, offset, rowOffset;
377:   PetscErrorCode                    ierr;
378:   cudaError_t                       cerr;

381:   if (!n) return(0);
382:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383:     try {
384:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385:       nzLower=n+ai[n]-ai[1];
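      /* i.e. n unit entries on the diagonal plus the strictly lower triangular entries
         of rows 1..n-1 stored in ai[]/aj[] (row 0 of L has no strictly lower entries,
         hence the count starts at ai[1]) */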
386:       if (!loTriFactor) {
387:         PetscScalar                       *AALo;

389:         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

391:         /* Allocate Space for the lower triangular matrix */
392:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

395:         /* Fill the lower triangular matrix */
396:         AiLo[0]  = (PetscInt) 0;
397:         AiLo[n]  = nzLower;
398:         AjLo[0]  = (PetscInt) 0;
399:         AALo[0]  = (MatScalar) 1.0;
400:         v        = aa;
401:         vi       = aj;
402:         offset   = 1;
403:         rowOffset= 1;
404:         for (i=1; i<n; i++) {
405:           nz = ai[i+1] - ai[i];
406:           /* additional 1 for the term on the diagonal */
407:           AiLo[i]    = rowOffset;
408:           rowOffset += nz+1;

410:           PetscArraycpy(&(AjLo[offset]), vi, nz);
411:           PetscArraycpy(&(AALo[offset]), v, nz);

413:           offset      += nz;
414:           AjLo[offset] = (PetscInt) i;
415:           AALo[offset] = (MatScalar) 1.0;
416:           offset      += 1;

418:           v  += nz;
419:           vi += nz;
420:         }

422:         /* allocate space for the triangular factor information */
423:         PetscNew(&loTriFactor);
424:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425:         /* Create the matrix description */
426:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430:        #else
431:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432:        #endif
433:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

436:         /* set the operation */
437:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

439:         /* set the matrix */
440:         loTriFactor->csrMat = new CsrMatrix;
441:         loTriFactor->csrMat->num_rows = n;
442:         loTriFactor->csrMat->num_cols = n;
443:         loTriFactor->csrMat->num_entries = nzLower;

445:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

448:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

451:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

454:         /* Create the solve analysis information */
455:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
456:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464:       #endif

466:         /* perform the solve analysis */
467:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473:                                #endif
474: );CHKERRCUSPARSE(stat);
475:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

478:         /* assign the pointer */
479:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480:         loTriFactor->AA_h = AALo;
481:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
484:       } else { /* update values only */
485:         if (!loTriFactor->AA_h) {
486:           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487:         }
488:         /* Fill the lower triangular matrix */
489:         loTriFactor->AA_h[0]  = 1.0;
490:         v        = aa;
491:         vi       = aj;
492:         offset   = 1;
493:         for (i=1; i<n; i++) {
494:           nz = ai[i+1] - ai[i];
495:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
496:           offset      += nz;
497:           loTriFactor->AA_h[offset] = 1.0;
498:           offset      += 1;
499:           v  += nz;
500:         }
501:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
503:       }
504:     } catch(char *ex) {
505:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506:     }
507:   }
508:   return(0);
509: }

511: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512: {
513:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514:   PetscInt                          n = A->rmap->n;
515:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517:   cusparseStatus_t                  stat;
518:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519:   const MatScalar                   *aa = a->a,*v;
520:   PetscInt                          *AiUp, *AjUp;
521:   PetscInt                          i,nz, nzUpper, offset;
522:   PetscErrorCode                    ierr;
523:   cudaError_t                       cerr;

526:   if (!n) return(0);
527:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528:     try {
529:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
530:       nzUpper = adiag[0]-adiag[n];
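      /* the factored U is stored backwards (row n-1 first), with adiag[i] indexing the
         diagonal entry of row i; adiag[0]-adiag[n] therefore spans every upper
         triangular entry, diagonals included (see how v, vi and nz are derived from
         adiag[] in the loop below) */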
531:       if (!upTriFactor) {
532:         PetscScalar *AAUp;

534:         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

536:         /* Allocate Space for the upper triangular matrix */
537:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

540:         /* Fill the upper triangular matrix */
541:         AiUp[0]=(PetscInt) 0;
542:         AiUp[n]=nzUpper;
543:         offset = nzUpper;
544:         for (i=n-1; i>=0; i--) {
545:           v  = aa + adiag[i+1] + 1;
546:           vi = aj + adiag[i+1] + 1;

548:           /* number of elements NOT on the diagonal */
549:           nz = adiag[i] - adiag[i+1]-1;

551:           /* decrement the offset */
552:           offset -= (nz+1);

554:           /* first, set the diagonal elements */
555:           AjUp[offset] = (PetscInt) i;
556:           AAUp[offset] = (MatScalar)1./v[nz];
557:           AiUp[i]      = AiUp[i+1] - (nz+1);

559:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
560:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
561:         }

563:         /* allocate space for the triangular factor information */
564:         PetscNew(&upTriFactor);
565:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

567:         /* Create the matrix description */
568:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572:        #else
573:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574:        #endif
575:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

578:         /* set the operation */
579:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

581:         /* set the matrix */
582:         upTriFactor->csrMat = new CsrMatrix;
583:         upTriFactor->csrMat->num_rows = n;
584:         upTriFactor->csrMat->num_cols = n;
585:         upTriFactor->csrMat->num_entries = nzUpper;

587:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

590:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

593:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

596:         /* Create the solve analysis information */
597:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
598:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606:       #endif

608:         /* perform the solve analysis */
609:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615:                                #endif
616: );CHKERRCUSPARSE(stat);
617:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

620:         /* assign the pointer */
621:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622:         upTriFactor->AA_h = AAUp;
623:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
626:       } else {
627:         if (!upTriFactor->AA_h) {
628:           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629:         }
630:         /* Fill the upper triangular matrix */
631:         offset = nzUpper;
632:         for (i=n-1; i>=0; i--) {
633:           v  = aa + adiag[i+1] + 1;

635:           /* number of elements NOT on the diagonal */
636:           nz = adiag[i] - adiag[i+1]-1;

638:           /* decrement the offset */
639:           offset -= (nz+1);

641:           /* first, set the diagonal elements */
642:           upTriFactor->AA_h[offset] = 1./v[nz];
643:           PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
644:         }
645:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
647:       }
648:     } catch(char *ex) {
649:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650:     }
651:   }
652:   return(0);
653: }

655: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656: {
657:   PetscErrorCode               ierr;
658:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
659:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660:   IS                           isrow = a->row,iscol = a->icol;
661:   PetscBool                    row_identity,col_identity;
662:   PetscInt                     n = A->rmap->n;

665:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
667:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

669:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670:   cusparseTriFactors->nnz=a->nz;

672:   A->offloadmask = PETSC_OFFLOAD_BOTH;
673:   /* lower triangular indices */
674:   ISIdentity(isrow,&row_identity);
675:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
676:     const PetscInt *r;

678:     ISGetIndices(isrow,&r);
679:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680:     cusparseTriFactors->rpermIndices->assign(r, r+n);
681:     ISRestoreIndices(isrow,&r);
682:     PetscLogCpuToGpu(n*sizeof(PetscInt));
683:   }

685:   /* upper triangular indices */
686:   ISIdentity(iscol,&col_identity);
687:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
688:     const PetscInt *c;

690:     ISGetIndices(iscol,&c);
691:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692:     cusparseTriFactors->cpermIndices->assign(c, c+n);
693:     ISRestoreIndices(iscol,&c);
694:     PetscLogCpuToGpu(n*sizeof(PetscInt));
695:   }
696:   return(0);
697: }

699: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700: {
701:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705:   cusparseStatus_t                  stat;
706:   PetscErrorCode                    ierr;
707:   cudaError_t                       cerr;
708:   PetscInt                          *AiUp, *AjUp;
709:   PetscScalar                       *AAUp;
710:   PetscScalar                       *AALo;
711:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714:   const MatScalar                   *aa = b->a,*v;

717:   if (!n) return(0);
718:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719:     try {
720:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722:       if (!upTriFactor && !loTriFactor) {
723:         /* Allocate Space for the upper triangular matrix */
724:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

727:         /* Fill the upper triangular matrix */
728:         AiUp[0]=(PetscInt) 0;
729:         AiUp[n]=nzUpper;
730:         offset = 0;
731:         for (i=0; i<n; i++) {
732:           /* set the pointers */
733:           v  = aa + ai[i];
734:           vj = aj + ai[i];
735:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

737:           /* first, set the diagonal elements */
738:           AjUp[offset] = (PetscInt) i;
739:           AAUp[offset] = (MatScalar)1.0/v[nz];
740:           AiUp[i]      = offset;
741:           AALo[offset] = (MatScalar)1.0/v[nz];

743:           offset+=1;
744:           if (nz>0) {
745:             PetscArraycpy(&(AjUp[offset]), vj, nz);
746:             PetscArraycpy(&(AAUp[offset]), v, nz);
747:             for (j=offset; j<offset+nz; j++) {
748:               AAUp[j] = -AAUp[j];
749:               AALo[j] = AAUp[j]/v[nz];
750:             }
751:             offset+=nz;
752:           }
753:         }

755:         /* allocate space for the triangular factor information */
756:         PetscNew(&upTriFactor);
757:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

759:         /* Create the matrix description */
760:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764:        #else
765:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766:        #endif
767:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

770:         /* set the matrix */
771:         upTriFactor->csrMat = new CsrMatrix;
772:         upTriFactor->csrMat->num_rows = A->rmap->n;
773:         upTriFactor->csrMat->num_cols = A->cmap->n;
774:         upTriFactor->csrMat->num_entries = a->nz;

776:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

779:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

782:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

785:         /* set the operation */
786:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

788:         /* Create the solve analysis information */
789:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
790:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798:       #endif

800:         /* perform the solve analysis */
801:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807:                                 #endif
808: );CHKERRCUSPARSE(stat);
809:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

812:         /* assign the pointer */
813:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

815:         /* allocate space for the triangular factor information */
816:         PetscNew(&loTriFactor);
817:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

819:         /* Create the matrix description */
820:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824:        #else
825:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826:        #endif
827:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

830:         /* set the operation */
831:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

833:         /* set the matrix */
834:         loTriFactor->csrMat = new CsrMatrix;
835:         loTriFactor->csrMat->num_rows = A->rmap->n;
836:         loTriFactor->csrMat->num_cols = A->cmap->n;
837:         loTriFactor->csrMat->num_entries = a->nz;

839:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

842:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

845:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

848:         /* Create the solve analysis information */
849:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
850:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858:       #endif

860:         /* perform the solve analysis */
861:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867:                                 #endif
868: );CHKERRCUSPARSE(stat);
869:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

872:         /* assign the pointer */
873:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

875:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
876:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878:       } else {
879:         /* Fill the upper triangular matrix */
880:         offset = 0;
881:         for (i=0; i<n; i++) {
882:           /* set the pointers */
883:           v  = aa + ai[i];
884:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

886:           /* first, set the diagonal elements */
887:           AAUp[offset] = 1.0/v[nz];
888:           AALo[offset] = 1.0/v[nz];

890:           offset+=1;
891:           if (nz>0) {
892:             PetscArraycpy(&(AAUp[offset]), v, nz);
893:             for (j=offset; j<offset+nz; j++) {
894:               AAUp[j] = -AAUp[j];
895:               AALo[j] = AAUp[j]/v[nz];
896:             }
897:             offset+=nz;
898:           }
899:         }
900:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
901:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
902:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
905:       }
906:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908:     } catch(char *ex) {
909:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910:     }
911:   }
912:   return(0);
913: }

915: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916: {
917:   PetscErrorCode               ierr;
918:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
919:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920:   IS                           ip = a->row;
921:   PetscBool                    perm_identity;
922:   PetscInt                     n = A->rmap->n;

925:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
927:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
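  /* a->nz counts the stored upper triangle, diagonal included; between them the two
     triangular factors hold every off-diagonal entry twice and the n diagonal entries once */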

930:   A->offloadmask = PETSC_OFFLOAD_BOTH;

932:   /* lower triangular indices */
933:   ISIdentity(ip,&perm_identity);
934:   if (!perm_identity) {
935:     IS             iip;
936:     const PetscInt *irip,*rip;

938:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
939:     ISGetIndices(iip,&irip);
940:     ISGetIndices(ip,&rip);
941:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945:     ISRestoreIndices(iip,&irip);
946:     ISDestroy(&iip);
947:     ISRestoreIndices(ip,&rip);
948:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
949:   }
950:   return(0);
951: }

953: #define CHECK_LAUNCH_ERROR()                                                             \
954: do {                                                                                     \
955:   /* Check synchronous errors, i.e. pre-launch */                                        \
956:   cudaError_t err = cudaGetLastError();                                                  \
957:   if (cudaSuccess != err) {                                                              \
958:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
959:   }                                                                                      \
960:   /* Check asynchronous errors, i.e. the kernel failed (unspecified launch failure, ULF) */     \
961:   err = cudaDeviceSynchronize();                                                         \
962:   if (cudaSuccess != err) {                                                              \
963:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964:   }                                                                                      \
965:  } while (0)
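/*
  A usage sketch with a hypothetical kernel, showing where the macro is meant to sit:

      myKernel<<<nblocks,nthreads>>>(args);   // hypothetical launch
      CHECK_LAUNCH_ERROR();                   // catches bad launch configurations and kernel faults

  The cudaDeviceSynchronize() inside the macro makes the check blocking, so it should be
  placed immediately after the launch it is meant to verify.
*/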

967: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968: {
969:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
970:   IS             ip = b->row;
971:   PetscBool      perm_identity;

975:   MatSeqAIJCUSPARSECopyFromGPU(A);
976:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
977:   B->offloadmask = PETSC_OFFLOAD_CPU;
978:   /* determine which version of MatSolve needs to be used. */
979:   ISIdentity(ip,&perm_identity);
980:   if (perm_identity) {
981:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
983:     B->ops->matsolve = NULL;
984:     B->ops->matsolvetranspose = NULL;
985:   } else {
986:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
988:     B->ops->matsolve = NULL;
989:     B->ops->matsolvetranspose = NULL;
990:   }

992:   /* get the triangular factors */
993:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
994:   return(0);
995: }

997: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998: {
999:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004:   cusparseStatus_t                  stat;
1005:   cusparseIndexBase_t               indexBase;
1006:   cusparseMatrixType_t              matrixType;
1007:   cusparseFillMode_t                fillMode;
1008:   cusparseDiagType_t                diagType;
1009:   cudaError_t                       cerr;
1010:   PetscErrorCode                    ierr;

1013:   /* allocate space for the transpose of the lower triangular factor */
1014:   PetscNew(&loTriFactorT);
1015:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1017:   /* set the matrix descriptors of the lower triangular factor */
1018:   matrixType = cusparseGetMatType(loTriFactor->descr);
1019:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

1024:   /* Create the matrix description */
1025:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1026:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1027:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1028:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1029:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1031:   /* set the operation */
1032:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1034:   /* allocate GPU space for the CSC of the lower triangular factor*/
1035:   loTriFactorT->csrMat = new CsrMatrix;
1036:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1037:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1038:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1039:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1043:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
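  /* note: a CSR matrix reinterpreted through its CSC arrays is exactly its transpose in
     CSR form, so a single csr2csc conversion yields the data needed for transpose solves */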
1044: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047:                                        loTriFactor->csrMat->values->data().get(),
1048:                                        loTriFactor->csrMat->row_offsets->data().get(),
1049:                                        loTriFactor->csrMat->column_indices->data().get(),
1050:                                        loTriFactorT->csrMat->values->data().get(),
1051:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1053:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1054:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055: #endif

1057:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1058:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060:                           loTriFactor->csrMat->values->data().get(),
1061:                           loTriFactor->csrMat->row_offsets->data().get(),
1062:                           loTriFactor->csrMat->column_indices->data().get(),
1063:                           loTriFactorT->csrMat->values->data().get(),
1064:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1067:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068:                         #else
1069:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070:                           CUSPARSE_ACTION_NUMERIC, indexBase
1071:                         #endif
1072: );CHKERRCUSPARSE(stat);
1073:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1076:   /* Create the solve analysis information */
1077:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1078:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086: #endif

1088:   /* perform the solve analysis */
1089:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095:                           #endif
1096: );CHKERRCUSPARSE(stat);
1097:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1100:   /* assign the pointer */
1101:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1103:   /*********************************************/
1104:   /* Now the Transpose of the Upper Tri Factor */
1105:   /*********************************************/

1107:   /* allocate space for the transpose of the upper triangular factor */
1108:   PetscNew(&upTriFactorT);
1109:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1111:   /* set the matrix descriptors of the upper triangular factor */
1112:   matrixType = cusparseGetMatType(upTriFactor->descr);
1113:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1118:   /* Create the matrix description */
1119:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1125:   /* set the operation */
1126:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1128:   /* allocate GPU space for the CSC of the upper triangular factor */
1129:   upTriFactorT->csrMat = new CsrMatrix;
1130:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1131:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1132:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1133:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1137:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141:                                 upTriFactor->csrMat->values->data().get(),
1142:                                 upTriFactor->csrMat->row_offsets->data().get(),
1143:                                 upTriFactor->csrMat->column_indices->data().get(),
1144:                                 upTriFactorT->csrMat->values->data().get(),
1145:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1147:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149: #endif

1151:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1152:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154:                           upTriFactor->csrMat->values->data().get(),
1155:                           upTriFactor->csrMat->row_offsets->data().get(),
1156:                           upTriFactor->csrMat->column_indices->data().get(),
1157:                           upTriFactorT->csrMat->values->data().get(),
1158:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1161:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162:                         #else
1163:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164:                           CUSPARSE_ACTION_NUMERIC, indexBase
1165:                         #endif
1166: );CHKERRCUSPARSE(stat);
1167:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1170:   /* Create the solve analysis information */
1171:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1172:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173:   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180:   #endif

1182:   /* perform the solve analysis */
1183:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189:                           #endif
1190: );CHKERRCUSPARSE(stat);
1191:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1194:   /* assign the pointer */
1195:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196:   return(0);
1197: }
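
/* Note: both conversions above exploit the fact that the CSC arrays of a matrix are
   exactly the CSR arrays of its transpose, so one numeric csr2csc call materializes
   the transposed factor. A tiny worked example (illustration only, not part of the build):

     A = [1 0 2]    CSR(A):            row_offsets = {0,2,3},   column_indices = {0,2,1}, values = {1,2,3}
         [0 3 0]    CSC(A) = CSR(A^T): row_offsets = {0,1,2,3}, column_indices = {0,1,0}, values = {1,3,2}
*/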

1199: struct PetscScalarToPetscInt
1200: {
1201:   __host__ __device__
1202:   PetscInt operator()(PetscScalar s)
1203:   {
1204:     return (PetscInt)PetscRealPart(s);
1205:   }
1206: };
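
/* This functor recovers integer indices that were pushed through a numeric csr2csc
   call encoded in the real part of a PetscScalar; its single use is in
   MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below, essentially

     thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(),
                       cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt());

   which casts each transposed "value" back to the PetscInt position it encodes. */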

1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213:   cusparseStatus_t             stat;
1214:   cusparseIndexBase_t          indexBase;
1215:   cudaError_t                  err;
1216:   PetscErrorCode               ierr;

1219:   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220:   MatSeqAIJCUSPARSECopyToGPU(A);
1221:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222:   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224:   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225:   if (A->transupdated) return(0);
1226:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228:     MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229:   }
1230:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232:     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234:     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235:     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1237:     /* set alpha and beta */
1238:     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239:     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240:     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241:     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242:     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243:     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

1245:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246:       CsrMatrix *matrixT = new CsrMatrix;
1247:       matstructT->mat = matrixT;
1248:       matrixT->num_rows = A->cmap->n;
1249:       matrixT->num_cols = A->rmap->n;
1250:       matrixT->num_entries = a->nz;
1251:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253:       matrixT->values = new THRUSTARRAY(a->nz);

1255:       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1258:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259:       stat = cusparseCreateCsr(&matstructT->matDescr,
1260:                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261:                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262:                                matrixT->values->data().get(),
1263:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264:                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265:      #endif
1266:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269:    #else
1270:       CsrMatrix *temp  = new CsrMatrix;
1271:       CsrMatrix *tempT = new CsrMatrix;
1272:       /* First convert HYB to CSR */
1273:       temp->num_rows = A->rmap->n;
1274:       temp->num_cols = A->cmap->n;
1275:       temp->num_entries = a->nz;
1276:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278:       temp->values = new THRUSTARRAY(a->nz);

1280:       stat = cusparse_hyb2csr(cusparsestruct->handle,
1281:                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282:                               temp->values->data().get(),
1283:                               temp->row_offsets->data().get(),
1284:                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1286:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287:       tempT->num_rows = A->cmap->n; /* the transpose swaps the dimensions */
1288:       tempT->num_cols = A->rmap->n;
1289:       tempT->num_entries = a->nz;
1290:       tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
1291:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292:       tempT->values = new THRUSTARRAY(a->nz);

1294:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295:                               temp->num_cols, temp->num_entries,
1296:                               temp->values->data().get(),
1297:                               temp->row_offsets->data().get(),
1298:                               temp->column_indices->data().get(),
1299:                               tempT->values->data().get(),
1300:                               tempT->column_indices->data().get(),
1301:                               tempT->row_offsets->data().get(),
1302:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1304:       /* Last, convert CSC to HYB */
1305:       cusparseHybMat_t hybMat;
1306:       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307:       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308:         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
1310:                               matstructT->descr, tempT->values->data().get(),
1311:                               tempT->row_offsets->data().get(),
1312:                               tempT->column_indices->data().get(),
1313:                               hybMat, 0, partition);CHKERRCUSPARSE(stat);

1315:       /* assign the pointer */
1316:       matstructT->mat = hybMat;
1317:       A->transupdated = PETSC_TRUE;
1318:       /* delete temporaries */
1319:       if (tempT) {
1320:         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321:         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322:         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323:         delete (CsrMatrix*) tempT;
1324:       }
1325:       if (temp) {
1326:         if (temp->values) delete (THRUSTARRAY*) temp->values;
1327:         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328:         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329:         delete (CsrMatrix*) temp;
1330:       }
1331:      #endif
1332:     }
1333:   }
1334:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335:     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1336:     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337:     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338:     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339:     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341:     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342:     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343:     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344:     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346:       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1347:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348:       PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1349:     }
1350:     if (!cusparsestruct->csr2csc_i) {
1351:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1352:       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1354:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356:       void   *csr2cscBuffer;
1357:       size_t csr2cscBufferSize;
1358:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359:                                            A->cmap->n, matrix->num_entries,
1360:                                            matrix->values->data().get(),
1361:                                            cusparsestruct->rowoffsets_gpu->data().get(),
1362:                                            matrix->column_indices->data().get(),
1363:                                            matrixT->values->data().get(),
1364:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365:                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1366:                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367:       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368:      #endif

1370:       if (matrix->num_entries) {
1371:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373:            I checked every parameter and they were all fine. I have no clue why cusparse complains.

1375:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376:            should be filled with indexBase. So I just take a shortcut here.
1377:         */
1378:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379:                               A->cmap->n,matrix->num_entries,
1380:                               csr2csc_a.data().get(),
1381:                               cusparsestruct->rowoffsets_gpu->data().get(),
1382:                               matrix->column_indices->data().get(),
1383:                               matrixT->values->data().get(),
1384:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385:                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386:                               CUSPARSE_ACTION_NUMERIC,indexBase,
1387:                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388:                              #else
1389:                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391:                              #endif
1392:       } else {
1393:         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394:       }

1396:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397:       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399:       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400:      #endif
1401:     }
1402:     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403:                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404:                                                      matrixT->values->begin()));
1405:   }
1406:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1407:   /* the compressed row indices are not used for matTranspose */
1408:   matstructT->cprowIndices = NULL;
1409:   /* assign the pointer */
1410:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411:   A->transupdated = PETSC_TRUE;
1412:   return(0);
1413: }
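
/* A note on the csr2csc_i caching above: rather than re-running csr2csc whenever the
   values change, the routine transposes the sequence 0,1,...,nnz-1 once; the values of
   the transposed matrix are then exactly the CSR-to-CSC permutation, saved in
   csr2csc_i. A worked example (illustration only): with CSR values ordered
   {a00,a02,a11}, the sequence {0,1,2} transposes to {0,2,1}, and

     matrixT->values[k] = matrix->values[csr2csc_i[k]]

   reproduces the transpose for any updated values with the single permuted
   thrust::copy at the end of the routine. */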

1415: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1416: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417: {
1418:   PetscInt                              n = xx->map->n;
1419:   const PetscScalar                     *barray;
1420:   PetscScalar                           *xarray;
1421:   thrust::device_ptr<const PetscScalar> bGPU;
1422:   thrust::device_ptr<PetscScalar>       xGPU;
1423:   cusparseStatus_t                      stat;
1424:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428:   PetscErrorCode                        ierr;
1429:   cudaError_t                           cerr;

1432:   /* Analyze the matrix and create the transpose ... on the fly */
1433:   if (!loTriFactorT && !upTriFactorT) {
1434:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1435:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437:   }

1439:   /* Get the GPU pointers */
1440:   VecCUDAGetArrayWrite(xx,&xarray);
1441:   VecCUDAGetArrayRead(bb,&barray);
1442:   xGPU = thrust::device_pointer_cast(xarray);
1443:   bGPU = thrust::device_pointer_cast(barray);

1445:   PetscLogGpuTimeBegin();
1446:   /* First, reorder with the row permutation */
1447:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449:                xGPU);

1451:   /* Next, solve U */
1452:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453:                         upTriFactorT->csrMat->num_rows,
1454:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455:                         upTriFactorT->csrMat->num_entries,
1456:                       #endif
1457:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458:                         upTriFactorT->csrMat->values->data().get(),
1459:                         upTriFactorT->csrMat->row_offsets->data().get(),
1460:                         upTriFactorT->csrMat->column_indices->data().get(),
1461:                         upTriFactorT->solveInfo,
1462:                         xarray, tempGPU->data().get()
1463:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465:                       #endif
1466: );CHKERRCUSPARSE(stat);

1468:   /* Then, solve L */
1469:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470:                         loTriFactorT->csrMat->num_rows,
1471:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472:                         loTriFactorT->csrMat->num_entries,
1473:                       #endif
1474:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475:                         loTriFactorT->csrMat->values->data().get(),
1476:                         loTriFactorT->csrMat->row_offsets->data().get(),
1477:                         loTriFactorT->csrMat->column_indices->data().get(),
1478:                         loTriFactorT->solveInfo,
1479:                         tempGPU->data().get(), xarray
1480:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482:                       #endif
1483: );CHKERRCUSPARSE(stat);

1485:   /* Next, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1486:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488:                tempGPU->begin());

1490:   /* Copy the temporary to the full solution. */
1491:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

1493:   /* restore */
1494:   VecCUDARestoreArrayRead(bb,&barray);
1495:   VecCUDARestoreArrayWrite(xx,&xarray);
1496:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497:   PetscLogGpuTimeEnd();
1498:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1499:   return(0);
1500: }
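
/* The factor order flips for the transposed solve: with A = L*U (after reordering),
   A^T = U^T * L^T, so A^T x = b is solved as U^T y = b followed by L^T x = y. That is
   why upTriFactorT is applied before loTriFactorT above, the reverse of the L-then-U
   order in MatSolve_SeqAIJCUSPARSE() below. */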

1502: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503: {
1504:   const PetscScalar                 *barray;
1505:   PetscScalar                       *xarray;
1506:   cusparseStatus_t                  stat;
1507:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511:   PetscErrorCode                    ierr;
1512:   cudaError_t                       cerr;

1515:   /* Analyze the matrix and create the transpose ... on the fly */
1516:   if (!loTriFactorT && !upTriFactorT) {
1517:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1518:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520:   }

1522:   /* Get the GPU pointers */
1523:   VecCUDAGetArrayWrite(xx,&xarray);
1524:   VecCUDAGetArrayRead(bb,&barray);

1526:   PetscLogGpuTimeBegin();
1527:   /* First, solve U */
1528:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529:                         upTriFactorT->csrMat->num_rows,
1530:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531:                         upTriFactorT->csrMat->num_entries,
1532:                       #endif
1533:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534:                         upTriFactorT->csrMat->values->data().get(),
1535:                         upTriFactorT->csrMat->row_offsets->data().get(),
1536:                         upTriFactorT->csrMat->column_indices->data().get(),
1537:                         upTriFactorT->solveInfo,
1538:                         barray, tempGPU->data().get()
1539:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541:                       #endif
1542: );CHKERRCUSPARSE(stat);

1544:   /* Then, solve L */
1545:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546:                         loTriFactorT->csrMat->num_rows,
1547:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548:                         loTriFactorT->csrMat->num_entries,
1549:                       #endif
1550:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551:                         loTriFactorT->csrMat->values->data().get(),
1552:                         loTriFactorT->csrMat->row_offsets->data().get(),
1553:                         loTriFactorT->csrMat->column_indices->data().get(),
1554:                         loTriFactorT->solveInfo,
1555:                         tempGPU->data().get(), xarray
1556:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558:                       #endif
1559: );CHKERRCUSPARSE(stat);

1561:   /* restore */
1562:   VecCUDARestoreArrayRead(bb,&barray);
1563:   VecCUDARestoreArrayWrite(xx,&xarray);
1564:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565:   PetscLogGpuTimeEnd();
1566:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1567:   return(0);
1568: }

1570: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571: {
1572:   const PetscScalar                     *barray;
1573:   PetscScalar                           *xarray;
1574:   thrust::device_ptr<const PetscScalar> bGPU;
1575:   thrust::device_ptr<PetscScalar>       xGPU;
1576:   cusparseStatus_t                      stat;
1577:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581:   PetscErrorCode                        ierr;
1582:   cudaError_t                           cerr;


1586:   /* Get the GPU pointers */
1587:   VecCUDAGetArrayWrite(xx,&xarray);
1588:   VecCUDAGetArrayRead(bb,&barray);
1589:   xGPU = thrust::device_pointer_cast(xarray);
1590:   bGPU = thrust::device_pointer_cast(barray);

1592:   PetscLogGpuTimeBegin();
1593:   /* First, reorder with the row permutation */
1594:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596:                tempGPU->begin());

1598:   /* Next, solve L */
1599:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600:                         loTriFactor->csrMat->num_rows,
1601:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602:                         loTriFactor->csrMat->num_entries,
1603:                       #endif
1604:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605:                         loTriFactor->csrMat->values->data().get(),
1606:                         loTriFactor->csrMat->row_offsets->data().get(),
1607:                         loTriFactor->csrMat->column_indices->data().get(),
1608:                         loTriFactor->solveInfo,
1609:                         tempGPU->data().get(), xarray
1610:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612:                       #endif
1613: );CHKERRCUSPARSE(stat);

1615:   /* Then, solve U */
1616:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617:                         upTriFactor->csrMat->num_rows,
1618:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619:                         upTriFactor->csrMat->num_entries,
1620:                       #endif
1621:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622:                         upTriFactor->csrMat->values->data().get(),
1623:                         upTriFactor->csrMat->row_offsets->data().get(),
1624:                         upTriFactor->csrMat->column_indices->data().get(),
1625:                         upTriFactor->solveInfo,
1626:                         xarray, tempGPU->data().get()
1627:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629:                       #endif
1630: );CHKERRCUSPARSE(stat);

1632:   /* Last, reorder with the column permutation */
1633:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635:                xGPU);

1637:   VecCUDARestoreArrayRead(bb,&barray);
1638:   VecCUDARestoreArrayWrite(xx,&xarray);
1639:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640:   PetscLogGpuTimeEnd();
1641:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1642:   return(0);
1643: }
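
/* In effect the routine computes x = cperm(U \ (L \ rperm(b))), the standard solve for
   a factorization of the reordered matrix. The logged count 2*nnz - n is the usual
   rough estimate for the forward plus backward triangular solves: about one
   multiply-add per stored factor entry, less the diagonal. */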

1645: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646: {
1647:   const PetscScalar                 *barray;
1648:   PetscScalar                       *xarray;
1649:   cusparseStatus_t                  stat;
1650:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654:   PetscErrorCode                    ierr;
1655:   cudaError_t                       cerr;

1658:   /* Get the GPU pointers */
1659:   VecCUDAGetArrayWrite(xx,&xarray);
1660:   VecCUDAGetArrayRead(bb,&barray);

1662:   PetscLogGpuTimeBegin();
1663:   /* First, solve L */
1664:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665:                         loTriFactor->csrMat->num_rows,
1666:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667:                         loTriFactor->csrMat->num_entries,
1668:                       #endif
1669:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670:                         loTriFactor->csrMat->values->data().get(),
1671:                         loTriFactor->csrMat->row_offsets->data().get(),
1672:                         loTriFactor->csrMat->column_indices->data().get(),
1673:                         loTriFactor->solveInfo,
1674:                         barray, tempGPU->data().get()
1675:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677:                       #endif
1678: );CHKERRCUSPARSE(stat);

1680:   /* Next, solve U */
1681:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682:                         upTriFactor->csrMat->num_rows,
1683:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684:                         upTriFactor->csrMat->num_entries,
1685:                       #endif
1686:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687:                         upTriFactor->csrMat->values->data().get(),
1688:                         upTriFactor->csrMat->row_offsets->data().get(),
1689:                         upTriFactor->csrMat->column_indices->data().get(),
1690:                         upTriFactor->solveInfo,
1691:                         tempGPU->data().get(), xarray
1692:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694:                       #endif
1695: );CHKERRCUSPARSE(stat);

1697:   VecCUDARestoreArrayRead(bb,&barray);
1698:   VecCUDARestoreArrayWrite(xx,&xarray);
1699:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700:   PetscLogGpuTimeEnd();
1701:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1702:   return(0);
1703: }

1705: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706: {
1707:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1708:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709:   cudaError_t        cerr;
1710:   PetscErrorCode     ierr;

1713:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1716:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1717:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1720:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1721:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1722:   }
1723:   return(0);
1724: }
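
/* Only the numerical values travel in this download: the sparsity pattern (a->i, a->j)
   is identical on host and device, so a single cudaMemcpy of a->nz scalars suffices,
   and the offload mask moves from PETSC_OFFLOAD_GPU to PETSC_OFFLOAD_BOTH to record
   that the two copies are again synchronized. */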

1726: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727: {
1728:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1732:   MatSeqAIJCUSPARSECopyFromGPU(A);
1733:   *array = a->a;
1734:   A->offloadmask = PETSC_OFFLOAD_CPU;
1735:   return(0);
1736: }
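
/* Handing out a writable pointer to the host values downgrades the offload mask to
   PETSC_OFFLOAD_CPU: the caller may modify a->a, so the device copy can no longer be
   trusted and the next GPU operation will trigger a fresh upload. */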

1738: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739: {
1740:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1743:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1744:   PetscErrorCode               ierr;
1745:   cusparseStatus_t             stat;
1746:   PetscBool                    both = PETSC_TRUE;
1747:   cudaError_t                  err;

1750:   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753:       CsrMatrix *matrix;
1754:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1756:       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1758:       matrix->values->assign(a->a, a->a+a->nz);
1759:       err  = WaitForCUDA();CHKERRCUDA(err);
1760:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1761:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1762:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1763:     } else {
1764:       PetscInt nnz;
1765:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1766:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1767:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1768:       delete cusparsestruct->workVector;
1769:       delete cusparsestruct->rowoffsets_gpu;
1770:       cusparsestruct->workVector = NULL;
1771:       cusparsestruct->rowoffsets_gpu = NULL;
1772:       try {
1773:         if (a->compressedrow.use) {
1774:           m    = a->compressedrow.nrows;
1775:           ii   = a->compressedrow.i;
1776:           ridx = a->compressedrow.rindex;
1777:         } else {
1778:           m    = A->rmap->n;
1779:           ii   = a->i;
1780:           ridx = NULL;
1781:         }
1782:         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783:         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784:         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785:         else nnz = a->nz;

1787:         /* create cusparse matrix */
1788:         cusparsestruct->nrows = m;
1789:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1794:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1802:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1803:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804:           /* set the matrix */
1805:           CsrMatrix *mat= new CsrMatrix;
1806:           mat->num_rows = m;
1807:           mat->num_cols = A->cmap->n;
1808:           mat->num_entries = nnz;
1809:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810:           mat->row_offsets->assign(ii, ii + m+1);

1812:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1813:           mat->column_indices->assign(a->j, a->j+nnz);

1815:           mat->values = new THRUSTARRAY(nnz);
1816:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1818:           /* assign the pointer */
1819:           matstruct->mat = mat;
1820:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822:             stat = cusparseCreateCsr(&matstruct->matDescr,
1823:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1824:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825:                                     mat->values->data().get(),
1826:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828:           }
1829:          #endif
1830:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833:          #else
1834:           CsrMatrix *mat= new CsrMatrix;
1835:           mat->num_rows = m;
1836:           mat->num_cols = A->cmap->n;
1837:           mat->num_entries = nnz;
1838:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839:           mat->row_offsets->assign(ii, ii + m+1);

1841:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1842:           mat->column_indices->assign(a->j, a->j+nnz);

1844:           mat->values = new THRUSTARRAY(nnz);
1845:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1847:           cusparseHybMat_t hybMat;
1848:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852:               matstruct->descr, mat->values->data().get(),
1853:               mat->row_offsets->data().get(),
1854:               mat->column_indices->data().get(),
1855:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856:           /* assign the pointer */
1857:           matstruct->mat = hybMat;

1859:           if (mat) {
1860:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1861:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863:             delete (CsrMatrix*)mat;
1864:           }
1865:          #endif
1866:         }

1868:         /* assign the compressed row indices */
1869:         if (a->compressedrow.use) {
1870:           cusparsestruct->workVector = new THRUSTARRAY(m);
1871:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1872:           matstruct->cprowIndices->assign(ridx,ridx+m);
1873:           tmp = m;
1874:         } else {
1875:           cusparsestruct->workVector = NULL;
1876:           matstruct->cprowIndices    = NULL;
1877:           tmp = 0;
1878:         }
1879:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1881:         /* assign the pointer */
1882:         cusparsestruct->mat = matstruct;
1883:       } catch(char *ex) {
1884:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885:       }
1886:       err  = WaitForCUDA();CHKERRCUDA(err);
1887:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1888:       cusparsestruct->nonzerostate = A->nonzerostate;
1889:     }
1890:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891:   }
1892:   return(0);
1893: }
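
/* MatSeqAIJCUSPARSECopyToGPU() has a fast and a slow path. When the nonzero pattern is
   unchanged (A->nonzerostate matches) and the format is CSR, only the values array is
   re-uploaded; otherwise the whole Mat_SeqAIJCUSPARSEMultStruct is rebuilt and any
   cached transpose is invalidated. Roughly (illustration only):

     fast path:  nz scalars host-to-device
     slow path:  (m+1) + nz ints, nz scalars, plus descriptor re-creation

   so preserving the nonzero state across assemblies is what keeps updates cheap. */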

1895: struct VecCUDAPlusEquals
1896: {
1897:   template <typename Tuple>
1898:   __host__ __device__
1899:   void operator()(Tuple t)
1900:   {
1901:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902:   }
1903: };

1905: struct VecCUDAEquals
1906: {
1907:   template <typename Tuple>
1908:   __host__ __device__
1909:   void operator()(Tuple t)
1910:   {
1911:     thrust::get<1>(t) = thrust::get<0>(t);
1912:   }
1913: };

1915: struct VecCUDAEqualsReverse
1916: {
1917:   template <typename Tuple>
1918:   __host__ __device__
1919:   void operator()(Tuple t)
1920:   {
1921:     thrust::get<0>(t) = thrust::get<1>(t);
1922:   }
1923: };
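
/* These functors are applied with thrust::for_each over a zip iterator. A minimal
   usage sketch (illustration only, not part of the build):

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src.begin(), dst.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(src.end(),   dst.end())),
                      VecCUDAPlusEquals());

   performs dst[i] += src[i] on the device; VecCUDAEquals assigns instead, and
   VecCUDAEqualsReverse copies through the same tuple in the opposite direction. */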

1925: struct MatMatCusparse {
1926:   PetscBool             cisdense;
1927:   PetscScalar           *Bt;
1928:   Mat                   X;
1929:   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1930:   PetscLogDouble        flops;
1931:   CsrMatrix             *Bcsr;
1932: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933:   cusparseSpMatDescr_t  matSpBDescr;
1934:   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1935:   cusparseDnMatDescr_t  matBDescr;
1936:   cusparseDnMatDescr_t  matCDescr;
1937:   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1938:   size_t                mmBufferSize;
1939:   void                  *mmBuffer;
1940:   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941:   cusparseSpGEMMDescr_t spgemmDesc;
1942: #endif
1943: };
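
/* MatMatCusparse caches what cusparse needs across repeated numeric phases with the
   same symbolic data: the sparse/dense descriptors, the SpMM work buffer (grown only
   when a larger size is requested), and the leading dimensions Blda/Clda, whose change
   signals that the dense descriptors must be rebuilt. */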

1945: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946: {
1947:   PetscErrorCode   ierr;
1948:   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1949:   cudaError_t      cerr;
1950:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951:   cusparseStatus_t stat;
1952:  #endif

1955:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956:   delete mmdata->Bcsr;
1957:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958:   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959:   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960:   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961:   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962:   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963:   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964:  #endif
1965:   MatDestroy(&mmdata->X);
1966:   PetscFree(data);
1967:   return(0);
1968: }

1970: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1972: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973: {
1974:   Mat_Product                  *product = C->product;
1975:   Mat                          A,B;
1976:   PetscInt                     m,n,blda,clda;
1977:   PetscBool                    flg,biscuda;
1978:   Mat_SeqAIJCUSPARSE           *cusp;
1979:   cusparseStatus_t             stat;
1980:   cusparseOperation_t          opA;
1981:   const PetscScalar            *barray;
1982:   PetscScalar                  *carray;
1983:   PetscErrorCode               ierr;
1984:   MatMatCusparse               *mmdata;
1985:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1986:   CsrMatrix                    *csrmat;
1987:   cudaError_t                  cerr;

1990:   MatCheckProduct(C,1);
1991:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992:   mmdata = (MatMatCusparse*)product->data;
1993:   A    = product->A;
1994:   B    = product->B;
1995:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1996:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1998:      Instead of silently accepting the wrong answer, I prefer to raise an error */
1999:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000:   MatSeqAIJCUSPARSECopyToGPU(A);
2001:   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002:   switch (product->type) {
2003:   case MATPRODUCT_AB:
2004:   case MATPRODUCT_PtAP:
2005:     mat = cusp->mat;
2006:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007:     m   = A->rmap->n;
2008:     n   = B->cmap->n;
2009:     break;
2010:   case MATPRODUCT_AtB:
2011:     if (!A->form_explicit_transpose) {
2012:       mat = cusp->mat;
2013:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2014:     } else {
2015:       MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2016:       mat  = cusp->matTranspose;
2017:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018:     }
2019:     m = A->cmap->n;
2020:     n = B->cmap->n;
2021:     break;
2022:   case MATPRODUCT_ABt:
2023:   case MATPRODUCT_RARt:
2024:     mat = cusp->mat;
2025:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026:     m   = A->rmap->n;
2027:     n   = B->rmap->n;
2028:     break;
2029:   default:
2030:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031:   }
2032:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033:   csrmat = (CsrMatrix*)mat->mat;
2034:   /* if the user passed a CPU matrix, copy the data to the GPU */
2035:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2036:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2037:   MatDenseCUDAGetArrayRead(B,&barray);

2039:   MatDenseGetLDA(B,&blda);
2040:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2042:     MatDenseGetLDA(mmdata->X,&clda);
2043:   } else {
2044:     MatDenseCUDAGetArrayWrite(C,&carray);
2045:     MatDenseGetLDA(C,&clda);
2046:   }

2048:   PetscLogGpuTimeBegin();
2049:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2052:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053:     size_t mmBufferSize;
2054:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055:     if (!mmdata->matBDescr) {
2056:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057:       mmdata->Blda = blda;
2058:     }

2060:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063:       mmdata->Clda = clda;
2064:     }

2066:     if (!mat->matDescr) {
2067:       stat = cusparseCreateCsr(&mat->matDescr,
2068:                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069:                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070:                                csrmat->values->data().get(),
2071:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072:                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073:     }
2074:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076:                                    mmdata->matCDescr,cusparse_scalartype,
2077:                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079:       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080:       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081:       mmdata->mmBufferSize = mmBufferSize;
2082:     }
2083:     mmdata->initialized = PETSC_TRUE;
2084:   } else {
2085:     /* to be safe, always update pointers of the mats */
2086:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089:   }

2091:   /* do cusparseSpMM, which supports transpose on B */
2092:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094:                       mmdata->matCDescr,cusparse_scalartype,
2095:                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096:  #else
2097:   PetscInt k;
2098:   /* cusparseXcsrmm does not support transpose on B */
2099:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100:     cublasHandle_t cublasv2handle;
2101:     cublasStatus_t cerr;

2103:     PetscCUBLASGetHandle(&cublasv2handle);
2104:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105:                        B->cmap->n,B->rmap->n,
2106:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2107:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2108:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109:     blda = B->cmap->n;
2110:     k    = B->cmap->n;
2111:   } else {
2112:     k    = B->rmap->n;
2113:   }

2115:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2118:                            csrmat->values->data().get(),
2119:                            csrmat->row_offsets->data().get(),
2120:                            csrmat->column_indices->data().get(),
2121:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122:                            carray,clda);CHKERRCUSPARSE(stat);
2123:  #endif
2124:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125:   PetscLogGpuTimeEnd();
2126:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2127:   MatDenseCUDARestoreArrayRead(B,&barray);
2128:   if (product->type == MATPRODUCT_RARt) {
2129:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2130:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2131:   } else if (product->type == MATPRODUCT_PtAP) {
2132:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2133:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2134:   } else {
2135:     MatDenseCUDARestoreArrayWrite(C,&carray);
2136:   }
2137:   if (mmdata->cisdense) {
2138:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2139:   }
2140:   if (!biscuda) {
2141:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2142:   }
2143:   return(0);
2144: }
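
/* MATPRODUCT_RARt and MATPRODUCT_PtAP are assembled in two stages above: the SpMM
   first forms X = A*R^T (resp. X = A*P) in mmdata->X, then a dense-dense product
   supplies the remaining factor, C = R*X (resp. C = P^T*X), via
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private() with its transpose flag set
   accordingly. */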

2146: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147: {
2148:   Mat_Product        *product = C->product;
2149:   Mat                A,B;
2150:   PetscInt           m,n;
2151:   PetscBool          cisdense,flg;
2152:   PetscErrorCode     ierr;
2153:   MatMatCusparse     *mmdata;
2154:   Mat_SeqAIJCUSPARSE *cusp;

2157:   MatCheckProduct(C,1);
2158:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159:   A    = product->A;
2160:   B    = product->B;
2161:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2162:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165:   switch (product->type) {
2166:   case MATPRODUCT_AB:
2167:     m = A->rmap->n;
2168:     n = B->cmap->n;
2169:     break;
2170:   case MATPRODUCT_AtB:
2171:     m = A->cmap->n;
2172:     n = B->cmap->n;
2173:     break;
2174:   case MATPRODUCT_ABt:
2175:     m = A->rmap->n;
2176:     n = B->rmap->n;
2177:     break;
2178:   case MATPRODUCT_PtAP:
2179:     m = B->cmap->n;
2180:     n = B->cmap->n;
2181:     break;
2182:   case MATPRODUCT_RARt:
2183:     m = B->rmap->n;
2184:     n = B->rmap->n;
2185:     break;
2186:   default:
2187:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188:   }
2189:   MatSetSizes(C,m,n,m,n);
2190:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2192:   MatSetType(C,MATSEQDENSECUDA);

2194:   /* product data */
2195:   PetscNew(&mmdata);
2196:   mmdata->cisdense = cisdense;
2197:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2199:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201:   }
2202:  #endif
2203:   /* for these products we need intermediate storage */
2204:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2206:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2207:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2209:     } else {
2210:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2211:     }
2212:   }
2213:   C->product->data    = mmdata;
2214:   C->product->destroy = MatDestroy_MatMatCusparse;

2216:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217:   return(0);
2218: }
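
/* A minimal usage sketch (error checking omitted; A and B are assumed to be
   assembled matrices of type MATSEQAIJCUSPARSE and MATSEQDENSECUDA): the
   user-level MatProduct sequence that dispatches to the symbolic routine
   above for C = A*B:

     Mat C;
     MatProductCreate(A,B,NULL,&C);
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);
     MatProductNumeric(C);
*/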

2220: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221: {
2222:   Mat_Product                  *product = C->product;
2223:   Mat                          A,B;
2224:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2225:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2226:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2228:   PetscBool                    flg;
2229:   PetscErrorCode               ierr;
2230:   cusparseStatus_t             stat;
2231:   cudaError_t                  cerr;
2232:   MatProductType               ptype;
2233:   MatMatCusparse               *mmdata;
2234: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235:   cusparseSpMatDescr_t         BmatSpDescr;
2236: #endif

2239:   MatCheckProduct(C,1);
2240:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241:   PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2242:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243:   mmdata = (MatMatCusparse*)C->product->data;
2244:   A = product->A;
2245:   B = product->B;
2246:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2247:     mmdata->reusesym = PETSC_FALSE;
2248:     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249:     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250:     Cmat = Ccusp->mat;
2251:     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252:     Ccsr = (CsrMatrix*)Cmat->mat;
2253:     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254:     goto finalize;
2255:   }
2256:   if (!c->nz) goto finalize;
2257:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2258:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2260:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262:   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268:   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269:   MatSeqAIJCUSPARSECopyToGPU(A);
2270:   MatSeqAIJCUSPARSECopyToGPU(B);

2272:   ptype = product->type;
2273:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275:   switch (ptype) {
2276:   case MATPRODUCT_AB:
2277:     Amat = Acusp->mat;
2278:     Bmat = Bcusp->mat;
2279:     break;
2280:   case MATPRODUCT_AtB:
2281:     Amat = Acusp->matTranspose;
2282:     Bmat = Bcusp->mat;
2283:     break;
2284:   case MATPRODUCT_ABt:
2285:     Amat = Acusp->mat;
2286:     Bmat = Bcusp->matTranspose;
2287:     break;
2288:   default:
2289:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290:   }
2291:   Cmat = Ccusp->mat;
2292:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294:   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295:   Acsr = (CsrMatrix*)Amat->mat;
2296:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297:   Ccsr = (CsrMatrix*)Cmat->mat;
2298:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300:   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301:   PetscLogGpuTimeBegin();
2302: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311: #else
2312:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317: #endif
2318:   PetscLogGpuFlops(mmdata->flops);
2319:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320:   PetscLogGpuTimeEnd();
2321:   C->offloadmask = PETSC_OFFLOAD_GPU;
2322: finalize:
2323:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2324:   PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2325:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2326:   PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2327:   c->reallocs         = 0;
2328:   C->info.mallocs    += 0;
2329:   C->info.nz_unneeded = 0;
2330:   C->assembled = C->was_assembled = PETSC_TRUE;
2331:   C->num_ass++;
2332:   return(0);
2333: }

2335: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336: {
2337:   Mat_Product                  *product = C->product;
2338:   Mat                          A,B;
2339:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2340:   Mat_SeqAIJ                   *a,*b,*c;
2341:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2343:   PetscInt                     i,j,m,n,k;
2344:   PetscBool                    flg;
2345:   PetscErrorCode               ierr;
2346:   cusparseStatus_t             stat;
2347:   cudaError_t                  cerr;
2348:   MatProductType               ptype;
2349:   MatMatCusparse               *mmdata;
2350:   PetscLogDouble               flops;
2351:   PetscBool                    biscompressed,ciscompressed;
2352: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353:   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2354:   size_t                       bufSize2;
2355:   cusparseSpMatDescr_t         BmatSpDescr;
2356: #else
2357:   int                          cnz;
2358: #endif

2361:   MatCheckProduct(C,1);
2362:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363:   A    = product->A;
2364:   B    = product->B;
2365:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2366:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2368:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369:   a = (Mat_SeqAIJ*)A->data;
2370:   b = (Mat_SeqAIJ*)B->data;
2371:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

2376:   /* product data */
2377:   PetscNew(&mmdata);
2378:   C->product->data    = mmdata;
2379:   C->product->destroy = MatDestroy_MatMatCusparse;

2381:   MatSeqAIJCUSPARSECopyToGPU(A);
2382:   MatSeqAIJCUSPARSECopyToGPU(B);
2383:   ptype = product->type;
2384:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386:   biscompressed = PETSC_FALSE;
2387:   ciscompressed = PETSC_FALSE;
2388:   switch (ptype) {
2389:   case MATPRODUCT_AB:
2390:     m = A->rmap->n;
2391:     n = B->cmap->n;
2392:     k = A->cmap->n;
2393:     Amat = Acusp->mat;
2394:     Bmat = Bcusp->mat;
2395:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397:     break;
2398:   case MATPRODUCT_AtB:
2399:     m = A->cmap->n;
2400:     n = B->cmap->n;
2401:     k = A->rmap->n;
2402:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2403:     Amat = Acusp->matTranspose;
2404:     Bmat = Bcusp->mat;
2405:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406:     break;
2407:   case MATPRODUCT_ABt:
2408:     m = A->rmap->n;
2409:     n = B->rmap->n;
2410:     k = A->cmap->n;
2411:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2412:     Amat = Acusp->mat;
2413:     Bmat = Bcusp->matTranspose;
2414:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415:     break;
2416:   default:
2417:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418:   }

2420:   /* create cusparse matrix */
2421:   MatSetSizes(C,m,n,m,n);
2422:   MatSetType(C,MATSEQAIJCUSPARSE);
2423:   c     = (Mat_SeqAIJ*)C->data;
2424:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2426:   Ccsr  = new CsrMatrix;

2428:   c->compressedrow.use = ciscompressed;
2429:   if (c->compressedrow.use) { /* if a is in compressed-row format, then c will be too */
2430:     c->compressedrow.nrows = a->compressedrow.nrows;
2431:     PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2432:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2433:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2434:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435:     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436:   } else {
2437:     c->compressedrow.nrows  = 0;
2438:     c->compressedrow.i      = NULL;
2439:     c->compressedrow.rindex = NULL;
2440:     Ccusp->workVector       = NULL;
2441:     Cmat->cprowIndices      = NULL;
2442:   }
2443:   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2444:   Ccusp->mat      = Cmat;
2445:   Ccusp->mat->mat = Ccsr;
2446:   Ccsr->num_rows    = Ccusp->nrows;
2447:   Ccsr->num_cols    = n;
2448:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449:   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450:   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451:   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452:   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453:   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454:   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455:   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456:   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457:   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in various calls when matrices have zero rows/columns! */
2459:     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460:     c->nz = 0;
2461:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462:     Ccsr->values = new THRUSTARRAY(c->nz);
2463:     goto finalizesym;
2464:   }

2466:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468:   Acsr = (CsrMatrix*)Amat->mat;
2469:   if (!biscompressed) {
2470:     Bcsr = (CsrMatrix*)Bmat->mat;
2471: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472:     BmatSpDescr = Bmat->matDescr;
2473: #endif
2474:   } else { /* we need to use row offsets for the full matrix */
2475:     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476:     Bcsr = new CsrMatrix;
2477:     Bcsr->num_rows       = B->rmap->n;
2478:     Bcsr->num_cols       = cBcsr->num_cols;
2479:     Bcsr->num_entries    = cBcsr->num_entries;
2480:     Bcsr->column_indices = cBcsr->column_indices;
2481:     Bcsr->values         = cBcsr->values;
2482:     if (!Bcusp->rowoffsets_gpu) {
2483:       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2484:       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485:       PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2486:     }
2487:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488:     mmdata->Bcsr = Bcsr;
2489: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490:     if (Bcsr->num_rows && Bcsr->num_cols) {
2491:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492:                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493:                                Bcsr->values->data().get(),
2494:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496:     }
2497:     BmatSpDescr = mmdata->matSpBDescr;
2498: #endif
2499:   }
2500:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502:   /* precompute the flop count: for C = A*B each nonzero a(i,j) contributes 2*nnz(B(j,:)) flops; for C = A^T*B row i of A pairs with row i of B, contributing 2*nnz(A(i,:))*nnz(B(i,:)) flops */
2503:   if (ptype == MATPRODUCT_AB) {
2504:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2505:       const PetscInt st = a->i[i];
2506:       const PetscInt en = a->i[i+1];
2507:       for (j=st; j<en; j++) {
2508:         const PetscInt brow = a->j[j];
2509:         flops += 2.*(b->i[brow+1] - b->i[brow]);
2510:       }
2511:     }
2512:   } else if (ptype == MATPRODUCT_AtB) {
2513:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2514:       const PetscInt anzi = a->i[i+1] - a->i[i];
2515:       const PetscInt bnzi = b->i[i+1] - b->i[i];
2516:       flops += (2.*anzi)*bnzi;
2517:     }
2518:   } else { /* TODO */
2519:     flops = 0.;
2520:   }

2522:   mmdata->flops = flops;
2523:   PetscLogGpuTimeBegin();
2524: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527:                            NULL, NULL, NULL,
2528:                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529:                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530:   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531:   /* query how many bytes of external memory the work-estimation step needs */
2532:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535:                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536:   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2538:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541:                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542:   /* query the buffer size, in bytes, needed by the compute step */
2543:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547:   /* Neither the CUSPARSE documentation nor the API is clear here:
2548:      we need both buffers to perform the operations properly!
2549:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550:      it only appears in the workEstimation calls, yet it seems to be needed in compute, so its address
2551:      is probably stored in the descriptor! What a messy API... */
2552:   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553:   /* compute the intermediate product of A * B */
2554:   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558:   /* get matrix C non-zero entries C_nnz1 */
2559:   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560:   c->nz = (PetscInt) C_nnz1;
2561:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2562:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564:   Ccsr->values = new THRUSTARRAY(c->nz);
2565:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568:   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571: #else
2572:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573:   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575:                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576:                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577:                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578:   c->nz = cnz;
2579:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581:   Ccsr->values = new THRUSTARRAY(c->nz);
2582:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2584:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585:   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization alone.
2586:      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for the values, but it seems quite buggy when
2587:      D is NULL, despite the fact that the CUSPARSE documentation claims this is supported! */
2588:   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593: #endif
2594:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595:   PetscLogGpuFlops(mmdata->flops);
2596:   PetscLogGpuTimeEnd();
2597: finalizesym:
2598:   c->singlemalloc = PETSC_FALSE;
2599:   c->free_a       = PETSC_TRUE;
2600:   c->free_ij      = PETSC_TRUE;
2601:   PetscMalloc1(m+1,&c->i);
2602:   PetscMalloc1(c->nz,&c->j);
2603:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604:     PetscInt *d_i = c->i;
2605:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607:     ii   = *Ccsr->row_offsets;
2608:     jj   = *Ccsr->column_indices;
2609:     if (ciscompressed) d_i = c->compressedrow.i;
2610:     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611:     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612:   } else {
2613:     PetscInt *d_i = c->i;
2614:     if (ciscompressed) d_i = c->compressedrow.i;
2615:     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616:     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617:   }
2618:   if (ciscompressed) { /* need to expand host row offsets */
2619:     PetscInt r = 0;
2620:     c->i[0] = 0;
2621:     for (k = 0; k < c->compressedrow.nrows; k++) {
2622:       const PetscInt next = c->compressedrow.rindex[k];
2623:       const PetscInt old = c->compressedrow.i[k];
2624:       for (; r < next; r++) c->i[r+1] = old;
2625:     }
2626:     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627:   }
2628:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2629:   PetscMalloc1(m,&c->ilen);
2630:   PetscMalloc1(m,&c->imax);
2631:   c->maxnz = c->nz;
2632:   c->nonzerorowcnt = 0;
2633:   c->rmax = 0;
2634:   for (k = 0; k < m; k++) {
2635:     const PetscInt nn = c->i[k+1] - c->i[k];
2636:     c->ilen[k] = c->imax[k] = nn;
2637:     c->nonzerorowcnt += (PetscInt)!!nn;
2638:     c->rmax = PetscMax(c->rmax,nn);
2639:   }
2640:   MatMarkDiagonal_SeqAIJ(C);
2641:   PetscMalloc1(c->nz,&c->a);
2642:   Ccsr->num_entries = c->nz;

2644:   C->nonzerostate++;
2645:   PetscLayoutSetUp(C->rmap);
2646:   PetscLayoutSetUp(C->cmap);
2647:   Ccusp->nonzerostate = C->nonzerostate;
2648:   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2649:   C->preallocated  = PETSC_TRUE;
2650:   C->assembled     = PETSC_FALSE;
2651:   C->was_assembled = PETSC_FALSE;
2652:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653:     mmdata->reusesym = PETSC_TRUE;
2654:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2655:   }
2656:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657:   return(0);
2658: }
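
/* A minimal usage sketch (error checking omitted; A and B are assumed to be
   assembled MATSEQAIJCUSPARSE matrices): the convenience wrapper MatMatMult()
   drives both the symbolic and numeric phases implemented above, creating C
   as MATSEQAIJCUSPARSE:

     Mat C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
*/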

2660: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2662: /* handles sparse or dense B */
2663: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664: {
2665:   Mat_Product    *product = mat->product;
2667:   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

2670:   MatCheckProduct(mat,1);
2671:   PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2672:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673:     PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2674:   }
2675:   if (product->type == MATPRODUCT_ABC) {
2676:     Ciscusp = PETSC_FALSE;
2677:     if (!product->C->boundtocpu) {
2678:       PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2679:     }
2680:   }
2681:   if (isdense) {
2682:     switch (product->type) {
2683:     case MATPRODUCT_AB:
2684:     case MATPRODUCT_AtB:
2685:     case MATPRODUCT_ABt:
2686:     case MATPRODUCT_PtAP:
2687:     case MATPRODUCT_RARt:
2688:       if (product->A->boundtocpu) {
2689:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2690:       } else {
2691:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2692:       }
2693:       break;
2694:     case MATPRODUCT_ABC:
2695:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2696:       break;
2697:     default:
2698:       break;
2699:     }
2700:   } else if (Biscusp && Ciscusp) {
2701:     switch (product->type) {
2702:     case MATPRODUCT_AB:
2703:     case MATPRODUCT_AtB:
2704:     case MATPRODUCT_ABt:
2705:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2706:       break;
2707:     case MATPRODUCT_PtAP:
2708:     case MATPRODUCT_RARt:
2709:     case MATPRODUCT_ABC:
2710:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2711:       break;
2712:     default:
2713:       break;
2714:     }
2715:   } else { /* fallback for AIJ */
2716:     MatProductSetFromOptions_SeqAIJ(mat);
2717:   }
2718:   return(0);
2719: }

2721: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2722: {

2726:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2727:   return(0);
2728: }

2730: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2731: {

2735:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2736:   return(0);
2737: }

2739: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2740: {

2744:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2745:   return(0);
2746: }

2748: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2749: {

2753:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2754:   return(0);
2755: }

2757: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2758: {

2762:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2763:   return(0);
2764: }

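/* ScatterAdd: one thread per entry; y[idx[i]] += x[i], i.e. accumulate entry i of the
   short (compressed-row) work vector x into the full-length vector y at row idx[i] */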
2766: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2767: {
2768:   int i = blockIdx.x*blockDim.x + threadIdx.x;
2769:   if (i < n) y[idx[i]] += x[i];
2770: }

2772: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2773: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2774: {
2775:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2776:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2777:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2778:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2779:   PetscErrorCode               ierr;
2780:   cudaError_t                  cerr;
2781:   cusparseStatus_t             stat;
2782:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2783:   PetscBool                    compressed;
2784: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2785:   PetscInt                     nx,ny;
2786: #endif

2789:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian without transpose not supported");
2790:   if (!a->nonzerorowcnt) {
2791:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2792:     else {VecCopy_SeqCUDA(yy,zz);}
2793:     return(0);
2794:   }
2795:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2796:   MatSeqAIJCUSPARSECopyToGPU(A);
2797:   if (!trans) {
2798:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2799:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2800:   } else {
2801:     if (herm || !A->form_explicit_transpose) {
2802:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2803:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804:     } else {
2805:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2806:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2807:     }
2808:   }
2809:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2810:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2812:   try {
2813:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2814:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2815:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

2817:     PetscLogGpuTimeBegin();
2818:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2819:       /* z = A x + beta y.
2820:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2821:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2822:       */
2823:       xptr = xarray;
2824:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2825:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2826:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2827:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2828:           allocated to accommodate different uses. So we get the length info directly from mat.
2829:        */
2830:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2831:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2832:         nx = mat->num_cols;
2833:         ny = mat->num_rows;
2834:       }
2835:      #endif
2836:     } else {
2837:       /* z = A^T x + beta y
2838:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2839:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2840:        */
2841:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2842:       dptr = zarray;
2843:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2844:       if (compressed) { /* Scatter x to work vector */
2845:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2846:         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2847:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2848:                          VecCUDAEqualsReverse());
2849:       }
2850:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2851:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2852:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2853:         nx = mat->num_rows;
2854:         ny = mat->num_cols;
2855:       }
2856:      #endif
2857:     }

2859:     /* csr_spmv does y = alpha op(A) x + beta y */
2860:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2861:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2862:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2863:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2864:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2865:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2866:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2867:                                 matstruct->matDescr,
2868:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2869:                                 matstruct->cuSpMV[opA].vecYDescr,
2870:                                 cusparse_scalartype,
2871:                                 cusparsestruct->spmvAlg,
2872:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2873:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2875:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2876:       } else {
2877:         /* the value pointers of x and y might change between calls, but their shapes stay the same, so we just update the pointers */
2878:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2879:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2880:       }

2882:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2883:                                matstruct->alpha_one,
2884:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2885:                                matstruct->cuSpMV[opA].vecXDescr,
2886:                                beta,
2887:                                matstruct->cuSpMV[opA].vecYDescr,
2888:                                cusparse_scalartype,
2889:                                cusparsestruct->spmvAlg,
2890:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2891:      #else
2892:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2893:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2894:                                mat->num_rows, mat->num_cols,
2895:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2896:                                mat->values->data().get(), mat->row_offsets->data().get(),
2897:                                mat->column_indices->data().get(), xptr, beta,
2898:                                dptr);CHKERRCUSPARSE(stat);
2899:      #endif
2900:     } else {
2901:       if (cusparsestruct->nrows) {
2902:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2903:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2904:        #else
2905:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2906:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2907:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2908:                                  xptr, beta,
2909:                                  dptr);CHKERRCUSPARSE(stat);
2910:        #endif
2911:       }
2912:     }
2913:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2914:     PetscLogGpuTimeEnd();

2916:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2917:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2918:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2919:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2920:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2921:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2922:         }
2923:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2924:         VecSet_SeqCUDA(zz,0);
2925:       }

2927:       /* ScatterAdd the result from the work vector into the full vector when A is compressed */
2928:       if (compressed) {
2929:         PetscLogGpuTimeBegin();
2930:         /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
2931:            and when that event is destroyed at the end of its scope it calls cudaStreamSynchronize() on the stream. One would have to
2932:            store all the events to prevent that, so I just use a plain ScatterAdd kernel instead.
2933:          */
2934:        #if 0
2935:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2936:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2937:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2938:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2939:                          VecCUDAPlusEquals());
2940:        #else
2941:         PetscInt n = matstruct->cprowIndices->size();
2942:         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2943:        #endif
2944:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2945:         PetscLogGpuTimeEnd();
2946:       }
2947:     } else {
2948:       if (yy && yy != zz) {
2949:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2950:       }
2951:     }
2952:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2953:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2954:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
2955:   } catch(char *ex) {
2956:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2957:   }
2958:   if (yy) {
2959:     PetscLogGpuFlops(2.0*a->nz);
2960:   } else {
2961:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2962:   }
2963:   return(0);
2964: }

2966: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2967: {

2971:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2972:   return(0);
2973: }

2975: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2976: {
2977:   PetscErrorCode              ierr;
2978:   PetscSplitCSRDataStructure  *d_mat = NULL;
2980:   if (A->factortype == MAT_FACTOR_NONE) {
2981:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2982:   }
2983:   MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on the GPU - should we even call it?
2984:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2985:   if (d_mat) {
2986:     A->offloadmask = PETSC_OFFLOAD_GPU;
2987:   }

2989:   return(0);
2990: }

2992: /* --------------------------------------------------------------------------------*/
2993: /*@
2994:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2995:    (the default sequential PETSc format). This matrix will ultimately be pushed down
2996:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
2997:    assembly performance the user should preallocate the matrix storage by setting
2998:    the parameter nz (or the array nnz).  By setting these parameters accurately,
2999:    performance during matrix assembly can be increased by more than a factor of 50.

3001:    Collective

3003:    Input Parameters:
3004: +  comm - MPI communicator, set to PETSC_COMM_SELF
3005: .  m - number of rows
3006: .  n - number of columns
3007: .  nz - number of nonzeros per row (same for all rows)
3008: -  nnz - array containing the number of nonzeros in the various rows
3009:          (possibly different for each row) or NULL

3011:    Output Parameter:
3012: .  A - the matrix

3014:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3015:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3016:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

3018:    Notes:
3019:    If nnz is given then nz is ignored

3021:    The AIJ format (also called the Yale sparse matrix format or
3022:    compressed row storage), is fully compatible with standard Fortran 77
3023:    storage.  That is, the stored row and column indices can begin at
3024:    either one (as in Fortran) or zero.  See the users' manual for details.

3026:    Specify the preallocated storage with either nz or nnz (not both).
3027:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3028:    allocation.  For large problems you MUST preallocate memory or you
3029:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3031:    By default, this format uses inodes (identical nodes) when possible, to
3032:    improve numerical efficiency of matrix-vector products and solves. We
3033:    search for consecutive rows with the same nonzero structure, thereby
3034:    reusing matrix information to achieve increased efficiency.
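
   Example usage (a minimal sketch with error checking omitted; the estimate of
   5 nonzeros per row is illustrative only):
.vb
      Mat A;
      MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);
      /* insert entries with MatSetValues(), then assemble */
      MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
      MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve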

3036:    Level: intermediate

3038: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3039: @*/
3040: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3041: {

3045:   MatCreate(comm,A);
3046:   MatSetSizes(*A,m,n,m,n);
3047:   MatSetType(*A,MATSEQAIJCUSPARSE);
3048:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3049:   return(0);
3050: }

3052: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3053: {
3054:   PetscErrorCode              ierr;
3055:   PetscSplitCSRDataStructure  *d_mat = NULL;

3058:   if (A->factortype == MAT_FACTOR_NONE) {
3059:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3060:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3061:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3062:   } else {
3063:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3064:   }
3065:   if (d_mat) {
3066:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
3067:     cudaError_t                err;
3068:     PetscSplitCSRDataStructure h_mat;
3069:     PetscInfo(A,"Have device matrix\n");
3070:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3071:     if (a->compressedrow.use) {
3072:       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3073:     }
3074:     err = cudaFree(d_mat);CHKERRCUDA(err);
3075:   }
3076:   PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3077:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3078:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3079:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3080:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3081:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3082:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3083:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3084:   MatDestroy_SeqAIJ(A);
3085:   return(0);
3086: }

3088: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3089: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3090: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3091: {

3095:   MatDuplicate_SeqAIJ(A,cpvalues,B);
3096:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3097:   return(0);
3098: }

3100: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3101: {
3102:   PetscErrorCode     ierr;
3103:   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3104:   Mat_SeqAIJCUSPARSE *cy;
3105:   Mat_SeqAIJCUSPARSE *cx;
3106:   PetscScalar        *ay;
3107:   const PetscScalar  *ax;
3108:   CsrMatrix          *csry,*csrx;
3109:   cudaError_t        cerr;

3112:   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3113:   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3114:   if (X->ops->axpy != Y->ops->axpy) {
3115:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3116:     MatAXPY_SeqAIJ(Y,a,X,str);
3117:     return(0);
3118:   }
3119:   /* if we are here, it means both matrices are bound to GPU */
3120:   MatSeqAIJCUSPARSECopyToGPU(Y);
3121:   MatSeqAIJCUSPARSECopyToGPU(X);
3122:   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3123:   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3124:   csry = (CsrMatrix*)cy->mat->mat;
3125:   csrx = (CsrMatrix*)cx->mat->mat;
3126:   /* see if we can turn this into a cublas axpy */
3127:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3128:     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3129:     if (eq) {
3130:       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3131:     }
3132:     if (eq) str = SAME_NONZERO_PATTERN;
3133:   }
3134:   /* spgeam is buggy with one column */
3135:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3137:   if (str == SUBSET_NONZERO_PATTERN) {
3138:     cusparseStatus_t stat;
3139:     PetscScalar      b = 1.0;
3140: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3141:     size_t           bufferSize;
3142:     void             *buffer;
3143: #endif

3145:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3146:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3147:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3148: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3149:     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3150:                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3151:                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3152:                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3153:     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3154:     PetscLogGpuTimeBegin();
3155:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3156:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3157:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3158:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3159:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3160:     PetscLogGpuFlops(x->nz + y->nz);
3161:     PetscLogGpuTimeEnd();
3162:     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3163: #else
3164:     PetscLogGpuTimeBegin();
3165:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3166:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3167:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3168:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3169:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3170:     PetscLogGpuFlops(x->nz + y->nz);
3171:     PetscLogGpuTimeEnd();
3172: #endif
3173:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3174:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3175:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3176:     MatSeqAIJInvalidateDiagonal(Y);
3177:   } else if (str == SAME_NONZERO_PATTERN) {
3178:     cublasHandle_t cublasv2handle;
3179:     cublasStatus_t berr;
3180:     PetscBLASInt   one = 1, bnz = 1;

3182:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3183:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3184:     PetscCUBLASGetHandle(&cublasv2handle);
3185:     PetscBLASIntCast(x->nz,&bnz);
3186:     PetscLogGpuTimeBegin();
3187:     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3188:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3189:     PetscLogGpuFlops(2.0*bnz);
3190:     PetscLogGpuTimeEnd();
3191:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3192:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3193:     MatSeqAIJInvalidateDiagonal(Y);
3194:   } else {
3195:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3196:     MatAXPY_SeqAIJ(Y,a,X,str);
3197:   }
3198:   return(0);
3199: }
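
/* A minimal usage sketch (error checking omitted; X and Y are assumed to be
   assembled MATSEQAIJCUSPARSE matrices with identical nonzero patterns): in
   that case the routine above reduces to a single cublasXaxpy over the
   device-resident value arrays:

     MatAXPY(Y,2.0,X,SAME_NONZERO_PATTERN);
*/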

3201: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3202: {
3204:   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3205:   PetscScalar    *ay;
3206:   cudaError_t    cerr;
3207:   cublasHandle_t cublasv2handle;
3208:   cublasStatus_t berr;
3209:   PetscBLASInt   one = 1, bnz = 1;

3212:   MatSeqAIJCUSPARSEGetArray(Y,&ay);
3213:   PetscCUBLASGetHandle(&cublasv2handle);
3214:   PetscBLASIntCast(y->nz,&bnz);
3215:   PetscLogGpuTimeBegin();
3216:   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3217:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3218:   PetscLogGpuFlops(bnz);
3219:   PetscLogGpuTimeEnd();
3220:   MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3221:   MatSeqAIJInvalidateDiagonal(Y);
3222:   return(0);
3223: }
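
/* A minimal usage sketch (error checking omitted; Y is assumed to be an
   assembled MATSEQAIJCUSPARSE matrix): scaling executes as a single
   cublasXscal over the nz device-resident values:

     MatScale(Y,0.5);
*/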

3225: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3226: {
3227:   PetscErrorCode             ierr;
3228:   PetscBool                  both = PETSC_FALSE;
3229:   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

3232:   if (A->factortype == MAT_FACTOR_NONE) {
3233:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3234:     if (spptr->mat) {
3235:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3236:       if (matrix->values) {
3237:         both = PETSC_TRUE;
3238:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3239:       }
3240:     }
3241:     if (spptr->matTranspose) {
3242:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3243:       if (matrix->values) {
3244:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3245:       }
3246:     }
3247:   }
3248:   // the following lines inline the relevant part of MatZeroEntries_SeqAIJ(A)
3249:   PetscArrayzero(a->a,a->i[A->rmap->n]);
3250:   MatSeqAIJInvalidateDiagonal(A);
3251:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3252:   else A->offloadmask = PETSC_OFFLOAD_CPU;

3254:   return(0);
3255: }

3257: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3258: {
3259:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3263:   if (A->factortype != MAT_FACTOR_NONE) return(0);
3264:   if (flg) {
3265:     MatSeqAIJCUSPARSECopyFromGPU(A);

3267:     A->ops->scale                     = MatScale_SeqAIJ;
3268:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3269:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3270:     A->ops->mult                      = MatMult_SeqAIJ;
3271:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3272:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3273:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3274:     A->ops->multhermitiantranspose    = NULL;
3275:     A->ops->multhermitiantransposeadd = NULL;
3276:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3277:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3278:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3279:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3280:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3281:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3282:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3283:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3284:   } else {
3285:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3286:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3287:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3288:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3289:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3290:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3291:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3292:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3293:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3294:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3295:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3296:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3297:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3298:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3299:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3300:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3301:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3302:   }
3303:   A->boundtocpu = flg;
3304:   a->inode.use = flg;
3305:   return(0);
3306: }
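
/* Usage sketch: MatBindToCPU() toggles the function table installed above, so a bound matrix
   runs the plain SeqAIJ kernels until it is unbound again (error checking omitted):

     MatBindToCPU(A,PETSC_TRUE);   // MatMult, MatScale, ... now execute on the CPU
     MatBindToCPU(A,PETSC_FALSE);  // restore the CUSPARSE implementations
*/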

3308: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3309: {
3310:   PetscErrorCode   ierr;
3311:   cusparseStatus_t stat;
3312:   Mat              B;

3315:   PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3316:   if (reuse == MAT_INITIAL_MATRIX) {
3317:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
3318:   } else if (reuse == MAT_REUSE_MATRIX) {
3319:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3320:   }
3321:   B = *newmat;

3323:   PetscFree(B->defaultvectype);
3324:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

3326:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3327:     if (B->factortype == MAT_FACTOR_NONE) {
3328:       Mat_SeqAIJCUSPARSE *spptr;
3329:       PetscNew(&spptr);
3330:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3331:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3332:       spptr->format     = MAT_CUSPARSE_CSR;
3333:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3334:       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3335:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, since we only support a column-major dense matrix B */
3336:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3337:      #endif
3338:       B->spptr = spptr;
3339:     } else {
3340:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3342:       PetscNew(&spptr);
3343:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3344:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3345:       B->spptr = spptr;
3346:     }
3347:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3348:   }
3349:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3350:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3351:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3352:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3353:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3354:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3356:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3357:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3358:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3359:   return(0);
3360: }

3362: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3363: {

3367:   MatCreate_SeqAIJ(B);
3368:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3369:   return(0);
3370: }

3372: /*MC
3373:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3375:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3376:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported as of CUDA 11.
3377:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

3379:    Options Database Keys:
3380: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3381: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3382: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

3384:   Level: beginner

3386: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3387: M*/
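
/* Usage sketch for the type documented above (error checking omitted; n and nz are assumed
   to be set by the caller). Equivalent to passing -mat_type aijcusparse on the command line:

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,n,n,n,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,nz,NULL);
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_CSR); // optional; CSR is the default
*/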

3389: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

3391: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3392: {

3396:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3397:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3398:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3399:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3400:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);

3402:   return(0);
3403: }
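
/* With the registrations above in place, the factorizations can be requested at run time,
   for example (a sketch; the usual -pc_type option selects the factorization kind):

     -pc_type ilu -pc_factor_mat_solver_type cusparse       // ILU on a MATSEQAIJCUSPARSE matrix
     -pc_type lu  -pc_factor_mat_solver_type cusparseband   // band LU on a MATSEQAIJ matrix

   or programmatically via PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). */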

3405: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3406: {
3407:   PetscErrorCode   ierr;
3408:   cusparseStatus_t stat;

3411:   if (*cusparsestruct) {
3412:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3413:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3414:     delete (*cusparsestruct)->workVector;
3415:     delete (*cusparsestruct)->rowoffsets_gpu;
3416:     delete (*cusparsestruct)->cooPerm;
3417:     delete (*cusparsestruct)->cooPerm_a;
3418:     delete (*cusparsestruct)->csr2csc_i;
3419:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3420:     PetscFree(*cusparsestruct);
3421:   }
3422:   return(0);
3423: }

3425: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3426: {
3428:   if (*mat) {
3429:     delete (*mat)->values;
3430:     delete (*mat)->column_indices;
3431:     delete (*mat)->row_offsets;
3432:     delete *mat;
3433:     *mat = 0;
3434:   }
3435:   return(0);
3436: }

3438: static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3439: {
3440:   cusparseStatus_t stat;
3441:   PetscErrorCode   ierr;

3444:   if (*trifactor) {
3445:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3446:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3447:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3448:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3449:     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3450:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3451:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3452:    #endif
3453:     PetscFree(*trifactor);
3454:   }
3455:   return(0);
3456: }

3458: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3459: {
3460:   CsrMatrix        *mat;
3461:   cusparseStatus_t stat;
3462:   cudaError_t      err;

3465:   if (*matstruct) {
3466:     if ((*matstruct)->mat) {
3467:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3468:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3469:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3470:        #else
3471:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3472:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3473:        #endif
3474:       } else {
3475:         mat = (CsrMatrix*)(*matstruct)->mat;
3476:         CsrMatrix_Destroy(&mat);
3477:       }
3478:     }
3479:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3480:     delete (*matstruct)->cprowIndices;
3481:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3482:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3483:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

3485:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3486:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3487:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3488:     for (int i=0; i<3; i++) {
3489:       if (mdata->cuSpMV[i].initialized) {
3490:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3491:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3492:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3493:       }
3494:     }
3495:    #endif
3496:     delete *matstruct;
3497:     *matstruct = NULL;
3498:   }
3499:   return(0);
3500: }

3502: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3503: {

3507:   if (*trifactors) {
3508:     MatSeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3509:     MatSeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3510:     MatSeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3511:     MatSeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3512:     delete (*trifactors)->rpermIndices;
3513:     delete (*trifactors)->cpermIndices;
3514:     delete (*trifactors)->workVector;
3515:     (*trifactors)->rpermIndices = NULL;
3516:     (*trifactors)->cpermIndices = NULL;
3517:     (*trifactors)->workVector = NULL;
3518:     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3519:     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3520:   }
3521:   return(0);
3522: }

3524: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3525: {
3526:   PetscErrorCode   ierr;
3527:   cusparseHandle_t handle;
3528:   cusparseStatus_t stat;

3531:   if (*trifactors) {
3532:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3533:     if ((handle = (*trifactors)->handle)) { /* assignment intended */
3534:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3535:     }
3536:     PetscFree(*trifactors);
3537:   }
3538:   return(0);
3539: }

3541: struct IJCompare
3542: {
3543:   __host__ __device__
3544:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3545:   {
3546:     if (t1.get<0>() < t2.get<0>()) return true;
3547:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3548:     return false;
3549:   }
3550: };

3552: struct IJEqual
3553: {
3554:   __host__ __device__
3555:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3556:   {
3557:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3558:     return true;
3559:   }
3560: };

3562: struct IJDiff
3563: {
3564:   __host__ __device__
3565:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3566:   {
3567:     return t1 == t2 ? 0 : 1;
3568:   }
3569: };

3571: struct IJSum
3572: {
3573:   __host__ __device__
3574:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3575:   {
3576:     return t1||t2;
3577:   }
3578: };
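
/* How the four functors above cooperate in the COO assembly below (a worked example):
   after sort_by_key with IJCompare, suppose the (i,j) keys are (0,1) (0,1) (1,0).
   adjacent_difference with IJDiff gives the masks i: 0 0 1 and j: 0 0 1 (first entries
   reset to 0), IJSum ORs them into 0 0 1, and an inclusive scan produces the segment
   ids 0 0 1; reduce_by_key then sums the two duplicate (0,1) values into one slot. */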

3580: #include <thrust/iterator/discard_iterator.h>
3581: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3582: {
3583:   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3584:   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3585:   THRUSTARRAY                           *cooPerm_v = NULL;
3586:   thrust::device_ptr<const PetscScalar> d_v;
3587:   CsrMatrix                             *matrix;
3588:   PetscErrorCode                        ierr;
3589:   cudaError_t                           cerr;
3590:   PetscInt                              n;

3593:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3594:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3595:   if (!cusp->cooPerm) {
3596:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3597:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3598:     return(0);
3599:   }
3600:   matrix = (CsrMatrix*)cusp->mat->mat;
3601:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3602:   if (!v) {
3603:     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3604:     goto finalize;
3605:   }
3606:   n = cusp->cooPerm->size();
3607:   if (isCudaMem(v)) {
3608:     d_v = thrust::device_pointer_cast(v);
3609:   } else {
3610:     cooPerm_v = new THRUSTARRAY(n);
3611:     cooPerm_v->assign(v,v+n);
3612:     d_v = cooPerm_v->data();
3613:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
3614:   }
3615:   PetscLogGpuTimeBegin();
3616:   if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing entries */
3617:     if (cusp->cooPerm_a) {
3618:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3619:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3620:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3621:       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3622:       delete cooPerm_w;
3623:     } else {
3624:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3625:                                                                 matrix->values->begin()));
3626:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3627:                                                                 matrix->values->end()));
3628:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3629:     }
3630:   } else {
3631:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3632:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3633:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3634:     } else {
3635:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3636:                                                                 matrix->values->begin()));
3637:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3638:                                                                 matrix->values->end()));
3639:       thrust::for_each(zibit,zieit,VecCUDAEquals());
3640:     }
3641:   }
3642:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3643:   PetscLogGpuTimeEnd();
3644: finalize:
3645:   delete cooPerm_v;
3646:   A->offloadmask = PETSC_OFFLOAD_GPU;
3647:   PetscObjectStateIncrease((PetscObject)A);
3648:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3649:   PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3650:   PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3651:   PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3652:   a->reallocs         = 0;
3653:   A->info.mallocs    += 0;
3654:   A->info.nz_unneeded = 0;
3655:   A->assembled = A->was_assembled = PETSC_TRUE;
3656:   A->num_ass++;
3657:   return(0);
3658: }

3660: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3661: {
3662:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3663:   PetscErrorCode     ierr;

3667:   if (!cusp) return(0);
3668:   if (destroy) {
3669:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3670:     delete cusp->csr2csc_i;
3671:     cusp->csr2csc_i = NULL;
3672:   }
3673:   A->transupdated = PETSC_FALSE;
3674:   return(0);
3675: }

3677: #include <thrust/binary_search.h>
3678: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3679: {
3680:   PetscErrorCode     ierr;
3681:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3682:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3683:   PetscInt           cooPerm_n, nzr = 0;
3684:   cudaError_t        cerr;

3687:   PetscLayoutSetUp(A->rmap);
3688:   PetscLayoutSetUp(A->cmap);
3689:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3690:   if (n != cooPerm_n) {
3691:     delete cusp->cooPerm;
3692:     delete cusp->cooPerm_a;
3693:     cusp->cooPerm = NULL;
3694:     cusp->cooPerm_a = NULL;
3695:   }
3696:   if (n) {
3697:     THRUSTINTARRAY d_i(n);
3698:     THRUSTINTARRAY d_j(n);
3699:     THRUSTINTARRAY ii(A->rmap->n);

3701:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3702:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

3704:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3705:     d_i.assign(coo_i,coo_i+n);
3706:     d_j.assign(coo_j,coo_j+n);
3707:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3708:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3710:     PetscLogGpuTimeBegin();
3711:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3712:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3713:     *cusp->cooPerm_a = d_i;
3714:     THRUSTINTARRAY w = d_j;

3716:     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3717:     if (nekey == ekey) { /* all entries are unique */
3718:       delete cusp->cooPerm_a;
3719:       cusp->cooPerm_a = NULL;
3720:     } else { /* I couldn't come up with a more elegant algorithm */
3721:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3722:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3723:       (*cusp->cooPerm_a)[0] = 0;
3724:       w[0] = 0;
3725:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3726:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3727:     }
3728:     thrust::counting_iterator<PetscInt> search_begin(0);
3729:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3730:                         search_begin, search_begin + A->rmap->n,
3731:                         ii.begin());
3732:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3733:     PetscLogGpuTimeEnd();

3735:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3736:     a->singlemalloc = PETSC_FALSE;
3737:     a->free_a       = PETSC_TRUE;
3738:     a->free_ij      = PETSC_TRUE;
3739:     PetscMalloc1(A->rmap->n+1,&a->i);
3740:     a->i[0] = 0;
3741:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3742:     a->nz = a->maxnz = a->i[A->rmap->n];
3743:     a->rmax = 0;
3744:     PetscMalloc1(a->nz,&a->a);
3745:     PetscMalloc1(a->nz,&a->j);
3746:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3747:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3748:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3749:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3750:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3751:       nzr += (PetscInt)!!(nnzr);
3752:       a->ilen[i] = a->imax[i] = nnzr;
3753:       a->rmax = PetscMax(a->rmax,nnzr);
3754:     }
3755:     a->nonzerorowcnt = nzr;
3756:     A->preallocated = PETSC_TRUE;
3757:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3758:     MatMarkDiagonal_SeqAIJ(A);
3759:   } else {
3760:     MatSeqAIJSetPreallocation(A,0,NULL);
3761:   }
3762:   MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);

3764:   /* We want to allocate the CUSPARSE struct for matvec now.
3765:      The code is so convoluted now that I prefer to copy zeros */
3766:   PetscArrayzero(a->a,a->nz);
3767:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3768:   A->offloadmask = PETSC_OFFLOAD_CPU;
3769:   A->nonzerostate++;
3770:   MatSeqAIJCUSPARSECopyToGPU(A);
3771:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);

3773:   A->assembled = PETSC_FALSE;
3774:   A->was_assembled = PETSC_FALSE;
3775:   return(0);
3776: }
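
/* Usage sketch of the COO assembly pair implemented in this file (error checking omitted).
   Repeated (i,j) entries are legal and are summed, which is what cooPerm_a encodes:

     PetscInt    coo_i[] = {0,0,1};
     PetscInt    coo_j[] = {0,0,1};              // (0,0) listed twice
     PetscScalar v[]     = {1.0,2.0,3.0};
     MatSetPreallocationCOO(A,3,coo_i,coo_j);    // builds the pattern and cooPerm
     MatSetValuesCOO(A,v,INSERT_VALUES);         // A(0,0) = 1.0+2.0 = 3.0, A(1,1) = 3.0
*/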

3778: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3779: {
3780:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3781:   CsrMatrix          *csr;
3782:   PetscErrorCode     ierr;

3788:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3789:   MatSeqAIJCUSPARSECopyToGPU(A);
3790:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3791:   csr = (CsrMatrix*)cusp->mat->mat;
3792:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3793:   *a = csr->values->data().get();
3794:   return(0);
3795: }

3797: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3798: {
3803:   *a = NULL;
3804:   return(0);
3805: }
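
/* Access pattern sketch for the accessors defined here: the returned pointer is device
   memory (the CSR values array), so it may be passed to kernels but not dereferenced on
   the host, and every Get must be paired with the matching Restore:

     const PetscScalar *av;
     MatSeqAIJCUSPARSEGetArrayRead(A,&av);
     // ... launch CUDA kernels that read av ...
     MatSeqAIJCUSPARSERestoreArrayRead(A,&av);   // av is set back to NULL

   The Write and read/write variants below follow the same pattern but additionally mark the
   GPU copy as modified (offloadmask update and transpose invalidation). */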

3807: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3808: {
3809:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3810:   CsrMatrix          *csr;
3811:   PetscErrorCode     ierr;

3817:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3818:   MatSeqAIJCUSPARSECopyToGPU(A);
3819:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3820:   csr = (CsrMatrix*)cusp->mat->mat;
3821:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3822:   *a = csr->values->data().get();
3823:   A->offloadmask = PETSC_OFFLOAD_GPU;
3824:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3825:   return(0);
3826: }

3828: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3829: {

3836:   PetscObjectStateIncrease((PetscObject)A);
3837:   *a = NULL;
3838:   return(0);
3839: }

3841: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3842: {
3843:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3844:   CsrMatrix          *csr;
3845:   PetscErrorCode     ierr;

3851:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3852:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3853:   csr = (CsrMatrix*)cusp->mat->mat;
3854:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3855:   *a = csr->values->data().get();
3856:   A->offloadmask = PETSC_OFFLOAD_GPU;
3857:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3858:   return(0);
3859: }

3861: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3862: {

3869:   PetscObjectStateIncrease((PetscObject)A);
3870:   *a = NULL;
3871:   return(0);
3872: }

3874: struct IJCompare4
3875: {
3876:   __host__ __device__
3877:   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3878:   {
3879:     if (t1.get<0>() < t2.get<0>()) return true;
3880:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3881:     return false;
3882:   }
3883: };

3885: struct Shift
3886: {
3887:   int _shift;

3889:   Shift(int shift) : _shift(shift) {}
3890:   __host__ __device__
3891:   inline int operator() (const int &c)
3892:   {
3893:     return c + _shift;
3894:   }
3895: };

3897: /* merges two SeqAIJCUSPARSE matrices side by side as C = [A,B], i.e. the [A';B']' operation in MATLAB notation */
3898: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3899: {
3900:   PetscErrorCode               ierr;
3901:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3902:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3903:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3904:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3905:   PetscInt                     Annz,Bnnz;
3906:   cusparseStatus_t             stat;
3907:   PetscInt                     i,m,n,zero = 0;
3908:   cudaError_t                  cerr;

3916:   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3917:   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3918:   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3919:   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3920:   if (reuse == MAT_INITIAL_MATRIX) {
3921:     m     = A->rmap->n;
3922:     n     = A->cmap->n + B->cmap->n;
3923:     MatCreate(PETSC_COMM_SELF,C);
3924:     MatSetSizes(*C,m,n,m,n);
3925:     MatSetType(*C,MATSEQAIJCUSPARSE);
3926:     c     = (Mat_SeqAIJ*)(*C)->data;
3927:     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3928:     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3929:     Ccsr  = new CsrMatrix;
3930:     Cmat->cprowIndices      = NULL;
3931:     c->compressedrow.use    = PETSC_FALSE;
3932:     c->compressedrow.nrows  = 0;
3933:     c->compressedrow.i      = NULL;
3934:     c->compressedrow.rindex = NULL;
3935:     Ccusp->workVector       = NULL;
3936:     Ccusp->nrows    = m;
3937:     Ccusp->mat      = Cmat;
3938:     Ccusp->mat->mat = Ccsr;
3939:     Ccsr->num_rows  = m;
3940:     Ccsr->num_cols  = n;
3941:     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3942:     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3943:     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3944:     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3945:     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3946:     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3947:     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3948:     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3949:     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3950:     MatSeqAIJCUSPARSECopyToGPU(A);
3951:     MatSeqAIJCUSPARSECopyToGPU(B);
3952:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
3953:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
3954:     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3955:     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

3957:     Acsr = (CsrMatrix*)Acusp->mat->mat;
3958:     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3959:     Annz = (PetscInt)Acsr->column_indices->size();
3960:     Bnnz = (PetscInt)Bcsr->column_indices->size();
3961:     c->nz = Annz + Bnnz;
3962:     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3963:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3964:     Ccsr->values = new THRUSTARRAY(c->nz);
3965:     Ccsr->num_entries = c->nz;
3966:     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3967:     if (c->nz) {
3968:       auto Acoo = new THRUSTINTARRAY32(Annz);
3969:       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3970:       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3971:       THRUSTINTARRAY32 *Aroff,*Broff;

3973:       if (a->compressedrow.use) { /* need full row offset */
3974:         if (!Acusp->rowoffsets_gpu) {
3975:           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3976:           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3977:           PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
3978:         }
3979:         Aroff = Acusp->rowoffsets_gpu;
3980:       } else Aroff = Acsr->row_offsets;
3981:       if (b->compressedrow.use) { /* need full row offset */
3982:         if (!Bcusp->rowoffsets_gpu) {
3983:           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3984:           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3985:           PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
3986:         }
3987:         Broff = Bcusp->rowoffsets_gpu;
3988:       } else Broff = Bcsr->row_offsets;
3989:       PetscLogGpuTimeBegin();
3990:       stat = cusparseXcsr2coo(Acusp->handle,
3991:                               Aroff->data().get(),
3992:                               Annz,
3993:                               m,
3994:                               Acoo->data().get(),
3995:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3996:       stat = cusparseXcsr2coo(Bcusp->handle,
3997:                               Broff->data().get(),
3998:                               Bnnz,
3999:                               m,
4000:                               Bcoo->data().get(),
4001:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4002:       /* Issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4003:       auto Aperm = thrust::make_constant_iterator(1);
4004:       auto Bperm = thrust::make_constant_iterator(0);
4005: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4006:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4007:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4008: #else
4009:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4010:       auto Bcib = Bcsr->column_indices->begin();
4011:       auto Bcie = Bcsr->column_indices->end();
4012:       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4013: #endif
4014:       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4015:       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4016:       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4017:       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4018:       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4019:       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4020:       auto p1 = Ccusp->cooPerm->begin();
4021:       auto p2 = Ccusp->cooPerm->begin();
4022:       thrust::advance(p2,Annz);
4023:       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4024: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4025:       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4026: #endif
4027:       auto cci = thrust::make_counting_iterator(zero);
4028:       auto cce = thrust::make_counting_iterator(c->nz);
4029: #if 0 // Errors on SUMMIT with CUDA 11.1.0
4030:       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4031: #else
4032:       auto pred = thrust::identity<int>();
4033:       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4034:       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4035: #endif
4036:       stat = cusparseXcoo2csr(Ccusp->handle,
4037:                               Ccoo->data().get(),
4038:                               c->nz,
4039:                               m,
4040:                               Ccsr->row_offsets->data().get(),
4041:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4042:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4043:       PetscLogGpuTimeEnd();
4044:       delete wPerm;
4045:       delete Acoo;
4046:       delete Bcoo;
4047:       delete Ccoo;
4048: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4049:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4050:                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4051:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4052:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4053: #endif
4054:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4055:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4056:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4057:         CsrMatrix *CcsrT = new CsrMatrix;
4058:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4059:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

4061:         (*C)->form_explicit_transpose = PETSC_TRUE;
4062:         (*C)->transupdated = PETSC_TRUE;
4063:         Ccusp->rowoffsets_gpu = NULL;
4064:         CmatT->cprowIndices = NULL;
4065:         CmatT->mat = CcsrT;
4066:         CcsrT->num_rows = n;
4067:         CcsrT->num_cols = m;
4068:         CcsrT->num_entries = c->nz;

4070:         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4071:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4072:         CcsrT->values = new THRUSTARRAY(c->nz);

4074:         PetscLogGpuTimeBegin();
4075:         auto rT = CcsrT->row_offsets->begin();
4076:         if (AT) {
4077:           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4078:           thrust::advance(rT,-1);
4079:         }
4080:         if (BT) {
4081:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4082:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4083:           thrust::copy(titb,tite,rT);
4084:         }
4085:         auto cT = CcsrT->column_indices->begin();
4086:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4087:         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4088:         auto vT = CcsrT->values->begin();
4089:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4090:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4091:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4092:         PetscLogGpuTimeEnd();

4094:         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4095:         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4096:         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4097:         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4098:         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4099:         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4100:         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4101:         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4102:         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4103: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4104:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4105:                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4106:                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4107:                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4108: #endif
4109:         Ccusp->matTranspose = CmatT;
4110:       }
4111:     }

4113:     c->singlemalloc = PETSC_FALSE;
4114:     c->free_a       = PETSC_TRUE;
4115:     c->free_ij      = PETSC_TRUE;
4116:     PetscMalloc1(m+1,&c->i);
4117:     PetscMalloc1(c->nz,&c->j);
4118:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4119:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4120:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4121:       ii   = *Ccsr->row_offsets;
4122:       jj   = *Ccsr->column_indices;
4123:       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4124:       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4125:     } else {
4126:       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4127:       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4128:     }
4129:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4130:     PetscMalloc1(m,&c->ilen);
4131:     PetscMalloc1(m,&c->imax);
4132:     c->maxnz = c->nz;
4133:     c->nonzerorowcnt = 0;
4134:     c->rmax = 0;
4135:     for (i = 0; i < m; i++) {
4136:       const PetscInt nn = c->i[i+1] - c->i[i];
4137:       c->ilen[i] = c->imax[i] = nn;
4138:       c->nonzerorowcnt += (PetscInt)!!nn;
4139:       c->rmax = PetscMax(c->rmax,nn);
4140:     }
4141:     MatMarkDiagonal_SeqAIJ(*C);
4142:     PetscMalloc1(c->nz,&c->a);
4143:     (*C)->nonzerostate++;
4144:     PetscLayoutSetUp((*C)->rmap);
4145:     PetscLayoutSetUp((*C)->cmap);
4146:     Ccusp->nonzerostate = (*C)->nonzerostate;
4147:     (*C)->preallocated  = PETSC_TRUE;
4148:   } else {
4149:     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4150:     c = (Mat_SeqAIJ*)(*C)->data;
4151:     if (c->nz) {
4152:       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4153:       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4154:       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4155:       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4156:       MatSeqAIJCUSPARSECopyToGPU(A);
4157:       MatSeqAIJCUSPARSECopyToGPU(B);
4158:       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4159:       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4160:       Acsr = (CsrMatrix*)Acusp->mat->mat;
4161:       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4162:       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4163:       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4164:       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4165:       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4166:       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4167:       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4168:       auto pmid = Ccusp->cooPerm->begin();
4169:       thrust::advance(pmid,Acsr->num_entries);
4170:       PetscLogGpuTimeBegin();
4171:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4172:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4173:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4174:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4175:       thrust::for_each(zibait,zieait,VecCUDAEquals());
4176:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4177:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4178:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4179:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4180:       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4181:       MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4182:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4183:         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4184:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4185:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4186:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4187:         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4188:         auto vT = CcsrT->values->begin();
4189:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4190:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4191:         (*C)->transupdated = PETSC_TRUE;
4192:       }
4193:       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4194:       PetscLogGpuTimeEnd();
4195:     }
4196:   }
4197:   PetscObjectStateIncrease((PetscObject)*C);
4198:   (*C)->assembled     = PETSC_TRUE;
4199:   (*C)->was_assembled = PETSC_FALSE;
4200:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4201:   return(0);
4202: }
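
/* Semantics of the routine above (a sketch): for A of size m x nA and B of size m x nB it
   builds C = [A B] of size m x (nA+nB) entirely on the GPU. cooPerm records where each
   entry of A and B landed inside C, so a MAT_REUSE_MATRIX call only permutes the values. */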

4204: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4205: {
4206:   PetscErrorCode    ierr;
4207:   bool              dmem;
4208:   const PetscScalar *av;
4209:   cudaError_t       cerr;

4212:   dmem = isCudaMem(v);
4213:   MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4214:   if (n && idx) {
4215:     THRUSTINTARRAY widx(n);
4216:     widx.assign(idx,idx+n);
4217:     PetscLogCpuToGpu(n*sizeof(PetscInt));

4219:     THRUSTARRAY *w = NULL;
4220:     thrust::device_ptr<PetscScalar> dv;
4221:     if (dmem) {
4222:       dv = thrust::device_pointer_cast(v);
4223:     } else {
4224:       w = new THRUSTARRAY(n);
4225:       dv = w->data();
4226:     }
4227:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4229:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4230:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4231:     thrust::for_each(zibit,zieit,VecCUDAEquals());
4232:     if (w) {
4233:       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4234:     }
4235:     delete w;
4236:   } else {
4237:     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4238:   }
4239:   if (!dmem) { PetscLogCpuToGpu(n*sizeof(PetscScalar)); }
4240:   MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4241:   return(0);
4242: }
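
/* Gather semantics of the routine above (a sketch): with idx non-NULL it performs
   v[k] = aa[idx[k]] for k = 0,...,n-1, where aa is the device-side CSR values array;
   v may live on the host or the device (dmem), and with idx == NULL the first n values
   are copied out directly. */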

4244: /*
4245:   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

4247:   requires:
4248:      structurally symmetric matrices (this restriction could be lifted with transpose/column metadata)
4249: */

4251: /*
4252:   The GPU LU factor kernel
4253: */
4254: __global__
4255: void __launch_bounds__(1024,1)
4256: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4257: {
4258:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4259:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4260:   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

4262:   // set i (row+1)
4263:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4264:   // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4265:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4266:     if (rowb < end_i && threadIdx.x==0) {
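      /* closed-form prefix sum of the band row lengths (a sketch of the counting argument):
         rows 0..rowb contribute n1L + n2L entries below the diagonal (a triangle until the
         bandwidth is reached, then bw per row), i = rowb+1 diagonal entries, and i*bw upper
         entries minus the "clip" triangle that would extend past column n-1 in the last rows */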
4267:       PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4268:       bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4269:     }
4270:   }
4271: }
4272: // copy AIJ to AIJ_BAND
4273: __global__
4274: void __launch_bounds__(1024,1)
4275: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4276:                                 const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4277:                                 const int bi_csr[], PetscScalar ba_csr[])
4278: {
4279:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4280:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4281:   const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

4283:   // zero B
4284:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
4285:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4286:     if (rowb < end_i) {
4287:       PetscScalar    *batmp = ba_csr + bi_csr[rowb];
4288:       const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4289:       for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4290:         if (j<nzb) {
4291:           batmp[j] = 0;
4292:         }
4293:       }
4294:     }
4295:   }

4297:   // copy A into B with CSR format -- these two loops can be fused
4298:   for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4299:     if (rowb < end_i) {
4300:       const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4301:       const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4302:       const PetscScalar *av    = aa_d + ai_d[rowa];
4303:       PetscScalar       *batmp = ba_csr + bi_csr[rowb];
4304:       /* load in initial (unfactored row) */
4305:       for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4306:         if (j<nza) {
4307:           PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
4308:           PetscScalar vala = av[j];
4309:           batmp[idx] = vala;
4310:         }
4311:       }
4312:     }
4313:   }
4314: }
4315: // print AIJ_BAND
4316: __global__
4317: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4318: {
4319:   // debug
4320:   if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4321:     printf("B (AIJ) n=%d:\n",(int)n);
4322:     for (int rowb=0;rowb<n;rowb++) {
4323:       const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
4324:       const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4325:       for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4326:       printf(" bi=%d\n",bi_csr[rowb+1]);
4327:     }
4328:   }
4329: }
4330: // Band LU kernel ---  ba_csr bi_csr
4331: __global__
4332: void __launch_bounds__(1024,1)
4333: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4334: {
4335:   extern __shared__ PetscInt smemInt[];
4336:   PetscInt        *sm_pkIdx  = &smemInt[0];
4337:   const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4338:   const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
4339:   const PetscInt  start = field*nloc, end = start + nloc;
4340: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4341:   auto g = cooperative_groups::this_grid();
4342: #endif
4343:   // A22 panel update for each row A(1,:) and col A(:,1)
4344:   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4345:     PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
4346:     const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4347:     const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4348:     PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4349:     const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
4350:     const PetscScalar Bdd = *pBdd;
4351:     const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4352:     for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4353:       if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4354:         const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4355:         PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
4356:         *Aid = *Aid/Bdd;
4357:         sm_pkIdx[threadIdx.y] = kIdx;
4358:       }
4359:       __syncthreads(); // synch on threadIdx.x only
4360:       if (idx < nzUd) { /* assuming symmetric structure */
4361:         PetscInt    kIdx = sm_pkIdx[threadIdx.y];
4362:         PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4363:         PetscScalar *Aij =  Aid + 1;
4364:         PetscScalar Lid  = *Aid;
4365:         for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4366:           if (jIdx<nzUd) {
4367:             Aij[jIdx] -= Lid*baUd[jIdx];
4368:           }
4369:         }
4370:       }
4371:     }
4372: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4373:     g.sync();
4374: #else
4375:     __syncthreads();
4376: #endif
4377:   } /* endof for (i=0; i<n; i++) { */
4378: }

4380: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4381: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4382: {
4383:   Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
4384:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4385:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4386:   Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4387:   Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4388:   CsrMatrix                    *matrixA;
4389:   PetscErrorCode               ierr;
4390:   cudaError_t                  cerr;
4391:   const PetscInt               n=A->rmap->n, *ic, *r;
4392:   const int                    *ai_d, *aj_d;
4393:   const PetscScalar            *aa_d;
4394:   PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
4395:   int                          *bi_t = cusparseTriFactors->i_band_d;
4396:   PetscContainer               container;
4397:   int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;

4400:   if (A->rmap->n == 0) {
4401:     return(0);
4402:   }
4403:   // cusparse setup
4404:   if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4405:   matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
4406:   if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4407:   matrixA = (CsrMatrix*)matstructA->mat;
4408:   if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

4410:   // factor: get Nf if available
4411:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4412:   if (container) {
4413:     PetscInt *pNf=NULL;
4414:     PetscContainerGetPointer(container, (void **) &pNf);
4415:     Nf = (*pNf)%1000;
4416:     if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4417:   } else Nf = 1;
4418:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
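  /* "Nf" is an application-supplied hint: the low three decimal digits give the number of
     independent fields (n must be divisible by Nf) and the remaining high digits give the
     number of concurrent SM groups. A caller could attach it with the standard PETSc
     container calls, roughly (illustrative sketch; variable names are hypothetical):

       PetscContainer c;
       PetscInt       *pNf;
       PetscMalloc1(1, &pNf);
       *pNf = 1000*nconcurrent + Nf;               // pack both hints into one integer
       PetscContainerCreate(PETSC_COMM_SELF, &c);
       PetscContainerSetPointer(c, (void*)pNf);
       PetscObjectCompose((PetscObject)A, "Nf", (PetscObject)c);
       PetscContainerDestroy(&c);
  */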

4420:   // get data
4421:   ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4422:   ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4423:   aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
4424:   aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
4425:   r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

4427:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4428:   PetscLogGpuTimeBegin();
4429:   {
4430:     int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
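    // the bw formula above inverts the fill count used at symbolic time,
    //   nz = n + (2*n-1)*bw - bw*bw,
    // via the quadratic formula, bw = ((2*n-1) - sqrt(1 + 4*(n*n - nz)))/2; the
    // PETSC_MACHINE_EPSILON nudge guards the float-to-int truncation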
4431:     int gpuid;
4432:     cudaDeviceProp prop;
4433:     cudaGetDevice(&gpuid);
4434:     cudaGetDeviceProperties(&prop, gpuid);
4435: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4437:     Ni = 1; // no grid-wide sync before CUDA 11, so use a single block row; nconcurrent is ignored
4438: #else
4439:     nsm = prop.multiProcessorCount;
4440:     Ni = nsm/Nf/nconcurrent;
4441: #endif
4442:     team_size = bw/Ni + !!(bw%Ni);
4443:     nVec = PetscMin(bw, 1024/team_size);
4444:     PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);
4445:     {
4446:       dim3 dimBlockTeam(nVec,team_size);
4447:       dim3 dimBlockLeague(Nf,Ni);
4448:       mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4449:       CHECK_LAUNCH_ERROR(); // does a sync
4450: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4451:       void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4452:       cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);
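      /* cudaLaunchCooperativeKernel is what makes the grid-wide g.sync() in
         mat_lu_factor_band legal; it only succeeds on devices that advertise cooperative
         launch and when the whole grid can be co-resident on the GPU. A defensive caller
         might verify support first (illustrative sketch):

           int coop = 0;
           cudaDeviceGetAttribute(&coop, cudaDevAttrCooperativeLaunch, gpuid);
           if (!coop) { ...fall back to the single-block-row path... }
      */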
4453: #else
4454:       mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4455: #endif
4456:       CHECK_LAUNCH_ERROR(); // does a sync
4457: #if defined(PETSC_USE_LOG)
4458:       PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));
4459: #endif
4460:     }
4461:   }
4462:   PetscLogGpuTimeEnd();

4464:   /* determine which version of MatSolve needs to be used; adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
4465:   B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4466:   B->ops->solvetranspose = NULL; // transpose solve not implemented
4467:   B->ops->matsolve = NULL;
4468:   B->ops->matsolvetranspose = NULL;

4470:   return(0);
4471: }

4473: static PetscErrorCode MatrixNfDestroy(void *ptr)
4474: {
4475:   PetscInt *nf = (PetscInt *)ptr;
4476:   PetscErrorCode  ierr;
4478:   PetscFree(nf);
4479:   return(0);
4480: }

4482: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4483: {
4484:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4485:   IS                 isicol;
4486:   PetscErrorCode     ierr;
4487:   cudaError_t        cerr;
4488:   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4489:   PetscScalar        *ba_t;
4490:   int                *bi_t;
4491:   PetscInt           i,n=A->rmap->n,Nf;
4492:   PetscInt           nzBcsr,bwL,bwU;
4493:   PetscBool          missing;
4494:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4495:   PetscContainer               container;

4498:   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4499:   MatMissingDiagonal(A,&missing,&i);
4500:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4501:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing cusparseTriFactors");
4502:   MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
4503:   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");

4505:   // factor: get Nf if available
4506:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4507:   if (container) {
4508:     PetscInt *pNf=NULL;
4509:     PetscContainerGetPointer(container, (void **) &pNf);
4510:     Nf = (*pNf)%1000;
4511:     PetscContainerCreate(PETSC_COMM_SELF, &container);
4512:     PetscMalloc(sizeof(PetscInt), &pNf);
4513:     *pNf = Nf;
4514:     PetscContainerSetPointer(container, (void *)pNf);
4515:     PetscContainerSetUserDestroy(container, MatrixNfDestroy);
4516:     PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
4517:     PetscContainerDestroy(&container);
4518:   } else Nf = 1;
4519:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

4521:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
4522:   ISGetIndices(isicol,&ic);

4524:   MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
4525:   PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
4526:   b    = (Mat_SeqAIJ*)(B)->data;

4528:   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4529:   bwL = bwU = 0;
4530:   for (int rwb=0; rwb<n; rwb++) {
4531:     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4532:     for (int j=0;j<anz;j++) {
4533:       PetscInt colb = ic[ajtmp[j]];
4534:       if (colb<rwa) { // L
4535:         if (rwa-colb > bwL) bwL = rwa-colb;
4536:       } else {
4537:         if (colb-rwa > bwU) bwU = colb-rwa;
4538:       }
4539:     }
4540:   }
4541:   ISRestoreIndices(isicol,&ic);
4542:   /* only structurally symmetric matrices are supported for now; an unsymmetric band might work but is untested */
4543:   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4544:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
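  /* a full band stores 2*bwU+1 entries per row; the two triangular "ears" of
     bwU*(bwU+1)/2 entries each (top-left and bottom-right) fall outside the matrix, so
     nzBcsr = n*(2*bwU+1) - bwU*(bwU+1) = n + (2*n-1)*bwU - bwU*bwU */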
4545:   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4546:   b->maxnz = b->nz = nzBcsr;
4547:   cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4548:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4549:   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
4550:   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4551:   cusparseTriFactors->a_band_d = ba_t;
4552:   cusparseTriFactors->i_band_d = bi_t;
4553:   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4554:   PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
4555:   {
4556:     dim3 dimBlockTeam(1,128);
4557:     dim3 dimBlockLeague(Nf,1);
4558:     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4559:   }
4560:   CHECK_LAUNCH_ERROR(); // does a sync

4562:   // setup data
4563:   if (!cusparseTriFactors->rpermIndices) {
4564:     const PetscInt *r;

4566:     ISGetIndices(isrow,&r);
4567:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4568:     cusparseTriFactors->rpermIndices->assign(r, r+n);
4569:     ISRestoreIndices(isrow,&r);
4570:     PetscLogCpuToGpu(n*sizeof(PetscInt));
4571:   }
4572:   /* inverse column permutation indices */
4573:   if (!cusparseTriFactors->cpermIndices) {
4574:     const PetscInt *c;

4576:     ISGetIndices(isicol,&c);
4577:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4578:     cusparseTriFactors->cpermIndices->assign(c, c+n);
4579:     ISRestoreIndices(isicol,&c);
4580:     PetscLogCpuToGpu(n*sizeof(PetscInt));
4581:   }

4583:   /* put together the new matrix */
4584:   b->free_a       = PETSC_FALSE;
4585:   b->free_ij      = PETSC_FALSE;
4586:   b->singlemalloc = PETSC_FALSE;
4587:   b->ilen = NULL;
4588:   b->imax = NULL;
4589:   b->row  = isrow;
4590:   b->col  = iscol;
4591:   PetscObjectReference((PetscObject)isrow);
4592:   PetscObjectReference((PetscObject)iscol);
4593:   b->icol = isicol;
4594:   PetscMalloc1(n+1,&b->solve_work);

4596:   B->factortype            = MAT_FACTOR_LU;
4597:   B->info.factor_mallocs   = 0;
4598:   B->info.fill_ratio_given = 0;

4600:   if (ai[n]) {
4601:     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4602:   } else {
4603:     B->info.fill_ratio_needed = 0.0;
4604:   }
4605: #if defined(PETSC_USE_INFO)
4606:   if (ai[n] != 0) {
4607:     PetscReal af = B->info.fill_ratio_needed;
4608:     PetscInfo1(A,"Band fill ratio %g\n",(double)af);
4609:   } else {
4610:     PetscInfo(A,"Empty matrix\n");
4611:   }
4612: #endif
4613:   if (a->inode.size) {
4614:     PetscInfo(A,"Warning: using inodes in band solver.\n");
4615:   }
4616:   MatSeqAIJCheckInode_FactorLU(B);
4617:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4618:   B->offloadmask = PETSC_OFFLOAD_GPU;

4620:   return(0);
4621: }

4623: /* Use -pc_factor_mat_solver_type cusparseband */
4624: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4625: {
4627:   *type = MATSOLVERCUSPARSEBAND;
4628:   return(0);
4629: }

4631: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4632: {
4634:   PetscInt       n = A->rmap->n;

4637:   MatCreate(PetscObjectComm((PetscObject)A),B);
4638:   MatSetSizes(*B,n,n,n,n);
4639:   (*B)->factortype = ftype;
4640:   (*B)->useordering = PETSC_TRUE;
4641:   MatSetType(*B,MATSEQAIJCUSPARSE);

4643:   if (ftype == MAT_FACTOR_LU) {
4644:     MatSetBlockSizesFromMats(*B,A,A);
4645:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4646:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4647:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

4649:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
4650:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
4651:   return(0);
4652: }
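/* Typical use is through the options database:
     -mat_type aijcusparse -pc_type lu -pc_factor_mat_solver_type cusparseband
   or programmatically, roughly (illustrative sketch; isrow/iscol would come from
   MatGetOrdering):

     Mat           F;
     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatGetFactor(A, MATSOLVERCUSPARSEBAND, MAT_FACTOR_LU, &F);
     MatLUFactorSymbolic(F, A, isrow, iscol, &info);
     MatLUFactorNumeric(F, A, &info);
     MatSolve(F, b, x);
*/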

4654: #define WARP_SIZE 32
4655: template <typename T>
4656: __forceinline__ __device__
4657: T wreduce(T a)
4658: {
4659:   T b;
4660:   #pragma unroll
4661:   for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4662:     b = __shfl_down_sync(0xffffffff, a, i);
4663:     a += b;
4664:   }
4665:   return a;
4666: }
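// wreduce folds a value across one warp with register shuffles: the offsets 16, 8, 4, 2, 1
// halve the active span each step, so after log2(WARP_SIZE) = 5 steps lane 0 holds the sum
// of all 32 lanes (other lanes hold partial sums); no shared memory or barrier is needed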
4667: // reduce in a block, returns result in thread 0
4668: template <typename T, int BLOCK_SIZE>
4669: __device__
4670: T breduce(T a)
4671: {
4672:   constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4673:   __shared__ double buf[NWARP]; // per-warp partial sums, staged (in double) for the final warp-0 reduction
4674:   int wid = threadIdx.x / WARP_SIZE;
4675:   int laneid = threadIdx.x % WARP_SIZE;
4676:   T b = wreduce<T>(a);
4677:   if (laneid == 0)
4678:     buf[wid] = b;
4679:   __syncthreads();
4680:   if (wid == 0) {
4681:     if (threadIdx.x < NWARP)
4682:       a = buf[threadIdx.x];
4683:     else
4684:       a = 0;
4685:     for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4686:       a += __shfl_down_sync(0xffffffff, a, i);
4687:     }
4688:   }
4689:   return a;
4690: }
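// breduce composes two levels: each warp reduces privately, lane 0 of every warp parks its
// partial sum in buf[], and warp 0 then reduces those NWARP partials. BLOCK_SIZE must be a
// multiple of WARP_SIZE, and only threadIdx.x == 0 ends up holding the full block sum.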


4693: // Band triangular solve kernel --- forward/back substitution on ba_csr
4694: template <int BLOCK_SIZE>
4695: __global__
4696: void __launch_bounds__(256,1)
4697: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4698: {
4699:   const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
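  // layout bookkeeping: each of the Nf fields owns nloc rows; a full band row holds
  // 2*bw+1 entries, so an interior field spans blocknz = (2*bw+1)*nloc values, while the
  // first (and, symmetrically, the last) field is short by one chopped triangular ear of
  // chopnz = bw*(bw+1)/2 entries, giving blocknz_0 = blocknz - chopnz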
4700:   const PetscScalar *pLi;
4701:   const int tid = threadIdx.x;

4703:   /* Next, solve L */
4704:   pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4705:   for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4706:     const PetscInt col = locDD<bw ? start : (glbDD-bw);
4707:     PetscScalar t = 0;
4708:     for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4709:       t += pLi[idx]*x[j];
4710:     }
4711: #if defined(PETSC_USE_COMPLEX)
4712:     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4713:     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4714:     t = tt;
4715: #else
4716:     t = breduce<PetscReal,BLOCK_SIZE>(t);
4717: #endif
4718:     if (threadIdx.x == 0)
4719:       x[glbDD] -= t; // L has a unit diagonal, so no division here
4720:     __syncthreads();
4721:     // inc
4722:     pLi += glbDD-col; // get to diagonal
4723:     if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4724:     else pLi += bw;
4725:     pLi += 1; // skip to next row
4726:     if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4727:   }
4728:   /* Then, solve U */
4729:   pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4730:   if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4731:   for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4732:     const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4733:     PetscScalar t = 0;
4734:     for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4735:       t += pLi[-idx]*x[j];
4736:     }
4737: #if defined(PETSC_USE_COMPLEX)
4738:     PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4739:     PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4740:     t = tt;
4741: #else
4742:     t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
4743: #endif
4744:     pLi -= col-glbDD; // diagonal
4745:     if (threadIdx.x == 0) {
4746:       x[glbDD] -= t;
4747:       x[glbDD] /= pLi[0];
4748:     }
4749:     __syncthreads();
4750:     // inc past L to start of previous U
4751:     pLi -= bw+1;
4752:     if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4753:     if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4754:   }
4755: }
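/*
   The two sweeps above are a standard permuted triangular solve: forward substitution
   L y = P b with the unit-diagonal L, back substitution U z = y with a divide by the U
   diagonal, and finally x = Q z to undo the column permutation. P and Q come from
   rpermIndices/cpermIndices applied in MatSolve_SeqAIJCUSPARSEBAND below; each field
   solves its own nloc x nloc band independently, with the BLOCK_SIZE threads of one
   block cooperating on each row's dot product via breduce.
*/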

4757: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4758: {
4759:   const PetscScalar                     *barray;
4760:   PetscScalar                           *xarray;
4761:   thrust::device_ptr<const PetscScalar> bGPU;
4762:   thrust::device_ptr<PetscScalar>       xGPU;
4763:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4764:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4765:   PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4766:   PetscErrorCode                        ierr;
4767:   cudaError_t                           cerr;
4768:   PetscContainer                        container;

4771:   if (A->rmap->n == 0) {
4772:     return(0);
4773:   }
4774:   // factor: get Nf if available
4775:   PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4776:   if (container) {
4777:     PetscInt *pNf=NULL;
4778:     PetscContainerGetPointer(container, (void **) &pNf);
4779:     Nf = (*pNf)%1000;
4780:   } else Nf = 1;
4781:   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

4783:   /* Get the GPU pointers */
4784:   VecCUDAGetArrayWrite(xx,&xarray);
4785:   VecCUDAGetArrayRead(bb,&barray);
4786:   xGPU = thrust::device_pointer_cast(xarray);
4787:   bGPU = thrust::device_pointer_cast(barray);

4789:   PetscLogGpuTimeBegin();
4790:   /* First, reorder with the row permutation */
4791:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4792:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4793:                tempGPU->begin());
4794:   constexpr int block = 128;
4795:   mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4796:   CHECK_LAUNCH_ERROR(); // does a sync

4798:   /* Last, reorder with the column permutation */
4799:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4800:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4801:                xGPU);

4803:   VecCUDARestoreArrayRead(bb,&barray);
4804:   VecCUDARestoreArrayWrite(xx,&xarray);
4805:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
4806:   PetscLogGpuTimeEnd();
4807:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
4808:   return(0);
4809: }