Actual source code: aijcusparse.cu

petsc-master 2020-12-02
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 19:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
 20:     the order of their 0-based integer values, since we use PetscOptionsEnum() to parse user command-line options for them.

 22:   typedef enum {
 23:       CUSPARSE_MV_ALG_DEFAULT = 0,
 24:       CUSPARSE_COOMV_ALG      = 1,
 25:       CUSPARSE_CSRMV_ALG1     = 2,
 26:       CUSPARSE_CSRMV_ALG2     = 3
 27:   } cusparseSpMVAlg_t;

 29:   typedef enum {
 30:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 31:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 32:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 33:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 34:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 35:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 36:       CUSPARSE_SPMM_COO_ALG1    = 1,
 37:       CUSPARSE_SPMM_COO_ALG2    = 2,
 38:       CUSPARSE_SPMM_COO_ALG3    = 3,
 39:       CUSPARSE_SPMM_COO_ALG4    = 5,
 40:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 41:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 42:   } cusparseSpMMAlg_t;

 44:   typedef enum {
 45:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 46:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 47:   } cusparseCsr2CscAlg_t;
 48:   */
 49:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 50:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 51:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
 52: #endif
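
/* Editorial note (not from the original file): PetscOptionsEnum() returns the 0-based
   position of the string the user selects, so the entries in the arrays above must
   appear in exactly the enum-value order shown in the copied cusparse.h excerpt. For
   example, "-mat_cusparse_spmv_alg csrmv_alg1" selects position 2 of
   MatCUSPARSESpMVAlgorithms[], matching CUSPARSE_CSRMV_ALG1 = 2; the checks in
   MatSetFromOptions_SeqAIJCUSPARSE() below guard this invariant against the installed
   cuSPARSE headers. */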

 54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 67: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 68: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 69: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 71: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 72: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 73: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 75: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 76: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 77: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 78: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 79: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 80: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 82: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 83: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 85: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 86: {
 87:   cusparseStatus_t   stat;
 88:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 91:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
 92:   cusparsestruct->stream = stream;
 93:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 94:   return(0);
 95: }

 97: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 98: {
 99:   cusparseStatus_t   stat;
100:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

103:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
104:   if (cusparsestruct->handle != handle) {
105:     if (cusparsestruct->handle) {
106:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
107:     }
108:     cusparsestruct->handle = handle;
109:   }
110:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
111:   return(0);
112: }

114: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
115: {
116:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
117:   PetscBool          flg;
118:   PetscErrorCode     ierr;

121:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
122:   if (!flg || !cusparsestruct) return(0);
123:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
124:   return(0);
125: }

127: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
128: {
130:   *type = MATSOLVERCUSPARSE;
131:   return(0);
132: }

134: /*MC
135:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
136:   on a single GPU, of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. The currently supported
137:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
138:   performance in the triangular solves. Full LU and Cholesky decompositions can be applied through the
139:   CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
140:   algorithms are not recommended. This class does NOT support direct solver operations.

142:   Level: beginner

144: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145: M*/
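
/* Usage sketch (editorial addition, not part of the original file): one way to select
   this solver for an ILU preconditioner, assuming ksp is an existing KSP operating on
   a matrix of type aijcusparse:

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);

   or, equivalently, from the command line:

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse */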

147: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
148: {
150:   PetscInt       n = A->rmap->n;

153:   MatCreate(PetscObjectComm((PetscObject)A),B);
154:   MatSetSizes(*B,n,n,n,n);
155:   (*B)->factortype = ftype;
156:   (*B)->useordering = PETSC_TRUE;
157:   MatSetType(*B,MATSEQAIJCUSPARSE);

159:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
160:     MatSetBlockSizesFromMats(*B,A,A);
161:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
162:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
163:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
164:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
165:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
166:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

168:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
169:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
170:   return(0);
171: }

173: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
174: {
175:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

178:   switch (op) {
179:   case MAT_CUSPARSE_MULT:
180:     cusparsestruct->format = format;
181:     break;
182:   case MAT_CUSPARSE_ALL:
183:     cusparsestruct->format = format;
184:     break;
185:   default:
186:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
187:   }
188:   return(0);
189: }

191: /*@
192:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
193:    operation. Only the MatMult operation can use different GPU storage formats
194:    for MPIAIJCUSPARSE matrices.
195:    Not Collective

197:    Input Parameters:
198: +  A - Matrix of type SEQAIJCUSPARSE
199: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
200: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

204:    Level: intermediate

206: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
207: @*/
208: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
209: {

214:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
215:   return(0);
216: }
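
/* Usage sketch (editorial addition): switching the SpMV storage format to ELL for an
   assembled matrix A of type MATSEQAIJCUSPARSE:

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);

   The same effect is available at runtime through -mat_cusparse_mult_storage_format ell,
   registered below in MatSetFromOptions_SeqAIJCUSPARSE(). */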

218: /*@
219:    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

221:    Collective on mat

223:    Input Parameters:
224: +  A - Matrix of type SEQAIJCUSPARSE
225: -  transgen - the boolean flag

227:    Level: intermediate

229: .seealso: MATSEQAIJCUSPARSE
230: @*/
231: PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
232: {
234:   PetscBool      flg;

238:   PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
239:   if (flg) {
240:     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

242:     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
243:     cusp->transgen = transgen;
244:     if (!transgen) { /* need to destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
245:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
246:     }
247:   }
248:   return(0);
249: }
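
/* Usage sketch (editorial addition): requesting an explicit transpose so that repeated
   MatMultTranspose() calls can reuse it; assumes A is an assembled MATSEQAIJCUSPARSE matrix:

     MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);

   The equivalent runtime option is -mat_cusparse_transgen (see MatSetFromOptions_SeqAIJCUSPARSE() below). */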

251: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
252: {
253:   PetscErrorCode           ierr;
254:   MatCUSPARSEStorageFormat format;
255:   PetscBool                flg;
256:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

259:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
260:   if (A->factortype == MAT_FACTOR_NONE) {
261:     PetscBool transgen = cusparsestruct->transgen;

263:     PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
264:     if (flg) {MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);}

266:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
267:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
268:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

270:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
271:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
272:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
273:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
274:     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
275:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
276:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
277:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
278:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

280:     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default; it only supports a column-major dense matrix B */
281:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
282:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
283:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

285:     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
286:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
287:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
288:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
289:    #endif
290:   }
291:   PetscOptionsTail();
292:   return(0);
293: }

295: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
296: {
297:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
298:   PetscErrorCode               ierr;

301:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
302:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
303:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
304:   return(0);
305: }

307: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
308: {
309:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
310:   PetscErrorCode               ierr;

313:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
314:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
315:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
316:   return(0);
317: }

319: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320: {
321:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
322:   PetscErrorCode               ierr;

325:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
326:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
327:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
328:   return(0);
329: }

331: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
332: {
333:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
334:   PetscErrorCode               ierr;

337:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
338:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
339:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
340:   return(0);
341: }

343: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
344: {
345:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
346:   PetscInt                          n = A->rmap->n;
347:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
348:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
349:   cusparseStatus_t                  stat;
350:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
351:   const MatScalar                   *aa = a->a,*v;
352:   PetscInt                          *AiLo, *AjLo;
353:   PetscScalar                       *AALo;
354:   PetscInt                          i,nz, nzLower, offset, rowOffset;
355:   PetscErrorCode                    ierr;
356:   cudaError_t                       cerr;

359:   if (!n) return(0);
360:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
361:     try {
362:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
363:       nzLower=n+ai[n]-ai[1];
364:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
365:       if (!loTriFactor) {

367:         /* Allocate Space for the lower triangular matrix */
368:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
369:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

371:         /* Fill the lower triangular matrix */
372:         AiLo[0]  = (PetscInt) 0;
373:         AiLo[n]  = nzLower;
374:         AjLo[0]  = (PetscInt) 0;
375:         AALo[0]  = (MatScalar) 1.0;
376:         v        = aa;
377:         vi       = aj;
378:         offset   = 1;
379:         rowOffset= 1;
380:         for (i=1; i<n; i++) {
381:           nz = ai[i+1] - ai[i];
382:           /* additional 1 for the term on the diagonal */
383:           AiLo[i]    = rowOffset;
384:           rowOffset += nz+1;

386:           PetscArraycpy(&(AjLo[offset]), vi, nz);
387:           PetscArraycpy(&(AALo[offset]), v, nz);

389:           offset      += nz;
390:           AjLo[offset] = (PetscInt) i;
391:           AALo[offset] = (MatScalar) 1.0;
392:           offset      += 1;

394:           v  += nz;
395:           vi += nz;
396:         }

398:         /* allocate space for the triangular factor information */
399:         PetscNew(&loTriFactor);
400:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
401:         /* Create the matrix description */
402:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
403:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
404:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
405:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
406:        #else
407:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
408:        #endif
409:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
410:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

412:         /* set the operation */
413:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

415:         /* set the matrix */
416:         loTriFactor->csrMat = new CsrMatrix;
417:         loTriFactor->csrMat->num_rows = n;
418:         loTriFactor->csrMat->num_cols = n;
419:         loTriFactor->csrMat->num_entries = nzLower;

421:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
422:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

424:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
425:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

427:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
428:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

430:         /* Create the solve analysis information */
431:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
432:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
433:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
435:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
436:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
437:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
438:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
439:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
440:       #endif

442:         /* perform the solve analysis */
443:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
444:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
445:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
446:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
447:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
448:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
449:                                #endif
450:                                 );CHKERRCUSPARSE(stat);
451:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
452:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

454:         /* assign the pointer */
455:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

457:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
458:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
459:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
460:       } else { /* update values only */
461:         /* Fill the lower triangular matrix */
462:         AALo[0]  = 1.0;
463:         v        = aa;
464:         vi       = aj;
465:         offset   = 1;
466:         for (i=1; i<n; i++) {
467:           nz = ai[i+1] - ai[i];
468:           PetscArraycpy(&(AALo[offset]), v, nz);
469:           offset      += nz;
470:           AALo[offset] = 1.0;
471:           offset      += 1;
472:           v  += nz;
473:         }
474:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
475:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
476:       }
477:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
478:     } catch(char *ex) {
479:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
480:     }
481:   }
482:   return(0);
483: }
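
/* Worked example (editorial addition): for n = 3 with one strictly lower entry in row 1
   (column 0, value a10) and one in row 2 (column 1, value a21), the loop above produces
   the unit lower triangular factor in CSR form as

     AiLo = {0, 1, 3, 5}
     AjLo = {0,  0, 1,  1, 2}
     AALo = {1., a10, 1., a21, 1.}

   i.e. each row stores its strictly lower entries followed by an explicit unit diagonal. */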

485: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
486: {
487:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
488:   PetscInt                          n = A->rmap->n;
489:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
491:   cusparseStatus_t                  stat;
492:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
493:   const MatScalar                   *aa = a->a,*v;
494:   PetscInt                          *AiUp, *AjUp;
495:   PetscScalar                       *AAUp;
496:   PetscInt                          i,nz, nzUpper, offset;
497:   PetscErrorCode                    ierr;
498:   cudaError_t                       cerr;

501:   if (!n) return(0);
502:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
503:     try {
504:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
505:       nzUpper = adiag[0]-adiag[n];
506:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
507:       if (!upTriFactor) {
508:         /* Allocate Space for the upper triangular matrix */
509:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
510:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

512:         /* Fill the upper triangular matrix */
513:         AiUp[0]=(PetscInt) 0;
514:         AiUp[n]=nzUpper;
515:         offset = nzUpper;
516:         for (i=n-1; i>=0; i--) {
517:           v  = aa + adiag[i+1] + 1;
518:           vi = aj + adiag[i+1] + 1;

520:           /* number of elements NOT on the diagonal */
521:           nz = adiag[i] - adiag[i+1]-1;

523:           /* decrement the offset */
524:           offset -= (nz+1);

526:           /* first, set the diagonal elements */
527:           AjUp[offset] = (PetscInt) i;
528:           AAUp[offset] = (MatScalar)1./v[nz];
529:           AiUp[i]      = AiUp[i+1] - (nz+1);

531:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
532:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
533:         }

535:         /* allocate space for the triangular factor information */
536:         PetscNew(&upTriFactor);
537:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

539:         /* Create the matrix description */
540:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
541:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
542:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
543:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
544:        #else
545:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
546:        #endif
547:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
548:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

550:         /* set the operation */
551:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

553:         /* set the matrix */
554:         upTriFactor->csrMat = new CsrMatrix;
555:         upTriFactor->csrMat->num_rows = n;
556:         upTriFactor->csrMat->num_cols = n;
557:         upTriFactor->csrMat->num_entries = nzUpper;

559:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
560:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

562:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
563:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

565:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
566:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

568:         /* Create the solve analysis information */
569:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
570:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
571:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
573:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
576:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
577:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
578:       #endif

580:         /* perform the solve analysis */
581:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
582:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
583:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
584:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
585:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
586:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
587:                                #endif
588:                                 );CHKERRCUSPARSE(stat);
589:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
590:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

592:         /* assign the pointer */
593:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

595:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
596:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
597:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
598:       } else {
599:         /* Fill the upper triangular matrix */
600:         offset = nzUpper;
601:         for (i=n-1; i>=0; i--) {
602:           v  = aa + adiag[i+1] + 1;

604:           /* number of elements NOT on the diagonal */
605:           nz = adiag[i] - adiag[i+1]-1;

607:           /* decrement the offset */
608:           offset -= (nz+1);

610:           /* first, set the diagonal elements */
611:           AAUp[offset] = 1./v[nz];
612:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
613:         }
614:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
615:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
616:       }
617:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
618:     } catch(char *ex) {
619:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
620:     }
621:   }
622:   return(0);
623: }

625: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
626: {
627:   PetscErrorCode               ierr;
628:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
629:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
630:   IS                           isrow = a->row,iscol = a->icol;
631:   PetscBool                    row_identity,col_identity;
632:   PetscInt                     n = A->rmap->n;

635:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
636:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
637:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

639:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
640:   cusparseTriFactors->nnz=a->nz;

642:   A->offloadmask = PETSC_OFFLOAD_BOTH;
643:   /* lower triangular indices */
644:   ISIdentity(isrow,&row_identity);
645:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
646:     const PetscInt *r;

648:     ISGetIndices(isrow,&r);
649:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
650:     cusparseTriFactors->rpermIndices->assign(r, r+n);
651:     ISRestoreIndices(isrow,&r);
652:     PetscLogCpuToGpu(n*sizeof(PetscInt));
653:   }

655:   /* upper triangular indices */
656:   ISIdentity(iscol,&col_identity);
657:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
658:     const PetscInt *c;

660:     ISGetIndices(iscol,&c);
661:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
662:     cusparseTriFactors->cpermIndices->assign(c, c+n);
663:     ISRestoreIndices(iscol,&c);
664:     PetscLogCpuToGpu(n*sizeof(PetscInt));
665:   }
666:   return(0);
667: }
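
/* Note (editorial addition): rpermIndices/cpermIndices are uploaded to the GPU once; the
   solve kernels later use them to permute the right-hand side into the factored ordering
   and the solution back out, so a non-identity ordering costs two extra gathers per solve. */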

669: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
670: {
671:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
672:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
673:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
674:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
675:   cusparseStatus_t                  stat;
676:   PetscErrorCode                    ierr;
677:   cudaError_t                       cerr;
678:   PetscInt                          *AiUp, *AjUp;
679:   PetscScalar                       *AAUp;
680:   PetscScalar                       *AALo;
681:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
682:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
683:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
684:   const MatScalar                   *aa = b->a,*v;

687:   if (!n) return(0);
688:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
689:     try {
690:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
691:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
692:       if (!upTriFactor && !loTriFactor) {
693:         /* Allocate Space for the upper triangular matrix */
694:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
695:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

697:         /* Fill the upper triangular matrix */
698:         AiUp[0]=(PetscInt) 0;
699:         AiUp[n]=nzUpper;
700:         offset = 0;
701:         for (i=0; i<n; i++) {
702:           /* set the pointers */
703:           v  = aa + ai[i];
704:           vj = aj + ai[i];
705:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

707:           /* first, set the diagonal elements */
708:           AjUp[offset] = (PetscInt) i;
709:           AAUp[offset] = (MatScalar)1.0/v[nz];
710:           AiUp[i]      = offset;
711:           AALo[offset] = (MatScalar)1.0/v[nz];

713:           offset+=1;
714:           if (nz>0) {
715:             PetscArraycpy(&(AjUp[offset]), vj, nz);
716:             PetscArraycpy(&(AAUp[offset]), v, nz);
717:             for (j=offset; j<offset+nz; j++) {
718:               AAUp[j] = -AAUp[j];
719:               AALo[j] = AAUp[j]/v[nz];
720:             }
721:             offset+=nz;
722:           }
723:         }

725:         /* allocate space for the triangular factor information */
726:         PetscNew(&upTriFactor);
727:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

729:         /* Create the matrix description */
730:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
731:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
732:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
733:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
734:        #else
735:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
736:        #endif
737:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
738:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

740:         /* set the matrix */
741:         upTriFactor->csrMat = new CsrMatrix;
742:         upTriFactor->csrMat->num_rows = A->rmap->n;
743:         upTriFactor->csrMat->num_cols = A->cmap->n;
744:         upTriFactor->csrMat->num_entries = a->nz;

746:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
747:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

749:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
750:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

752:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
753:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

755:         /* set the operation */
756:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

758:         /* Create the solve analysis information */
759:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
760:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
761:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
762:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
763:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
764:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
765:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
766:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
767:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
768:       #endif

770:         /* perform the solve analysis */
771:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
772:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
773:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
774:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
775:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
776:                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
777:                                 #endif
778:                                 );CHKERRCUSPARSE(stat);
779:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
780:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

782:         /* assign the pointer */
783:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

785:         /* allocate space for the triangular factor information */
786:         PetscNew(&loTriFactor);
787:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

789:         /* Create the matrix description */
790:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
791:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
792:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
793:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
794:        #else
795:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
796:        #endif
797:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); /* upper fill on purpose: the lower factor reuses the upper structure and is applied with a transpose solve (see solveOp below) */
798:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

800:         /* set the operation */
801:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

803:         /* set the matrix */
804:         loTriFactor->csrMat = new CsrMatrix;
805:         loTriFactor->csrMat->num_rows = A->rmap->n;
806:         loTriFactor->csrMat->num_cols = A->cmap->n;
807:         loTriFactor->csrMat->num_entries = a->nz;

809:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
810:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

812:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
813:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

815:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
816:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

818:         /* Create the solve analysis information */
819:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
820:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
821:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
822:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
823:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
824:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
825:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
826:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
827:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
828:       #endif

830:         /* perform the solve analysis */
831:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
832:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
833:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
834:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
835:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
836:                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
837:                                 #endif
838:                                 );CHKERRCUSPARSE(stat);
839:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
840:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

842:         /* assign the pointer */
843:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

845:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
846:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
847:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
848:       } else {
849:         /* Fill the upper triangular matrix */
850:         offset = 0;
851:         for (i=0; i<n; i++) {
852:           /* set the pointers */
853:           v  = aa + ai[i];
854:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

856:           /* first, set the diagonal elements */
857:           AAUp[offset] = 1.0/v[nz];
858:           AALo[offset] = 1.0/v[nz];

860:           offset+=1;
861:           if (nz>0) {
862:             PetscArraycpy(&(AAUp[offset]), v, nz);
863:             for (j=offset; j<offset+nz; j++) {
864:               AAUp[j] = -AAUp[j];
865:               AALo[j] = AAUp[j]/v[nz];
866:             }
867:             offset+=nz;
868:           }
869:         }
870:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
871:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
872:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
873:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
874:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
875:       }
876:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
877:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
878:     } catch(char *ex) {
879:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
880:     }
881:   }
882:   return(0);
883: }

885: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
886: {
887:   PetscErrorCode               ierr;
888:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
889:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
890:   IS                           ip = a->row;
891:   PetscBool                    perm_identity;
892:   PetscInt                     n = A->rmap->n;

895:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
896:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
897:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
898:   cusparseTriFactors->nnz=(a->nz-n)*2 + n; /* nonzeros of the full symmetric matrix: both strict triangles plus the diagonal */

900:   A->offloadmask = PETSC_OFFLOAD_BOTH;

902:   /* lower triangular indices */
903:   ISIdentity(ip,&perm_identity);
904:   if (!perm_identity) {
905:     IS             iip;
906:     const PetscInt *irip,*rip;

908:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
909:     ISGetIndices(iip,&irip);
910:     ISGetIndices(ip,&rip);
911:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
912:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
913:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
914:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
915:     ISRestoreIndices(iip,&irip);
916:     ISDestroy(&iip);
917:     ISRestoreIndices(ip,&rip);
918:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
919:   }
920:   return(0);
921: }

923: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
924: {
925:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
926:   IS             isrow = b->row,iscol = b->col;
927:   PetscBool      row_identity,col_identity;

931:   MatLUFactorNumeric_SeqAIJ(B,A,info);
932:   B->offloadmask = PETSC_OFFLOAD_CPU;
933:   /* determine which version of MatSolve needs to be used. */
934:   ISIdentity(isrow,&row_identity);
935:   ISIdentity(iscol,&col_identity);
936:   if (row_identity && col_identity) {
937:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
938:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
939:     B->ops->matsolve = NULL;
940:     B->ops->matsolvetranspose = NULL;
941:   } else {
942:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
943:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
944:     B->ops->matsolve = NULL;
945:     B->ops->matsolvetranspose = NULL;
946:   }

948:   /* get the triangular factors */
949:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
950:   return(0);
951: }

953: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
954: {
955:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
956:   IS             ip = b->row;
957:   PetscBool      perm_identity;

961:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
962:   B->offloadmask = PETSC_OFFLOAD_CPU;
963:   /* determine which version of MatSolve needs to be used. */
964:   ISIdentity(ip,&perm_identity);
965:   if (perm_identity) {
966:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
967:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
968:     B->ops->matsolve = NULL;
969:     B->ops->matsolvetranspose = NULL;
970:   } else {
971:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
972:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
973:     B->ops->matsolve = NULL;
974:     B->ops->matsolvetranspose = NULL;
975:   }

977:   /* get the triangular factors */
978:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
979:   return(0);
980: }

982: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
983: {
984:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
985:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
986:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
987:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
988:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
989:   cusparseStatus_t                  stat;
990:   cusparseIndexBase_t               indexBase;
991:   cusparseMatrixType_t              matrixType;
992:   cusparseFillMode_t                fillMode;
993:   cusparseDiagType_t                diagType;
994:   cudaError_t                       cerr;
995:   PetscErrorCode                    ierr;

998:   /* allocate space for the transpose of the lower triangular factor */
999:   PetscNew(&loTriFactorT);
1000:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1002:   /* set the matrix descriptors of the lower triangular factor */
1003:   matrixType = cusparseGetMatType(loTriFactor->descr);
1004:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1005:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1006:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1007:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

1009:   /* Create the matrix description */
1010:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1011:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1012:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1013:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1014:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1016:   /* set the operation */
1017:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1019:   /* allocate GPU space for the CSC of the lower triangular factor */
1020:   loTriFactorT->csrMat = new CsrMatrix;
1021:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1022:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1023:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1024:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1025:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1026:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1028:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1029: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1030:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1031:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1032:                                        loTriFactor->csrMat->values->data().get(),
1033:                                        loTriFactor->csrMat->row_offsets->data().get(),
1034:                                        loTriFactor->csrMat->column_indices->data().get(),
1035:                                        loTriFactorT->csrMat->values->data().get(),
1036:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1037:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1038:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1039:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1040: #endif

1042:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1043:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1044:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1045:                           loTriFactor->csrMat->values->data().get(),
1046:                           loTriFactor->csrMat->row_offsets->data().get(),
1047:                           loTriFactor->csrMat->column_indices->data().get(),
1048:                           loTriFactorT->csrMat->values->data().get(),
1049:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1050:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1051:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1052:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1053:                         #else
1054:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1055:                           CUSPARSE_ACTION_NUMERIC, indexBase
1056:                         #endif
1057:                         );CHKERRCUSPARSE(stat);
1058:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1059:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1061:   /* Create the solve analysis information */
1062:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1063:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1064: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1065:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1066:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1067:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1068:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1069:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1070:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1071: #endif

1073:   /* perform the solve analysis */
1074:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1075:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1076:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1077:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1078:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1079:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1080:                           #endif
1081:                           );CHKERRCUSPARSE(stat);
1082:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1083:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1085:   /* assign the pointer */
1086:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1088:   /*********************************************/
1089:   /* Now the Transpose of the Upper Tri Factor */
1090:   /*********************************************/

1092:   /* allocate space for the transpose of the upper triangular factor */
1093:   PetscNew(&upTriFactorT);
1094:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1096:   /* set the matrix descriptors of the upper triangular factor */
1097:   matrixType = cusparseGetMatType(upTriFactor->descr);
1098:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1099:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1100:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1101:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1103:   /* Create the matrix description */
1104:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1105:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1106:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1107:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1108:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1110:   /* set the operation */
1111:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1113:   /* allocate GPU space for the CSC of the upper triangular factor */
1114:   upTriFactorT->csrMat = new CsrMatrix;
1115:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1116:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1117:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1118:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1119:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1120:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1122:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1123: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1124:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1125:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1126:                                 upTriFactor->csrMat->values->data().get(),
1127:                                 upTriFactor->csrMat->row_offsets->data().get(),
1128:                                 upTriFactor->csrMat->column_indices->data().get(),
1129:                                 upTriFactorT->csrMat->values->data().get(),
1130:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1131:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1132:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1133:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1134: #endif

1136:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1137:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1138:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1139:                           upTriFactor->csrMat->values->data().get(),
1140:                           upTriFactor->csrMat->row_offsets->data().get(),
1141:                           upTriFactor->csrMat->column_indices->data().get(),
1142:                           upTriFactorT->csrMat->values->data().get(),
1143:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1145:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1146:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1147:                         #else
1148:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1149:                           CUSPARSE_ACTION_NUMERIC, indexBase
1150:                         #endif
1151:                         );CHKERRCUSPARSE(stat);
1152:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1153:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1155:   /* Create the solve analysis information */
1156:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1157:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1158:   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1159:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1160:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1161:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1162:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1163:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1164:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1165:   #endif

1167:   /* perform the solve analysis */
1168:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1169:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1170:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1171:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1172:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1173:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1174:                           #endif
1175:                           );CHKERRCUSPARSE(stat);
1176:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1177:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1179:   /* assign the pointer */
1180:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1181:   return(0);
1182: }
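
/* A minimal stand-alone sketch of the two-phase (buffer-size query, then convert)
   csr2csc pattern the routine above relies on, for a double-precision matrix under
   CUDA >= 11. The function name and arguments are hypothetical and error checking
   is compressed; it is an illustration, not part of the PETSc API. */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
static cusparseStatus_t SketchCsrToCsc(cusparseHandle_t handle,int m,int n,int nnz,
                                       const double *csrVal,const int *csrRowPtr,const int *csrColInd,
                                       double *cscVal,int *cscColPtr,int *cscRowInd)
{
  size_t           bufferSize = 0;
  void             *buffer = NULL;
  cusparseStatus_t stat;

  /* phase 1: ask cuSPARSE how much scratch space the conversion needs */
  stat = cusparseCsr2cscEx2_bufferSize(handle,m,n,nnz,csrVal,csrRowPtr,csrColInd,
                                       cscVal,cscColPtr,cscRowInd,CUDA_R_64F,
                                       CUSPARSE_ACTION_NUMERIC,CUSPARSE_INDEX_BASE_ZERO,
                                       CUSPARSE_CSR2CSC_ALG1,&bufferSize);
  if (stat != CUSPARSE_STATUS_SUCCESS) return stat;
  if (cudaMalloc(&buffer,bufferSize) != cudaSuccess) return CUSPARSE_STATUS_ALLOC_FAILED;
  /* phase 2: run the conversion; the CSC arrays of A double as the CSR arrays of A^T */
  stat = cusparseCsr2cscEx2(handle,m,n,nnz,csrVal,csrRowPtr,csrColInd,
                            cscVal,cscColPtr,cscRowInd,CUDA_R_64F,
                            CUSPARSE_ACTION_NUMERIC,CUSPARSE_INDEX_BASE_ZERO,
                            CUSPARSE_CSR2CSC_ALG1,buffer);
  (void)cudaFree(buffer);
  return stat;
}
#endif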

1184: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1185: {
1186:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1187:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1188:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1189:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1190:   cusparseStatus_t             stat;
1191:   cusparseIndexBase_t          indexBase;
1192:   cudaError_t                  err;
1193:   PetscErrorCode               ierr;

1196:   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
1197:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1198:   PetscLogGpuTimeBegin();
1199:   /* create cusparse matrix */
1200:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1201:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1202:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1203:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1204:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1206:   /* set alpha and beta */
1207:   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1208:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1209:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1210:   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1211:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1212:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1213:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1215:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1216:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1217:     CsrMatrix *matrixT= new CsrMatrix;
1218:     matrixT->num_rows = A->cmap->n;
1219:     matrixT->num_cols = A->rmap->n;
1220:     matrixT->num_entries = a->nz;
1221:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1222:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1223:     matrixT->values = new THRUSTARRAY(a->nz);

1225:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1226:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1228:     /* compute the transpose, i.e. the CSC */
1229:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1230:     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1231:                                   A->cmap->n, matrix->num_entries,
1232:                                   matrix->values->data().get(),
1233:                                   cusparsestruct->rowoffsets_gpu->data().get(),
1234:                                   matrix->column_indices->data().get(),
1235:                                   matrixT->values->data().get(),
1236:                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1237:                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1238:                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1239:     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1240:    #endif

1242:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1243:                             A->cmap->n, matrix->num_entries,
1244:                             matrix->values->data().get(),
1245:                             cusparsestruct->rowoffsets_gpu->data().get(),
1246:                             matrix->column_indices->data().get(),
1247:                             matrixT->values->data().get(),
1248:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1249:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1250:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1251:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1252:                           #else
1253:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1254:                             CUSPARSE_ACTION_NUMERIC, indexBase
1255:                           #endif
1256:                            );CHKERRCUSPARSE(stat);
1257:     matstructT->mat = matrixT;

1259:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1260:     stat = cusparseCreateCsr(&matstructT->matDescr,
1261:                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262:                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263:                              matrixT->values->data().get(),
1264:                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1265:                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266:    #endif
1267:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1268:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1269:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1270:    #else
1271:     CsrMatrix *temp  = new CsrMatrix;
1272:     CsrMatrix *tempT = new CsrMatrix;
1273:     /* First convert HYB to CSR */
1274:     temp->num_rows = A->rmap->n;
1275:     temp->num_cols = A->cmap->n;
1276:     temp->num_entries = a->nz;
1277:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1278:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1279:     temp->values = new THRUSTARRAY(a->nz);

1281:     stat = cusparse_hyb2csr(cusparsestruct->handle,
1282:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1283:                             temp->values->data().get(),
1284:                             temp->row_offsets->data().get(),
1285:                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1287:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1288:     tempT->num_rows = A->cmap->n;  /* tempT holds A^T, so A's dimensions are swapped */
1289:     tempT->num_cols = A->rmap->n;
1290:     tempT->num_entries = a->nz;
1291:     tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
1292:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1293:     tempT->values = new THRUSTARRAY(a->nz);

1295:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1296:                             temp->num_cols, temp->num_entries,
1297:                             temp->values->data().get(),
1298:                             temp->row_offsets->data().get(),
1299:                             temp->column_indices->data().get(),
1300:                             tempT->values->data().get(),
1301:                             tempT->column_indices->data().get(),
1302:                             tempT->row_offsets->data().get(),
1303:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1305:     /* Last, convert CSC to HYB */
1306:     cusparseHybMat_t hybMat;
1307:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1308:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
1311:                             matstructT->descr, tempT->values->data().get(),
1312:                             tempT->row_offsets->data().get(),
1313:                             tempT->column_indices->data().get(),
1314:                             hybMat, 0, partition);CHKERRCUSPARSE(stat);

1316:     /* assign the pointer */
1317:     matstructT->mat = hybMat;
1318:     /* delete temporaries */
1319:     if (tempT) {
1320:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323:       delete (CsrMatrix*) tempT;
1324:     }
1325:     if (temp) {
1326:       if (temp->values) delete (THRUSTARRAY*) temp->values;
1327:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329:       delete (CsrMatrix*) temp;
1330:     }
1331:    #endif
1332:   }
1333:   err  = WaitForCUDA();CHKERRCUDA(err);
1334:   PetscLogGpuTimeEnd();
1335:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1336:   /* the compressed row indices are not used for matTranspose */
1337:   matstructT->cprowIndices = NULL;
1338:   /* assign the pointer */
1339:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1340:   return(0);
1341: }
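
/* A small hand-worked example of the identity the code above relies on, namely that
   the CSC arrays of A are exactly the CSR arrays of A^T (0-based indexing):

       A   = [ 1 0 2 ]     CSR of A:                row_offsets    = {0,2,3}
             [ 0 3 0 ]                              column_indices = {0,2,1}
                                                    values         = {1,2,3}

       A^T = [ 1 0 ]       CSR of A^T (= CSC of A): row_offsets    = {0,1,2,3}
             [ 0 3 ]                                column_indices = {0,1,0}
             [ 2 0 ]                                values         = {1,3,2}

   This is what cusparse_csr2csc computes: matrixT stores the CSC of A and is then
   used as an ordinary CSR matrix whenever the transpose is needed. */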

1343: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1344: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1345: {
1346:   PetscInt                              n = xx->map->n;
1347:   const PetscScalar                     *barray;
1348:   PetscScalar                           *xarray;
1349:   thrust::device_ptr<const PetscScalar> bGPU;
1350:   thrust::device_ptr<PetscScalar>       xGPU;
1351:   cusparseStatus_t                      stat;
1352:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1353:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1354:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1355:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1356:   PetscErrorCode                        ierr;
1357:   cudaError_t                           cerr;

1360:   /* Analyze the matrix and create the transpose ... on the fly */
1361:   if (!loTriFactorT && !upTriFactorT) {
1362:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1363:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1364:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1365:   }

1367:   /* Get the GPU pointers */
1368:   VecCUDAGetArrayWrite(xx,&xarray);
1369:   VecCUDAGetArrayRead(bb,&barray);
1370:   xGPU = thrust::device_pointer_cast(xarray);
1371:   bGPU = thrust::device_pointer_cast(barray);

1373:   PetscLogGpuTimeBegin();
1374:   /* First, reorder with the row permutation */
1375:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1376:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1377:                xGPU);

1379:   /* Next, solve U */
1380:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1381:                         upTriFactorT->csrMat->num_rows,
1382:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1383:                         upTriFactorT->csrMat->num_entries,
1384:                       #endif
1385:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1386:                         upTriFactorT->csrMat->values->data().get(),
1387:                         upTriFactorT->csrMat->row_offsets->data().get(),
1388:                         upTriFactorT->csrMat->column_indices->data().get(),
1389:                         upTriFactorT->solveInfo,
1390:                         xarray, tempGPU->data().get()
1391:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1392:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1393:                       #endif
1394:                         );CHKERRCUSPARSE(stat);

1396:   /* Then, solve L */
1397:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1398:                         loTriFactorT->csrMat->num_rows,
1399:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1400:                         loTriFactorT->csrMat->num_entries,
1401:                       #endif
1402:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1403:                         loTriFactorT->csrMat->values->data().get(),
1404:                         loTriFactorT->csrMat->row_offsets->data().get(),
1405:                         loTriFactorT->csrMat->column_indices->data().get(),
1406:                         loTriFactorT->solveInfo,
1407:                         tempGPU->data().get(), xarray
1408:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1409:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1410:                       #endif
1411:                         );CHKERRCUSPARSE(stat);

1413:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1414:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1415:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1416:                tempGPU->begin());

1418:   /* Copy the temporary to the full solution. */
1419:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1421:   /* restore */
1422:   VecCUDARestoreArrayRead(bb,&barray);
1423:   VecCUDARestoreArrayWrite(xx,&xarray);
1424:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1425:   PetscLogGpuTimeEnd();
1426:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1427:   return(0);
1428: }
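
/* The row-permutation step above is a GPU gather: x[i] = b[perm[i]]. Expressed with
   thrust::permutation_iterator it becomes a plain thrust::copy, as in this sketch
   (THRUSTARRAY/THRUSTINTARRAY are this file's device-vector typedefs; the function
   name is hypothetical): */
static void SketchPermutedGather(const THRUSTARRAY &b,const THRUSTINTARRAY &perm,THRUSTARRAY &x)
{
  /* dereferencing the permutation iterator at position i yields b[perm[i]] */
  thrust::copy(thrust::make_permutation_iterator(b.begin(),perm.begin()),
               thrust::make_permutation_iterator(b.begin(),perm.end()),
               x.begin());
}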

1430: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1431: {
1432:   const PetscScalar                 *barray;
1433:   PetscScalar                       *xarray;
1434:   cusparseStatus_t                  stat;
1435:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1436:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1437:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1438:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1439:   PetscErrorCode                    ierr;
1440:   cudaError_t                       cerr;

1443:   /* Analyze the matrix and create the transpose ... on the fly */
1444:   if (!loTriFactorT && !upTriFactorT) {
1445:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1446:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1447:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1448:   }

1450:   /* Get the GPU pointers */
1451:   VecCUDAGetArrayWrite(xx,&xarray);
1452:   VecCUDAGetArrayRead(bb,&barray);

1454:   PetscLogGpuTimeBegin();
1455:   /* First, solve U */
1456:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1457:                         upTriFactorT->csrMat->num_rows,
1458:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1459:                         upTriFactorT->csrMat->num_entries,
1460:                       #endif
1461:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1462:                         upTriFactorT->csrMat->values->data().get(),
1463:                         upTriFactorT->csrMat->row_offsets->data().get(),
1464:                         upTriFactorT->csrMat->column_indices->data().get(),
1465:                         upTriFactorT->solveInfo,
1466:                         barray, tempGPU->data().get()
1467:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1468:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1469:                       #endif
1470:                         );CHKERRCUSPARSE(stat);

1472:   /* Then, solve L */
1473:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1474:                         loTriFactorT->csrMat->num_rows,
1475:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1476:                         loTriFactorT->csrMat->num_entries,
1477:                       #endif
1478:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1479:                         loTriFactorT->csrMat->values->data().get(),
1480:                         loTriFactorT->csrMat->row_offsets->data().get(),
1481:                         loTriFactorT->csrMat->column_indices->data().get(),
1482:                         loTriFactorT->solveInfo,
1483:                         tempGPU->data().get(), xarray
1484:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1485:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1486:                       #endif
1487:                         );CHKERRCUSPARSE(stat);

1489:   /* restore */
1490:   VecCUDARestoreArrayRead(bb,&barray);
1491:   VecCUDARestoreArrayWrite(xx,&xarray);
1492:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1493:   PetscLogGpuTimeEnd();
1494:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1495:   return(0);
1496: }

1498: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1499: {
1500:   const PetscScalar                     *barray;
1501:   PetscScalar                           *xarray;
1502:   thrust::device_ptr<const PetscScalar> bGPU;
1503:   thrust::device_ptr<PetscScalar>       xGPU;
1504:   cusparseStatus_t                      stat;
1505:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1506:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1507:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1508:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1509:   PetscErrorCode                        ierr;
1510:   cudaError_t                           cerr;


1514:   /* Get the GPU pointers */
1515:   VecCUDAGetArrayWrite(xx,&xarray);
1516:   VecCUDAGetArrayRead(bb,&barray);
1517:   xGPU = thrust::device_pointer_cast(xarray);
1518:   bGPU = thrust::device_pointer_cast(barray);

1520:   PetscLogGpuTimeBegin();
1521:   /* First, reorder with the row permutation */
1522:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1523:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1524:                tempGPU->begin());

1526:   /* Next, solve L */
1527:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1528:                         loTriFactor->csrMat->num_rows,
1529:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1530:                         loTriFactor->csrMat->num_entries,
1531:                       #endif
1532:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1533:                         loTriFactor->csrMat->values->data().get(),
1534:                         loTriFactor->csrMat->row_offsets->data().get(),
1535:                         loTriFactor->csrMat->column_indices->data().get(),
1536:                         loTriFactor->solveInfo,
1537:                         tempGPU->data().get(), xarray
1538:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1539:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1540:                       #endif
1541:                         );CHKERRCUSPARSE(stat);

1543:   /* Then, solve U */
1544:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1545:                         upTriFactor->csrMat->num_rows,
1546:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1547:                         upTriFactor->csrMat->num_entries,
1548:                       #endif
1549:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1550:                         upTriFactor->csrMat->values->data().get(),
1551:                         upTriFactor->csrMat->row_offsets->data().get(),
1552:                         upTriFactor->csrMat->column_indices->data().get(),
1553:                         upTriFactor->solveInfo,
1554:                         xarray, tempGPU->data().get()
1555:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1556:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1557:                       #endif
1558:                         );CHKERRCUSPARSE(stat);

1560:   /* Last, reorder with the column permutation */
1561:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1562:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1563:                xGPU);

1565:   VecCUDARestoreArrayRead(bb,&barray);
1566:   VecCUDARestoreArrayWrite(xx,&xarray);
1567:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1568:   PetscLogGpuTimeEnd();
1569:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1570:   return(0);
1571: }
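
/* The cusparse_solve/cusparse_analysis macros used above wrap the csrsv2 family on
   CUDA >= 9. A condensed sketch of the three phases (size the buffer, analyze the
   pattern once, then solve repeatedly) for a lower-triangular L x = b in double
   precision; names are hypothetical, error checks and cleanup are compressed, and
   the handle is assumed to be in the default host pointer mode: */
static cusparseStatus_t SketchLowerTriSolve(cusparseHandle_t handle,int m,int nnz,
                                            cusparseMatDescr_t descrL,double *val,int *rowPtr,int *colInd,
                                            const double *b,double *x)
{
  csrsv2Info_t info;
  int          bufferSize = 0;
  void         *buffer = NULL;
  const double one = 1.0;

  cusparseCreateCsrsv2Info(&info);
  /* phase 1: query scratch space for this pattern */
  cusparseDcsrsv2_bufferSize(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,m,nnz,descrL,
                             val,rowPtr,colInd,info,&bufferSize);
  if (cudaMalloc(&buffer,bufferSize) != cudaSuccess) return CUSPARSE_STATUS_ALLOC_FAILED;
  /* phase 2: level analysis, done once and reused by every subsequent solve */
  cusparseDcsrsv2_analysis(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,m,nnz,descrL,
                           val,rowPtr,colInd,info,CUSPARSE_SOLVE_POLICY_USE_LEVEL,buffer);
  /* phase 3: the actual triangular solve */
  return cusparseDcsrsv2_solve(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,m,nnz,&one,descrL,
                               val,rowPtr,colInd,info,b,x,CUSPARSE_SOLVE_POLICY_USE_LEVEL,buffer);
}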

1573: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1574: {
1575:   const PetscScalar                 *barray;
1576:   PetscScalar                       *xarray;
1577:   cusparseStatus_t                  stat;
1578:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1579:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1580:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1581:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1582:   PetscErrorCode                    ierr;
1583:   cudaError_t                       cerr;

1586:   /* Get the GPU pointers */
1587:   VecCUDAGetArrayWrite(xx,&xarray);
1588:   VecCUDAGetArrayRead(bb,&barray);

1590:   PetscLogGpuTimeBegin();
1591:   /* First, solve L */
1592:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1593:                         loTriFactor->csrMat->num_rows,
1594:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1595:                         loTriFactor->csrMat->num_entries,
1596:                       #endif
1597:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1598:                         loTriFactor->csrMat->values->data().get(),
1599:                         loTriFactor->csrMat->row_offsets->data().get(),
1600:                         loTriFactor->csrMat->column_indices->data().get(),
1601:                         loTriFactor->solveInfo,
1602:                         barray, tempGPU->data().get()
1603:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1604:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1605:                       #endif
1606:                         );CHKERRCUSPARSE(stat);

1608:   /* Next, solve U */
1609:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1610:                         upTriFactor->csrMat->num_rows,
1611:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1612:                         upTriFactor->csrMat->num_entries,
1613:                       #endif
1614:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1615:                         upTriFactor->csrMat->values->data().get(),
1616:                         upTriFactor->csrMat->row_offsets->data().get(),
1617:                         upTriFactor->csrMat->column_indices->data().get(),
1618:                         upTriFactor->solveInfo,
1619:                         tempGPU->data().get(), xarray
1620:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1621:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1622:                       #endif
1623:                         );CHKERRCUSPARSE(stat);

1625:   VecCUDARestoreArrayRead(bb,&barray);
1626:   VecCUDARestoreArrayWrite(xx,&xarray);
1627:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1628:   PetscLogGpuTimeEnd();
1629:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1630:   return(0);
1631: }
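
/* The PetscLogGpuFlops(2.0*nnz - n) calls above follow the usual substitution
   tally: each of the nnz stored factor entries contributes a multiply plus an add
   (2*nnz flops), except the n diagonal entries, which cost a single division
   apiece instead of two flops, hence 2*nnz - n in total. */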

1633: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1634: {
1635:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1636:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1637:   cudaError_t        cerr;
1638:   PetscErrorCode     ierr;

1641:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1642:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1644:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1645:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1646:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1647:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1648:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1649:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1650:   }
1651:   return(0);
1652: }

1654: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1655: {
1656:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1660:   MatSeqAIJCUSPARSECopyFromGPU(A);
1661:   *array = a->a;
1662:   A->offloadmask = PETSC_OFFLOAD_CPU; /* the caller may modify the returned host values, invalidating the GPU copy */
1663:   return(0);
1664: }

1666: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1667: {
1668:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1669:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1670:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1671:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1672:   PetscErrorCode               ierr;
1673:   cusparseStatus_t             stat;
1674:   cudaError_t                  err;

1677:   if (A->boundtocpu) return(0);
1678:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1679:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1680:       /* Copy values only */
1681:       CsrMatrix *matrix,*matrixT;
1682:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1684:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1685:       matrix->values->assign(a->a, a->a+a->nz);
1686:       err  = WaitForCUDA();CHKERRCUDA(err);
1687:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1688:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);

1690:       /* Update matT when it was built before */
1691:       if (cusparsestruct->matTranspose) {
1692:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1693:         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1694:         PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1695:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1696:                             A->cmap->n, matrix->num_entries,
1697:                             matrix->values->data().get(),
1698:                             cusparsestruct->rowoffsets_gpu->data().get(),
1699:                             matrix->column_indices->data().get(),
1700:                             matrixT->values->data().get(),
1701:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1702:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1703:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1704:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1705:                           #else
1706:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1707:                             CUSPARSE_ACTION_NUMERIC, indexBase
1708:                           #endif
1709:                            );CHKERRCUSPARSE(stat);
1710:         err  = WaitForCUDA();CHKERRCUDA(err);
1711:         PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1712:       }
1713:     } else {
1714:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1715:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1716:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1717:       delete cusparsestruct->workVector;
1718:       delete cusparsestruct->rowoffsets_gpu;
1719:       try {
1720:         if (a->compressedrow.use) {
1721:           m    = a->compressedrow.nrows;
1722:           ii   = a->compressedrow.i;
1723:           ridx = a->compressedrow.rindex;
1724:         } else {
1725:           m    = A->rmap->n;
1726:           ii   = a->i;
1727:           ridx = NULL;
1728:         }
1729:         cusparsestruct->nrows = m;

1731:         /* create cusparse matrix */
1732:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1733:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1734:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1735:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1737:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1738:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1739:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1740:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1741:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1742:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1743:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1745:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1746:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1747:           /* set the matrix */
1748:           CsrMatrix *mat= new CsrMatrix;
1749:           mat->num_rows = m;
1750:           mat->num_cols = A->cmap->n;
1751:           mat->num_entries = a->nz;
1752:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1753:           mat->row_offsets->assign(ii, ii + m+1);

1755:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1756:           mat->column_indices->assign(a->j, a->j+a->nz);

1758:           mat->values = new THRUSTARRAY(a->nz);
1759:           mat->values->assign(a->a, a->a+a->nz);

1761:           /* assign the pointer */
1762:           matstruct->mat = mat;
1763:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1764:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1765:             stat = cusparseCreateCsr(&matstruct->matDescr,
1766:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1767:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1768:                                     mat->values->data().get(),
1769:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1770:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1771:           }
1772:          #endif
1773:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1774:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1775:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1776:          #else
1777:           CsrMatrix *mat= new CsrMatrix;
1778:           mat->num_rows = m;
1779:           mat->num_cols = A->cmap->n;
1780:           mat->num_entries = a->nz;
1781:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1782:           mat->row_offsets->assign(ii, ii + m+1);

1784:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1785:           mat->column_indices->assign(a->j, a->j+a->nz);

1787:           mat->values = new THRUSTARRAY(a->nz);
1788:           mat->values->assign(a->a, a->a+a->nz);

1790:           cusparseHybMat_t hybMat;
1791:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1792:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1793:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1794:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1795:               matstruct->descr, mat->values->data().get(),
1796:               mat->row_offsets->data().get(),
1797:               mat->column_indices->data().get(),
1798:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1799:           /* assign the pointer */
1800:           matstruct->mat = hybMat;

1802:           if (mat) {
1803:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1804:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1805:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1806:             delete (CsrMatrix*)mat;
1807:           }
1808:          #endif
1809:         }

1811:         /* assign the compressed row indices */
1812:         if (a->compressedrow.use) {
1813:           cusparsestruct->workVector = new THRUSTARRAY(m);
1814:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1815:           matstruct->cprowIndices->assign(ridx,ridx+m);
1816:           tmp = m;
1817:         } else {
1818:           cusparsestruct->workVector = NULL;
1819:           matstruct->cprowIndices    = NULL;
1820:           tmp = 0;
1821:         }
1822:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1824:         /* assign the pointer */
1825:         cusparsestruct->mat = matstruct;
1826:       } catch(char *ex) {
1827:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1828:       }
1829:       err  = WaitForCUDA();CHKERRCUDA(err);
1830:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1831:       cusparsestruct->nonzerostate = A->nonzerostate;
1832:     }
1833:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1834:   }
1835:   return(0);
1836: }
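
/* The CSR upload above pairs thrust device vectors (which own the storage) with a
   cusparseSpMatDescr_t (which only references it for the CUDA-11 generic API).
   A condensed sketch of that pairing; the function name is hypothetical: */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
static cusparseStatus_t SketchWrapCsr(int m,int n,int nnz,THRUSTINTARRAY32 &rowOffsets,
                                      THRUSTINTARRAY32 &colIndices,THRUSTARRAY &values,
                                      cusparseSpMatDescr_t *descr)
{
  /* the descriptor does not copy the buffers, so the vectors must outlive it */
  return cusparseCreateCsr(descr,m,n,nnz,
                           rowOffsets.data().get(),colIndices.data().get(),values.data().get(),
                           CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);
}
#endif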

1838: struct VecCUDAPlusEquals
1839: {
1840:   template <typename Tuple>
1841:   __host__ __device__
1842:   void operator()(Tuple t)
1843:   {
1844:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1845:   }
1846: };

1848: struct VecCUDAEquals
1849: {
1850:   template <typename Tuple>
1851:   __host__ __device__
1852:   void operator()(Tuple t)
1853:   {
1854:     thrust::get<1>(t) = thrust::get<0>(t);
1855:   }
1856: };

1858: struct VecCUDAEqualsReverse
1859: {
1860:   template <typename Tuple>
1861:   __host__ __device__
1862:   void operator()(Tuple t)
1863:   {
1864:     thrust::get<0>(t) = thrust::get<1>(t);
1865:   }
1866: };
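
/* These functors are driven by thrust::for_each over a zip_iterator, which walks
   two vectors in lockstep and hands each pair to operator() as a tuple. A minimal
   sketch computing y += x element-wise with VecCUDAPlusEquals (the helper name is
   hypothetical): */
static void SketchPlusEquals(const THRUSTARRAY &x,THRUSTARRAY &y)
{
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(),y.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(x.end(),y.end())),
                   VecCUDAPlusEquals()); /* get<1>(t) += get<0>(t) for every pair */
}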

1868: struct MatMatCusparse {
1869:   PetscBool            cisdense;
1870:   PetscScalar          *Bt;
1871:   Mat                  X;
1872: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1873:   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1874:   cusparseDnMatDescr_t matBDescr;
1875:   cusparseDnMatDescr_t matCDescr;
1876:   size_t               spmmBufferSize;
1877:   void                 *spmmBuffer;
1878:   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1879: #endif
1880: };

1882: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1883: {
1885:   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1886:   cudaError_t    cerr;

1889:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1890:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1891:   cusparseStatus_t stat;
1892:   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1893:   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1894:   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1895:  #endif
1896:   MatDestroy(&mmdata->X);
1897:   PetscFree(data);
1898:   return(0);
1899: }

1901: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1903: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1904: {
1905:   Mat_Product                  *product = C->product;
1906:   Mat                          A,B;
1907:   PetscInt                     m,n,blda,clda;
1908:   PetscBool                    flg,biscuda;
1909:   Mat_SeqAIJCUSPARSE           *cusp;
1910:   cusparseStatus_t             stat;
1911:   cusparseOperation_t          opA;
1912:   const PetscScalar            *barray;
1913:   PetscScalar                  *carray;
1914:   PetscErrorCode               ierr;
1915:   MatMatCusparse               *mmdata;
1916:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1917:   CsrMatrix                    *csrmat;
1918:   cudaError_t                  cerr;

1921:   MatCheckProduct(C,1);
1922:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1923:   mmdata = (MatMatCusparse*)product->data;
1924:   A    = product->A;
1925:   B    = product->B;
1926:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1927:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1928:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1929:      Instead of silently accepting the wrong answer, we prefer to raise an error */
1930:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1931:   MatSeqAIJCUSPARSECopyToGPU(A);
1932:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1933:   switch (product->type) {
1934:   case MATPRODUCT_AB:
1935:   case MATPRODUCT_PtAP:
1936:     mat = cusp->mat;
1937:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1938:     m   = A->rmap->n;
1939:     n   = B->cmap->n;
1940:     break;
1941:   case MATPRODUCT_AtB:
1942:     if (!cusp->transgen) {
1943:       mat = cusp->mat;
1944:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1945:     } else {
1946:       MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1947:       mat  = cusp->matTranspose;
1948:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1949:     }
1950:     m = A->cmap->n;
1951:     n = B->cmap->n;
1952:     break;
1953:   case MATPRODUCT_ABt:
1954:   case MATPRODUCT_RARt:
1955:     mat = cusp->mat;
1956:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1957:     m   = A->rmap->n;
1958:     n   = B->rmap->n;
1959:     break;
1960:   default:
1961:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1962:   }
1963:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1964:   csrmat = (CsrMatrix*)mat->mat;
1965:   /* if the user passed a CPU matrix, copy the data to the GPU */
1966:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1967:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
1968:   MatDenseCUDAGetArrayRead(B,&barray);

1970:   MatDenseGetLDA(B,&blda);
1971:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1972:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1973:     MatDenseGetLDA(mmdata->X,&clda);
1974:   } else {
1975:     MatDenseCUDAGetArrayWrite(C,&carray);
1976:     MatDenseGetLDA(C,&clda);
1977:   }

1979:   PetscLogGpuTimeBegin();
1980:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1981:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1982:   /* (re)allocate spmmBuffer if not initialized or the LDAs have changed */
1983:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1984:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1985:     if (!mmdata->matBDescr) {
1986:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1987:       mmdata->Blda = blda;
1988:     }

1990:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1991:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1992:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1993:       mmdata->Clda = clda;
1994:     }

1996:     if (!mat->matDescr) {
1997:       stat = cusparseCreateCsr(&mat->matDescr,
1998:                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1999:                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2000:                               csrmat->values->data().get(),
2001:                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2002:                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2003:     }
2004:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2005:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2006:                                    mmdata->matCDescr,cusparse_scalartype,
2007:                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
2008:     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
2009:     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
2010:     mmdata->initialized = PETSC_TRUE;
2011:   } else {
2012:     /* to be safe, always update pointers of the mats */
2013:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2014:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2015:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2016:   }

2018:   /* do cusparseSpMM, which supports transpose on B */
2019:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2020:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2021:                       mmdata->matCDescr,cusparse_scalartype,
2022:                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
2023:  #else
2024:   PetscInt k;
2025:   /* cusparseXcsrmm does not support transpose on B */
2026:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2027:     cublasHandle_t cublasv2handle;
2028:     cublasStatus_t cerr;

2030:     PetscCUBLASGetHandle(&cublasv2handle);
2031:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2032:                        B->cmap->n,B->rmap->n,
2033:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2034:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2035:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2036:     blda = B->cmap->n;
2037:     k    = B->cmap->n;
2038:   } else {
2039:     k    = B->rmap->n;
2040:   }

2042:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2043:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2044:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2045:                            csrmat->values->data().get(),
2046:                            csrmat->row_offsets->data().get(),
2047:                            csrmat->column_indices->data().get(),
2048:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2049:                            carray,clda);CHKERRCUSPARSE(stat);
2050:  #endif
2051:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2052:   PetscLogGpuTimeEnd();
2053:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2054:   MatDenseCUDARestoreArrayRead(B,&barray);
2055:   if (product->type == MATPRODUCT_RARt) {
2056:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2057:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2058:   } else if (product->type == MATPRODUCT_PtAP) {
2059:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2060:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2061:   } else {
2062:     MatDenseCUDARestoreArrayWrite(C,&carray);
2063:   }
2064:   if (mmdata->cisdense) {
2065:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2066:   }
2067:   if (!biscuda) {
2068:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2069:   }
2070:   return(0);
2071: }
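
/* The CUDA-11 branch above is an instance of the generic SpMM sequence: wrap the
   dense operands in cusparseDnMatDescr_t, size the scratch buffer once, then call
   cusparseSpMM. A condensed sketch for column-major C = A*B in double precision
   (hypothetical names, default host pointer mode, cleanup elided): */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
static cusparseStatus_t SketchSpMM(cusparseHandle_t handle,cusparseSpMatDescr_t matA,
                                   int m,int k,int n,double *B,double *C)
{
  cusparseDnMatDescr_t matB,matC;
  size_t               bufferSize = 0;
  void                 *buffer = NULL;
  const double         alpha = 1.0,beta = 0.0;
  cusparseStatus_t     stat;

  /* B is k x n with leading dimension k; C is m x n with leading dimension m */
  cusparseCreateDnMat(&matB,k,n,k,B,CUDA_R_64F,CUSPARSE_ORDER_COL);
  cusparseCreateDnMat(&matC,m,n,m,C,CUDA_R_64F,CUSPARSE_ORDER_COL);
  cusparseSpMM_bufferSize(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,
                          &alpha,matA,matB,&beta,matC,CUDA_R_64F,
                          CUSPARSE_SPMM_ALG_DEFAULT,&bufferSize);
  if (cudaMalloc(&buffer,bufferSize) != cudaSuccess) return CUSPARSE_STATUS_ALLOC_FAILED;
  /* C = alpha*A*B + beta*C */
  stat = cusparseSpMM(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,
                      &alpha,matA,matB,&beta,matC,CUDA_R_64F,
                      CUSPARSE_SPMM_ALG_DEFAULT,buffer);
  return stat;
}
#endif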

2073: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2074: {
2075:   Mat_Product        *product = C->product;
2076:   Mat                A,B;
2077:   PetscInt           m,n;
2078:   PetscBool          cisdense,flg;
2079:   PetscErrorCode     ierr;
2080:   MatMatCusparse     *mmdata;
2081:   Mat_SeqAIJCUSPARSE *cusp;

2084:   MatCheckProduct(C,1);
2085:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2086:   A    = product->A;
2087:   B    = product->B;
2088:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2089:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2090:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2091:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2092:   switch (product->type) {
2093:   case MATPRODUCT_AB:
2094:     m = A->rmap->n;
2095:     n = B->cmap->n;
2096:     break;
2097:   case MATPRODUCT_AtB:
2098:     m = A->cmap->n;
2099:     n = B->cmap->n;
2100:     break;
2101:   case MATPRODUCT_ABt:
2102:     m = A->rmap->n;
2103:     n = B->rmap->n;
2104:     break;
2105:   case MATPRODUCT_PtAP:
2106:     m = B->cmap->n;
2107:     n = B->cmap->n;
2108:     break;
2109:   case MATPRODUCT_RARt:
2110:     m = B->rmap->n;
2111:     n = B->rmap->n;
2112:     break;
2113:   default:
2114:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2115:   }
2116:   MatSetSizes(C,m,n,m,n);
2117:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2118:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2119:   MatSetType(C,MATSEQDENSECUDA);

2121:   /* product data */
2122:   PetscNew(&mmdata);
2123:   mmdata->cisdense = cisdense;
2124:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2125:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2126:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2127:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2128:   }
2129:  #endif
2130:   /* for these products we need intermediate storage */
2131:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2132:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2133:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2134:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2135:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2136:     } else {
2137:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2138:     }
2139:   }
2140:   C->product->data    = mmdata;
2141:   C->product->destroy = MatDestroy_MatMatCusparse;

2143:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2144:   return(0);
2145: }

2147: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2149: /* handles dense B */
2150: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2151: {
2152:   Mat_Product    *product = C->product;

2156:   MatCheckProduct(C,1);
2157:   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2158:   if (product->A->boundtocpu) {
2159:     MatProductSetFromOptions_SeqAIJ_SeqDense(C);
2160:     return(0);
2161:   }
2162:   switch (product->type) {
2163:   case MATPRODUCT_AB:
2164:   case MATPRODUCT_AtB:
2165:   case MATPRODUCT_ABt:
2166:   case MATPRODUCT_PtAP:
2167:   case MATPRODUCT_RARt:
2168:     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; break;
2169:   default:
2170:     break;
2171:   }
2172:   return(0);
2173: }
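
/* From the user's side, the symbolic/numeric split above is driven through the
   MatProduct interface. A sketch of the typical calling sequence, with error
   checking elided to match this listing's style (the function name is hypothetical): */
static PetscErrorCode SketchMatProductUsage(Mat A,Mat B,Mat *C)
{
  MatProductCreate(A,B,NULL,C);          /* create the product container C */
  MatProductSetType(*C,MATPRODUCT_AB);   /* request C = A*B */
  MatProductSetFromOptions(*C);          /* dispatches to the routine above */
  MatProductSymbolic(*C);                /* sizes C and allocates product data */
  MatProductNumeric(*C);                 /* runs the cusparseSpMM/csrmm kernels */
  return(0);
}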

2175: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2176: {

2180:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2181:   return(0);
2182: }

2184: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2185: {

2189:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2190:   return(0);
2191: }

2193: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2194: {

2198:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2199:   return(0);
2200: }

2202: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2203: {

2207:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2208:   return(0);
2209: }

2211: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2212: {

2216:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2217:   return(0);
2218: }

2220: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2221: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2222: {
2223:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2224:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2225:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2226:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2227:   PetscErrorCode               ierr;
2228:   cudaError_t                  cerr;
2229:   cusparseStatus_t             stat;
2230:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2231:   PetscBool                    compressed;
2232: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2233:   PetscInt                     nx,ny;
2234: #endif

2237:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2238:   if (!a->nonzerorowcnt) {
2239:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2240:     else {VecCopy_SeqCUDA(yy,zz);}
2241:     return(0);
2242:   }
2243:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2244:   MatSeqAIJCUSPARSECopyToGPU(A);
2245:   if (!trans) {
2246:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2247:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2248:   } else {
2249:     if (herm || !cusparsestruct->transgen) {
2250:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2251:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2252:     } else {
2253:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEGenerateTransposeForMult(A);}
2254:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2255:     }
2256:   }
2257:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2258:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
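  /* When compressed, matstruct->cprowIndices lists the indices of the nonzero rows; the first
     cprowIndices->size() entries of cusparsestruct->workVector hold the compressed-length data. */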

2260:   try {
2261:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2262:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so we need an up-to-date zarray on the GPU */
2263:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

2265:     PetscLogGpuTimeBegin();
2266:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2267:       /* z = A x + beta y.
2268:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2269:          When A is not compressed and z = y, we can set beta=1 to compute y = Ax + y in one call.
2270:       */
2271:       xptr = xarray;
2272:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2273:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2274:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2275:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2276:           allocated to accommodate different uses. So we get the length info directly from mat.
2277:        */
2278:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2279:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2280:         nx = mat->num_cols;
2281:         ny = mat->num_rows;
2282:       }
2283:      #endif
2284:     } else {
2285:       /* z = A^T x + beta y
2286:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2287:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2288:        */
2289:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2290:       dptr = zarray;
2291:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2292:       if (compressed) { /* Scatter x to work vector */
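        /* The zip + permutation iterators below implement a gather: for each nonzero row k,
           workVector[k] = xarray[cprowIndices[k]] (VecCUDAEqualsReverse assigns the second tuple
           element to the first). */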
2293:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2294:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2295:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2296:                          VecCUDAEqualsReverse());
2297:       }
2298:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2299:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2300:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2301:         nx = mat->num_rows;
2302:         ny = mat->num_cols;
2303:       }
2304:      #endif
2305:     }

2307:     /* csr_spmv does y = alpha op(A) x + beta y */
2308:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2309:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2310:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2311:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
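        /* The dense vector descriptors and the SpMV buffer are created once per operation
           (non-transpose / transpose / conjugate-transpose), cached in cuSpMV[opA], and freed in
           MatSeqAIJCUSPARSEMultStruct_Destroy(). */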
2312:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2313:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2314:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2315:                                 matstruct->matDescr,
2316:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2317:                                 matstruct->cuSpMV[opA].vecYDescr,
2318:                                 cusparse_scalartype,
2319:                                 cusparsestruct->spmvAlg,
2320:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2321:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2323:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2324:       } else {
2325:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2326:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2327:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2328:       }

2330:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2331:                                matstruct->alpha_one,
2332:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2333:                                matstruct->cuSpMV[opA].vecXDescr,
2334:                                beta,
2335:                                matstruct->cuSpMV[opA].vecYDescr,
2336:                                cusparse_scalartype,
2337:                                cusparsestruct->spmvAlg,
2338:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2339:      #else
2340:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2341:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2342:                                mat->num_rows, mat->num_cols,
2343:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2344:                                mat->values->data().get(), mat->row_offsets->data().get(),
2345:                                mat->column_indices->data().get(), xptr, beta,
2346:                                dptr);CHKERRCUSPARSE(stat);
2347:      #endif
2348:     } else {
2349:       if (cusparsestruct->nrows) {
2350:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2351:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2352:        #else
2353:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2354:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2355:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2356:                                  xptr, beta,
2357:                                  dptr);CHKERRCUSPARSE(stat);
2358:        #endif
2359:       }
2360:     }
2361:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2362:     PetscLogGpuTimeEnd();

2364:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2365:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2366:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2367:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2368:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2369:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2370:         }
2371:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2372:         VecSet_SeqCUDA(zz,0);
2373:       }

2375:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2376:       if (compressed) {
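        /* For each nonzero row k, zarray[cprowIndices[k]] += workVector[k] (VecCUDAPlusEquals adds
           the first tuple element to the second). */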
2377:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2378:         PetscLogGpuTimeBegin();
2379:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2380:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2381:                          VecCUDAPlusEquals());
2382:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2383:         PetscLogGpuTimeEnd();
2384:       }
2385:     } else {
2386:       if (yy && yy != zz) {
2387:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2388:       }
2389:     }
2390:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2391:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2392:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
2393:   } catch(char *ex) {
2394:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2395:   }
2396:   if (yy) {
2397:     PetscLogGpuFlops(2.0*a->nz);
2398:   } else {
2399:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2400:   }
2401:   return(0);
2402: }

2404: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2405: {

2409:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2410:   return(0);
2411: }

2413: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2414: {
2415:   PetscErrorCode              ierr;
2416:   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
2417:   PetscBool                   is_seq = PETSC_TRUE;
2418:   PetscInt                    nnz_state = A->nonzerostate;
2420:   if (A->factortype == MAT_FACTOR_NONE) {
2421:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2422:   }
2423:   if (d_mat) {
2424:     cudaError_t err;
2425:     PetscInfo(A,"Assemble device matrix\n");
2426:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2427:     nnz_state = h_mat.nonzerostate;
2428:     is_seq = h_mat.seq;
2429:   }
2430:   MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
2431:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2432:   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on the CPU even though equipped for the GPU
2433:     MatSeqAIJCUSPARSECopyToGPU(A);
2434:   } else if (nnz_state > A->nonzerostate) {
2435:     A->offloadmask = PETSC_OFFLOAD_GPU;
2436:   }

2438:   return(0);
2439: }

2441: /* --------------------------------------------------------------------------------*/
2442: /*@
2443:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2444:    (the default sequential PETSc format). This matrix will ultimately be pushed down
2445:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
2446:    assembly performance the user should preallocate the matrix storage by setting
2447:    the parameter nz (or the array nnz).  By setting these parameters accurately,
2448:    performance during matrix assembly can be increased by more than a factor of 50.

2450:    Collective

2452:    Input Parameters:
2453: +  comm - MPI communicator, set to PETSC_COMM_SELF
2454: .  m - number of rows
2455: .  n - number of columns
2456: .  nz - number of nonzeros per row (same for all rows)
2457: -  nnz - array containing the number of nonzeros in the various rows
2458:          (possibly different for each row) or NULL

2460:    Output Parameter:
2461: .  A - the matrix

2463:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2464:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
2465:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

2467:    Notes:
2468:    If nnz is given then nz is ignored

2470:    The AIJ format (also called the Yale sparse matrix format or
2471:    compressed row storage) is fully compatible with standard Fortran 77
2472:    storage.  That is, the stored row and column indices can begin at
2473:    either one (as in Fortran) or zero.  See the users' manual for details.

2475:    Specify the preallocated storage with either nz or nnz (not both).
2476:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2477:    allocation.  For large problems you MUST preallocate memory or you
2478:    will get TERRIBLE performance, see the users' manual chapter on matrices.

2480:    By default, this format uses inodes (identical nodes) when possible, to
2481:    improve numerical efficiency of matrix-vector products and solves. We
2482:    search for consecutive rows with the same nonzero structure, thereby
2483:    reusing matrix information to achieve increased efficiency.

2485:    Level: intermediate

2487: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2488: @*/
2489: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2490: {

2494:   MatCreate(comm,A);
2495:   MatSetSizes(*A,m,n,m,n);
2496:   MatSetType(*A,MATSEQAIJCUSPARSE);
2497:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
2498:   return(0);
2499: }
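/* A minimal usage sketch (a hypothetical 3x3 matrix with per-row nonzero counts, for illustration only):

     Mat      A;
     PetscInt nnz[3] = {3,2,1};
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,3,3,0,nnz,&A);
     ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual; MatMult() then runs on the GPU ...
     MatDestroy(&A);
*/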

2501: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2502: {
2503:   PetscErrorCode              ierr;
2504:   PetscSplitCSRDataStructure  *d_mat = NULL;

2507:   if (A->factortype == MAT_FACTOR_NONE) {
2508:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2509:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2510:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
2511:   } else {
2512:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
2513:   }
2514:   if (d_mat) {
2515:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
2516:     cudaError_t                err;
2517:     PetscSplitCSRDataStructure h_mat;
2518:     PetscInfo(A,"Have device matrix\n");
2519:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2520:     if (h_mat.seq) {
2521:       if (a->compressedrow.use) {
2522:          err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2523:       }
2524:       err = cudaFree(d_mat);CHKERRCUDA(err);
2525:     }
2526:   }
2527:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
2528:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2529:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2530:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
2531:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
2532:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
2533:   MatDestroy_SeqAIJ(A);
2534:   return(0);
2535: }

2537: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2538: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2539: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2540: {

2544:   MatDuplicate_SeqAIJ(A,cpvalues,B);
2545:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
2546:   return(0);
2547: }

2549: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2550: {
2551:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

2555:   if (A->factortype != MAT_FACTOR_NONE) return(0);
2556:   if (flg) {
2557:     MatSeqAIJCUSPARSECopyFromGPU(A);

2559:     A->ops->mult                      = MatMult_SeqAIJ;
2560:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2561:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2562:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2563:     A->ops->multhermitiantranspose    = NULL;
2564:     A->ops->multhermitiantransposeadd = NULL;
2565:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2566:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2567:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
2568:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
2569:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
2570:   } else {
2571:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2572:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2573:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2574:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2575:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2576:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2577:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2578:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2579:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
2580:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
2581:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
2582:   }
2583:   A->boundtocpu = flg;
2584:   a->inode.use = flg;
2585:   return(0);
2586: }

2588: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2589: {
2590:   PetscSplitCSRDataStructure *d_mat = NULL;
2591:   PetscErrorCode             ierr;
2592:   PetscBool                  both = PETSC_FALSE;

2595:   if (A->factortype == MAT_FACTOR_NONE) {
2596:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2597:     if (spptr->mat) {
2598:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
2599:       if (matrix->values) {
2600:         both = PETSC_TRUE;
2601:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
2602:       }
2603:     }
2604:     if (spptr->matTranspose) {
2605:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
2606:       if (matrix->values) {
2607:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
2608:       }
2609:     }
2610:     d_mat = spptr->deviceMat;
2611:   }
2612:   if (d_mat) {
2613:     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
2614:     PetscInt     n = A->rmap->n, nnz = a->i[n];
2615:     cudaError_t  err;
2616:     PetscScalar  *vals;
2617:     PetscInfo(A,"Zero device matrix\n");
2618:     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2619:     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2620:   }
2621:   MatZeroEntries_SeqAIJ(A);
2622:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

2624:   return(0);
2625: }

2627: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2628: {
2629:   PetscErrorCode   ierr;
2630:   cusparseStatus_t stat;
2631:   Mat              B;

2634:   if (reuse == MAT_INITIAL_MATRIX) {
2635:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
2636:   } else if (reuse == MAT_REUSE_MATRIX) {
2637:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2638:   }
2639:   B = *newmat;

2641:   PetscFree(B->defaultvectype);
2642:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

2644:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2645:     if (B->factortype == MAT_FACTOR_NONE) {
2646:       Mat_SeqAIJCUSPARSE *spptr;

2648:       PetscNew(&spptr);
2649:       spptr->format = MAT_CUSPARSE_CSR;
2650:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2651:       B->spptr = spptr;
2652:       spptr->deviceMat = NULL;
2653:     } else {
2654:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2656:       PetscNew(&spptr);
2657:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2658:       B->spptr = spptr;
2659:     }
2660:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2661:   }
2662:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2663:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2664:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2665:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2666:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
2667:   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

2669:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2670:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2671:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2672:   return(0);
2673: }

2675: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2676: {

2680:   PetscCUDAInitializeCheck();
2681:   MatCreate_SeqAIJ(B);
2682:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2683:   PetscObjectOptionsBegin((PetscObject)B);
2684:   MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);
2685:   PetscOptionsEnd();
2686:   return(0);
2687: }

2689: /*MC
2690:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

2692:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in CSR,
2693:    ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later and are
2694:    no longer supported as of CUDA 11.0. All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

2696:    Options Database Keys:
2697: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2698: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2699: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

2701:   Level: beginner

2703: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2704: M*/
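/* For example, an application that calls MatSetFromOptions() could select this type and its storage
   format from the command line (a hypothetical invocation, using the keys listed above):

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
*/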

2706: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


2709: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2710: {

2714:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2715:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2716:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2717:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2718:   return(0);
2719: }
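/* Once registered, these factorizations can be requested from the options database; a hypothetical
   command line for a KSP/PC based application could look like

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
*/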

2721: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2722: {
2723:   PetscErrorCode   ierr;
2724:   cusparseStatus_t stat;

2727:   if (*cusparsestruct) {
2728:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2729:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2730:     delete (*cusparsestruct)->workVector;
2731:     delete (*cusparsestruct)->rowoffsets_gpu;
2732:     delete (*cusparsestruct)->cooPerm;
2733:     delete (*cusparsestruct)->cooPerm_a;
2734:     delete (*cusparsestruct)->cooPerm_v;
2735:     delete (*cusparsestruct)->cooPerm_w;
2736:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
2737:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2738:     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2739:    #endif
2740:     PetscFree(*cusparsestruct);
2741:   }
2742:   return(0);
2743: }

2745: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2746: {
2748:   if (*mat) {
2749:     delete (*mat)->values;
2750:     delete (*mat)->column_indices;
2751:     delete (*mat)->row_offsets;
2752:     delete *mat;
2753:     *mat = 0;
2754:   }
2755:   return(0);
2756: }

2758: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2759: {
2760:   cusparseStatus_t stat;
2761:   PetscErrorCode   ierr;

2764:   if (*trifactor) {
2765:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2766:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2767:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
2768:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2769:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2770:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2771:    #endif
2772:     PetscFree(*trifactor);
2773:   }
2774:   return(0);
2775: }

2777: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2778: {
2779:   CsrMatrix        *mat;
2780:   cusparseStatus_t stat;
2781:   cudaError_t      err;

2784:   if (*matstruct) {
2785:     if ((*matstruct)->mat) {
2786:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2787:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2788:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2789:        #else
2790:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2791:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2792:        #endif
2793:       } else {
2794:         mat = (CsrMatrix*)(*matstruct)->mat;
2795:         CsrMatrix_Destroy(&mat);
2796:       }
2797:     }
2798:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2799:     delete (*matstruct)->cprowIndices;
2800:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2801:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2802:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

2804:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2805:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2806:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2807:     for (int i=0; i<3; i++) {
2808:       if (mdata->cuSpMV[i].initialized) {
2809:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2810:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2811:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2812:       }
2813:     }
2814:    #endif
2815:     delete *matstruct;
2816:     *matstruct = NULL;
2817:   }
2818:   return(0);
2819: }

2821: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2822: {

2826:   if (*trifactors) {
2827:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2828:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2829:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2830:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2831:     delete (*trifactors)->rpermIndices;
2832:     delete (*trifactors)->cpermIndices;
2833:     delete (*trifactors)->workVector;
2834:     (*trifactors)->rpermIndices = NULL;
2835:     (*trifactors)->cpermIndices = NULL;
2836:     (*trifactors)->workVector = NULL;
2837:   }
2838:   return(0);
2839: }

2841: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2842: {
2843:   PetscErrorCode   ierr;
2844:   cusparseHandle_t handle;
2845:   cusparseStatus_t stat;

2848:   if (*trifactors) {
2849:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2850:     if ((handle = (*trifactors)->handle)) {
2851:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2852:     }
2853:     PetscFree(*trifactors);
2854:   }
2855:   return(0);
2856: }

2858: struct IJCompare
2859: {
2860:   __host__ __device__
2861:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
2862:   {
2863:     if (t1.get<0>() < t2.get<0>()) return true;
2864:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
2865:     return false;
2866:   }
2867: };

2869: struct IJEqual
2870: {
2871:   __host__ __device__
2872:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
2873:   {
2874:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
2875:     return true;
2876:   }
2877: };

2879: struct IJDiff
2880: {
2881:   __host__ __device__
2882:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
2883:   {
2884:     return t1 == t2 ? 0 : 1;
2885:   }
2886: };

2888: struct IJSum
2889: {
2890:   __host__ __device__
2891:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
2892:   {
2893:     return t1||t2;
2894:   }
2895: };
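/* The functors above drive the COO assembly below: IJCompare sorts (i,j) pairs lexicographically,
   IJEqual lets thrust::unique detect duplicate (i,j) pairs, and IJDiff/IJSum are combined with
   adjacent_difference and an inclusive scan to map every sorted input entry to the index of its
   unique (i,j) location in the CSR values array. */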

2897: #include <thrust/iterator/discard_iterator.h>
2898: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
2899: {
2900:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2901:   CsrMatrix          *matrix;
2902:   PetscErrorCode     ierr;
2903:   cudaError_t        cerr;
2904:   PetscInt           n;

2907:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
2908:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
2909:   if (!cusp->cooPerm) {
2910:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
2911:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
2912:     return(0);
2913:   }
2914:   n = cusp->cooPerm->size();
2915:   matrix = (CsrMatrix*)cusp->mat->mat;
2916:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
2917:   if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
2918:   PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);
2919:   if (v) {
2920:     cusp->cooPerm_v->assign(v,v+n);
2921:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
2922:   }
2923:   else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
2924:   if (imode == ADD_VALUES) {
2925:     if (cusp->cooPerm_a) {
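      /* Duplicate (i,j) entries share a key in cooPerm_a, so reduce_by_key sums the (permuted) new
         values per unique location into cooPerm_w, which is then added to the matrix values. */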
2926:       if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
2927:       auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
2928:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
2929:       thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
2930:     } else {
2931:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2932:                                                                 matrix->values->begin()));
2933:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2934:                                                                 matrix->values->end()));
2935:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
2936:     }
2937:   } else {
2938:     if (cusp->cooPerm_a) { /* non-unique insertion: the result is undefined if two different values are
2939:                               inserted into the same location (we cannot guarantee that the last one takes precedence) */
2940:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2941:                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
2942:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2943:                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
2944:       thrust::for_each(zibit,zieit,VecCUDAEquals());
2945:     } else {
2946:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2947:                                                                 matrix->values->begin()));
2948:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2949:                                                                 matrix->values->end()));
2950:       thrust::for_each(zibit,zieit,VecCUDAEquals());
2951:     }
2952:   }
2953:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2954:   PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);
2955:   A->offloadmask = PETSC_OFFLOAD_GPU;
2956:   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
2957:   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
2958:   /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
2959:   MatSeqAIJCUSPARSECopyFromGPU(A);
2960:   return(0);
2961: }

2963: #include <thrust/binary_search.h>
2964: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
2965: {
2966:   PetscErrorCode     ierr;
2967:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2968:   CsrMatrix          *matrix;
2969:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2970:   PetscInt           cooPerm_n, nzr = 0;
2971:   cudaError_t        cerr;

2974:   PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);
2975:   PetscLayoutSetUp(A->rmap);
2976:   PetscLayoutSetUp(A->cmap);
2977:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
2978:   if (n != cooPerm_n) {
2979:     delete cusp->cooPerm;
2980:     delete cusp->cooPerm_v;
2981:     delete cusp->cooPerm_w;
2982:     delete cusp->cooPerm_a;
2983:     cusp->cooPerm = NULL;
2984:     cusp->cooPerm_v = NULL;
2985:     cusp->cooPerm_w = NULL;
2986:     cusp->cooPerm_a = NULL;
2987:   }
2988:   if (n) {
2989:     THRUSTINTARRAY d_i(n);
2990:     THRUSTINTARRAY d_j(n);
2991:     THRUSTINTARRAY ii(A->rmap->n);

2993:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
2994:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

2996:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
2997:     d_i.assign(coo_i,coo_i+n);
2998:     d_j.assign(coo_j,coo_j+n);
2999:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3000:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3002:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3003:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3004:     *cusp->cooPerm_a = d_i;
3005:     THRUSTINTARRAY w = d_j;

3007:     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3008:     if (nekey == ekey) { /* all entries are unique */
3009:       delete cusp->cooPerm_a;
3010:       cusp->cooPerm_a = NULL;
3011:     } else { /* I couldn't come up with a more elegant algorithm */
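      /* After the sort, cooPerm_a holds the sorted row indices and w the sorted column indices.
         adjacent_difference + IJDiff turn each into 0/1 flags marking where the value changes, IJSum
         ORs the two flags (1 = a new unique (i,j) pair starts here), and the inclusive scan converts
         the flags into, for each sorted input entry, the index of its unique location; these indices
         are the keys used by reduce_by_key in MatSetValuesCOO_SeqAIJCUSPARSE(). */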
3012:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3013:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3014:       (*cusp->cooPerm_a)[0] = 0;
3015:       w[0] = 0;
3016:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3017:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3018:     }
3019:     thrust::counting_iterator<PetscInt> search_begin(0);
3020:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3021:                         search_begin, search_begin + A->rmap->n,
3022:                         ii.begin());

3024:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3025:     a->singlemalloc = PETSC_FALSE;
3026:     a->free_a       = PETSC_TRUE;
3027:     a->free_ij      = PETSC_TRUE;
3028:     PetscMalloc1(A->rmap->n+1,&a->i);
3029:     a->i[0] = 0;
3030:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3031:     a->nz = a->maxnz = a->i[A->rmap->n];
3032:     PetscMalloc1(a->nz,&a->a);
3033:     PetscMalloc1(a->nz,&a->j);
3034:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3035:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3036:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3037:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3038:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3039:       nzr += (PetscInt)!!(nnzr);
3040:       a->ilen[i] = a->imax[i] = nnzr;
3041:     }
3042:     A->preallocated = PETSC_TRUE;
3043:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3044:   } else {
3045:     MatSeqAIJSetPreallocation(A,0,NULL);
3046:   }
3047:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3048:   PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);

3050:   /* We want to allocate the CUSPARSE struct for matvec now.
3051:      The code is so convoluted now that I prefer to copy garbage to the GPU */
3052:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3053:   A->offloadmask = PETSC_OFFLOAD_CPU;
3054:   A->nonzerostate++;
3055:   MatSeqAIJCUSPARSECopyToGPU(A);
3056:   {
3057:     matrix = (CsrMatrix*)cusp->mat->mat;
3058:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3059:     thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3060:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3061:   }

3063:   A->offloadmask = PETSC_OFFLOAD_CPU;
3064:   A->assembled = PETSC_FALSE;
3065:   A->was_assembled = PETSC_FALSE;
3066:   return(0);
3067: }
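
/* A minimal COO assembly sketch (hypothetical indices and values, for illustration only).
   MatSetPreallocationCOO() builds the CSR pattern on the GPU; with ADD_VALUES, duplicate (i,j)
   entries passed to MatSetValuesCOO() are summed:

     PetscInt    coo_i[3] = {0,0,1}, coo_j[3] = {0,1,1};
     PetscScalar v[3]     = {1.0,2.0,3.0};
     MatSetPreallocationCOO(A,3,coo_i,coo_j);
     MatSetValuesCOO(A,v,ADD_VALUES);
*/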