Actual source code: aijcusparse.cu

petsc-master 2020-08-09
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10:  #include <../src/mat/impls/aij/seq/aij.h>
 11:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12:  #include <../src/vec/vec/impls/dvecimpl.h>
 13:  #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 36: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 37: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 38: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 40: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 41: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 42: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 43: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 44: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 45: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 47: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 48: {
 49:   cusparseStatus_t   stat;
 50:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 53:   cusparsestruct->stream = stream;
 54:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 55:   return(0);
 56: }

 58: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 59: {
 60:   cusparseStatus_t   stat;
 61:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 64:   if (cusparsestruct->handle != handle) {
 65:     if (cusparsestruct->handle) {
 66:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
 67:     }
 68:     cusparsestruct->handle = handle;
 69:   }
 70:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
 71:   return(0);
 72: }

 74: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 75: {
 76:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 79:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
 80:   return(0);
 81: }

 83: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 84: {
 86:   *type = MATSOLVERCUSPARSE;
 87:   return(0);
 88: }

 90: /*MC
 91:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 92:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 93:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 94:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 95:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 96:   algorithms are not recommended. This class does NOT support direct solver operations.

 98:   Level: beginner

100: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101: M*/

103: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
104: {
106:   PetscInt       n = A->rmap->n;

109:   MatCreate(PetscObjectComm((PetscObject)A),B);
110:   MatSetSizes(*B,n,n,n,n);
111:   (*B)->factortype = ftype;
112:   (*B)->useordering = PETSC_TRUE;
113:   MatSetType(*B,MATSEQAIJCUSPARSE);

115:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
116:     MatSetBlockSizesFromMats(*B,A,A);
117:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
118:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
119:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
120:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
121:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
122:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

124:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
125:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
126:   return(0);
127: }

129: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
130: {
131:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

134:   switch (op) {
135:   case MAT_CUSPARSE_MULT:
136:     cusparsestruct->format = format;
137:     break;
138:   case MAT_CUSPARSE_ALL:
139:     cusparsestruct->format = format;
140:     break;
141:   default:
142:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
143:   }
144:   return(0);
145: }

147: /*@
148:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149:    operation. Only the MatMult operation can use different GPU storage formats
150:    for MPIAIJCUSPARSE matrices.
151:    Not Collective

153:    Input Parameters:
154: +  A - Matrix of type SEQAIJCUSPARSE
155: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

158:    Output Parameter:

160:    Level: intermediate

162: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163: @*/
164: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
165: {

170:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
171:   return(0);
172: }

174: /*@
175:    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose

177:    Collective on mat

179:    Input Parameters:
180: +  A - Matrix of type SEQAIJCUSPARSE
181: -  transgen - the boolean flag

183:    Level: intermediate

185: .seealso: MATSEQAIJCUSPARSE
186: @*/
187: PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
188: {
190:   PetscBool      flg;

194:   PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
195:   if (flg) {
196:     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

198:     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
199:     cusp->transgen = transgen;
200:     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
201:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
202:     }
203:   }
204:   return(0);
205: }

207: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
208: {
209:   PetscErrorCode           ierr;
210:   MatCUSPARSEStorageFormat format;
211:   PetscBool                flg;
212:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

215:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
216:   if (A->factortype == MAT_FACTOR_NONE) {
217:     PetscBool transgen = cusparsestruct->transgen;

219:     PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
220:     if (flg) {
221:       MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);
222:     }
223:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
224:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
225:     if (flg) {
226:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
227:     }
228:   }
229:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
230:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
231:   if (flg) {
232:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
233:   }
234:   PetscOptionsTail();
235:   return(0);
236: }

238: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
239: {

243:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
244:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
245:   return(0);
246: }

248: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
249: {

253:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
254:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
255:   return(0);
256: }

258: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
259: {

263:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
264:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
265:   return(0);
266: }

268: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
269: {

273:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
274:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
275:   return(0);
276: }

278: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
279: {
280:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
281:   PetscInt                          n = A->rmap->n;
282:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
283:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
284:   cusparseStatus_t                  stat;
285:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
286:   const MatScalar                   *aa = a->a,*v;
287:   PetscInt                          *AiLo, *AjLo;
288:   PetscScalar                       *AALo;
289:   PetscInt                          i,nz, nzLower, offset, rowOffset;
290:   PetscErrorCode                    ierr;
291:   cudaError_t                       cerr;

294:   if (!n) return(0);
295:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
296:     try {
297:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
298:       nzLower=n+ai[n]-ai[1];

300:       /* Allocate Space for the lower triangular matrix */
301:       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
302:       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
303:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

305:       /* Fill the lower triangular matrix */
306:       AiLo[0]  = (PetscInt) 0;
307:       AiLo[n]  = nzLower;
308:       AjLo[0]  = (PetscInt) 0;
309:       AALo[0]  = (MatScalar) 1.0;
310:       v        = aa;
311:       vi       = aj;
312:       offset   = 1;
313:       rowOffset= 1;
314:       for (i=1; i<n; i++) {
315:         nz = ai[i+1] - ai[i];
316:         /* additional 1 for the term on the diagonal */
317:         AiLo[i]    = rowOffset;
318:         rowOffset += nz+1;

320:         PetscArraycpy(&(AjLo[offset]), vi, nz);
321:         PetscArraycpy(&(AALo[offset]), v, nz);

323:         offset      += nz;
324:         AjLo[offset] = (PetscInt) i;
325:         AALo[offset] = (MatScalar) 1.0;
326:         offset      += 1;

328:         v  += nz;
329:         vi += nz;
330:       }

332:       /* allocate space for the triangular factor information */
333:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

335:       /* Create the matrix description */
336:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
337:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
338:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
339:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
340:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

342:       /* Create the solve analysis information */
343:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

345:       /* set the operation */
346:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

348:       /* set the matrix */
349:       loTriFactor->csrMat = new CsrMatrix;
350:       loTriFactor->csrMat->num_rows = n;
351:       loTriFactor->csrMat->num_cols = n;
352:       loTriFactor->csrMat->num_entries = nzLower;

354:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
355:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

357:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
358:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

360:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
361:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

363:       /* perform the solve analysis */
364:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
365:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
366:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
367:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

369:       /* assign the pointer. Is this really necessary? */
370:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

372:       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
373:       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
374:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
375:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
376:     } catch(char *ex) {
377:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
378:     }
379:   }
380:   return(0);
381: }

383: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
384: {
385:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
386:   PetscInt                          n = A->rmap->n;
387:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
388:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
389:   cusparseStatus_t                  stat;
390:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
391:   const MatScalar                   *aa = a->a,*v;
392:   PetscInt                          *AiUp, *AjUp;
393:   PetscScalar                       *AAUp;
394:   PetscInt                          i,nz, nzUpper, offset;
395:   PetscErrorCode                    ierr;
396:   cudaError_t                       cerr;

399:   if (!n) return(0);
400:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
401:     try {
402:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
403:       nzUpper = adiag[0]-adiag[n];

405:       /* Allocate Space for the upper triangular matrix */
406:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
407:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
408:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

410:       /* Fill the upper triangular matrix */
411:       AiUp[0]=(PetscInt) 0;
412:       AiUp[n]=nzUpper;
413:       offset = nzUpper;
414:       for (i=n-1; i>=0; i--) {
415:         v  = aa + adiag[i+1] + 1;
416:         vi = aj + adiag[i+1] + 1;

418:         /* number of elements NOT on the diagonal */
419:         nz = adiag[i] - adiag[i+1]-1;

421:         /* decrement the offset */
422:         offset -= (nz+1);

424:         /* first, set the diagonal elements */
425:         AjUp[offset] = (PetscInt) i;
426:         AAUp[offset] = (MatScalar)1./v[nz];
427:         AiUp[i]      = AiUp[i+1] - (nz+1);

429:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
430:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
431:       }

433:       /* allocate space for the triangular factor information */
434:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

436:       /* Create the matrix description */
437:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
438:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
439:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
440:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
441:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

443:       /* Create the solve analysis information */
444:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

446:       /* set the operation */
447:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

449:       /* set the matrix */
450:       upTriFactor->csrMat = new CsrMatrix;
451:       upTriFactor->csrMat->num_rows = n;
452:       upTriFactor->csrMat->num_cols = n;
453:       upTriFactor->csrMat->num_entries = nzUpper;

455:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
456:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

458:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
459:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

461:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
462:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

464:       /* perform the solve analysis */
465:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
466:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
467:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
468:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

470:       /* assign the pointer. Is this really necessary? */
471:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

473:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
474:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
475:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
476:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
477:     } catch(char *ex) {
478:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
479:     }
480:   }
481:   return(0);
482: }

484: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
485: {
486:   PetscErrorCode               ierr;
487:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
488:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
489:   IS                           isrow = a->row,iscol = a->icol;
490:   PetscBool                    row_identity,col_identity;
491:   const PetscInt               *r,*c;
492:   PetscInt                     n = A->rmap->n;

495:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
496:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
497:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

499:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
500:   cusparseTriFactors->nnz=a->nz;

502:   A->offloadmask = PETSC_OFFLOAD_BOTH;
503:   /* lower triangular indices */
504:   ISGetIndices(isrow,&r);
505:   ISIdentity(isrow,&row_identity);
506:   if (!row_identity) {
507:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
508:     cusparseTriFactors->rpermIndices->assign(r, r+n);
509:   }
510:   ISRestoreIndices(isrow,&r);

512:   /* upper triangular indices */
513:   ISGetIndices(iscol,&c);
514:   ISIdentity(iscol,&col_identity);
515:   if (!col_identity) {
516:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
517:     cusparseTriFactors->cpermIndices->assign(c, c+n);
518:   }

520:   if (!row_identity && !col_identity) {
521:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
522:   } else if(!row_identity) {
523:     PetscLogCpuToGpu(n*sizeof(PetscInt));
524:   } else if(!col_identity) {
525:     PetscLogCpuToGpu(n*sizeof(PetscInt));
526:   }

528:   ISRestoreIndices(iscol,&c);
529:   return(0);
530: }

532: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
533: {
534:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
535:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
536:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
537:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
538:   cusparseStatus_t                  stat;
539:   PetscErrorCode                    ierr;
540:   cudaError_t                       cerr;
541:   PetscInt                          *AiUp, *AjUp;
542:   PetscScalar                       *AAUp;
543:   PetscScalar                       *AALo;
544:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
545:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
546:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
547:   const MatScalar                   *aa = b->a,*v;

550:   if (!n) return(0);
551:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
552:     try {
553:       /* Allocate Space for the upper triangular matrix */
554:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
555:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
556:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
557:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

559:       /* Fill the upper triangular matrix */
560:       AiUp[0]=(PetscInt) 0;
561:       AiUp[n]=nzUpper;
562:       offset = 0;
563:       for (i=0; i<n; i++) {
564:         /* set the pointers */
565:         v  = aa + ai[i];
566:         vj = aj + ai[i];
567:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

569:         /* first, set the diagonal elements */
570:         AjUp[offset] = (PetscInt) i;
571:         AAUp[offset] = (MatScalar)1.0/v[nz];
572:         AiUp[i]      = offset;
573:         AALo[offset] = (MatScalar)1.0/v[nz];

575:         offset+=1;
576:         if (nz>0) {
577:           PetscArraycpy(&(AjUp[offset]), vj, nz);
578:           PetscArraycpy(&(AAUp[offset]), v, nz);
579:           for (j=offset; j<offset+nz; j++) {
580:             AAUp[j] = -AAUp[j];
581:             AALo[j] = AAUp[j]/v[nz];
582:           }
583:           offset+=nz;
584:         }
585:       }

587:       /* allocate space for the triangular factor information */
588:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

590:       /* Create the matrix description */
591:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
592:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
593:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
594:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
595:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

597:       /* Create the solve analysis information */
598:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

600:       /* set the operation */
601:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

603:       /* set the matrix */
604:       upTriFactor->csrMat = new CsrMatrix;
605:       upTriFactor->csrMat->num_rows = A->rmap->n;
606:       upTriFactor->csrMat->num_cols = A->cmap->n;
607:       upTriFactor->csrMat->num_entries = a->nz;

609:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
610:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

612:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
613:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

615:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
616:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

618:       /* perform the solve analysis */
619:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
620:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
621:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
622:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

624:       /* assign the pointer. Is this really necessary? */
625:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

627:       /* allocate space for the triangular factor information */
628:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

630:       /* Create the matrix description */
631:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
632:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
633:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
634:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
635:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

637:       /* Create the solve analysis information */
638:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

640:       /* set the operation */
641:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

643:       /* set the matrix */
644:       loTriFactor->csrMat = new CsrMatrix;
645:       loTriFactor->csrMat->num_rows = A->rmap->n;
646:       loTriFactor->csrMat->num_cols = A->cmap->n;
647:       loTriFactor->csrMat->num_entries = a->nz;

649:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
650:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

652:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
653:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

655:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
657:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

659:       /* perform the solve analysis */
660:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
661:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
662:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
663:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

665:       /* assign the pointer. Is this really necessary? */
666:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

668:       A->offloadmask = PETSC_OFFLOAD_BOTH;
669:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
670:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
671:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
672:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
673:     } catch(char *ex) {
674:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
675:     }
676:   }
677:   return(0);
678: }

680: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
681: {
682:   PetscErrorCode               ierr;
683:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
684:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
685:   IS                           ip = a->row;
686:   const PetscInt               *rip;
687:   PetscBool                    perm_identity;
688:   PetscInt                     n = A->rmap->n;

691:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
692:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
693:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
694:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

696:   /* lower triangular indices */
697:   ISGetIndices(ip,&rip);
698:   ISIdentity(ip,&perm_identity);
699:   if (!perm_identity) {
700:     IS             iip;
701:     const PetscInt *irip;

703:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
704:     ISGetIndices(iip,&irip);
705:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
706:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
707:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
708:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
709:     ISRestoreIndices(iip,&irip);
710:     ISDestroy(&iip);
711:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
712:   }
713:   ISRestoreIndices(ip,&rip);
714:   return(0);
715: }

717: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
718: {
719:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
720:   IS             isrow = b->row,iscol = b->col;
721:   PetscBool      row_identity,col_identity;

725:   MatLUFactorNumeric_SeqAIJ(B,A,info);
726:   B->offloadmask = PETSC_OFFLOAD_CPU;
727:   /* determine which version of MatSolve needs to be used. */
728:   ISIdentity(isrow,&row_identity);
729:   ISIdentity(iscol,&col_identity);
730:   if (row_identity && col_identity) {
731:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
732:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
733:     B->ops->matsolve = NULL;
734:     B->ops->matsolvetranspose = NULL;
735:   } else {
736:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
737:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
738:     B->ops->matsolve = NULL;
739:     B->ops->matsolvetranspose = NULL;
740:   }

742:   /* get the triangular factors */
743:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
744:   return(0);
745: }

747: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
748: {
749:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
750:   IS             ip = b->row;
751:   PetscBool      perm_identity;

755:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
756:   B->offloadmask = PETSC_OFFLOAD_CPU;
757:   /* determine which version of MatSolve needs to be used. */
758:   ISIdentity(ip,&perm_identity);
759:   if (perm_identity) {
760:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
761:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
762:     B->ops->matsolve = NULL;
763:     B->ops->matsolvetranspose = NULL;
764:   } else {
765:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
766:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
767:     B->ops->matsolve = NULL;
768:     B->ops->matsolvetranspose = NULL;
769:   }

771:   /* get the triangular factors */
772:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
773:   return(0);
774: }

776: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
777: {
778:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
779:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
780:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
781:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
782:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
783:   cusparseStatus_t                  stat;
784:   cusparseIndexBase_t               indexBase;
785:   cusparseMatrixType_t              matrixType;
786:   cusparseFillMode_t                fillMode;
787:   cusparseDiagType_t                diagType;


791:   /*********************************************/
792:   /* Now the Transpose of the Lower Tri Factor */
793:   /*********************************************/

795:   /* allocate space for the transpose of the lower triangular factor */
796:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

798:   /* set the matrix descriptors of the lower triangular factor */
799:   matrixType = cusparseGetMatType(loTriFactor->descr);
800:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
801:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
802:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
803:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

805:   /* Create the matrix description */
806:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
807:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
808:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
809:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
810:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

812:   /* Create the solve analysis information */
813:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

815:   /* set the operation */
816:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

818:   /* allocate GPU space for the CSC of the lower triangular factor*/
819:   loTriFactorT->csrMat = new CsrMatrix;
820:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
821:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
822:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
823:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
824:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
825:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

827:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
828:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
829:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
830:                           loTriFactor->csrMat->values->data().get(),
831:                           loTriFactor->csrMat->row_offsets->data().get(),
832:                           loTriFactor->csrMat->column_indices->data().get(),
833:                           loTriFactorT->csrMat->values->data().get(),
834:                           loTriFactorT->csrMat->column_indices->data().get(),
835:                           loTriFactorT->csrMat->row_offsets->data().get(),
836:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

838:   /* perform the solve analysis on the transposed matrix */
839:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
840:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
841:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
842:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
843:                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

845:   /* assign the pointer. Is this really necessary? */
846:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

848:   /*********************************************/
849:   /* Now the Transpose of the Upper Tri Factor */
850:   /*********************************************/

852:   /* allocate space for the transpose of the upper triangular factor */
853:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

855:   /* set the matrix descriptors of the upper triangular factor */
856:   matrixType = cusparseGetMatType(upTriFactor->descr);
857:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
858:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
859:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
860:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

862:   /* Create the matrix description */
863:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
864:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
865:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
866:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
867:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

869:   /* Create the solve analysis information */
870:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

872:   /* set the operation */
873:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

875:   /* allocate GPU space for the CSC of the upper triangular factor*/
876:   upTriFactorT->csrMat = new CsrMatrix;
877:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
878:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
879:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
880:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
881:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
882:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

884:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
885:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
886:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
887:                           upTriFactor->csrMat->values->data().get(),
888:                           upTriFactor->csrMat->row_offsets->data().get(),
889:                           upTriFactor->csrMat->column_indices->data().get(),
890:                           upTriFactorT->csrMat->values->data().get(),
891:                           upTriFactorT->csrMat->column_indices->data().get(),
892:                           upTriFactorT->csrMat->row_offsets->data().get(),
893:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

895:   /* perform the solve analysis on the transposed matrix */
896:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
897:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
898:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
899:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
900:                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

902:   /* assign the pointer. Is this really necessary? */
903:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
904:   return(0);
905: }

907: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
908: {
909:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
910:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
911:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
912:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
913:   cusparseStatus_t             stat;
914:   cusparseIndexBase_t          indexBase;
915:   cudaError_t                  err;
916:   PetscErrorCode               ierr;

919:   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
920:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
921:   PetscLogGpuTimeBegin();
922:   /* create cusparse matrix */
923:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
924:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
925:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
926:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
927:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

929:   /* set alpha and beta */
930:   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
931:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
932:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
933:   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
934:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
935:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
936:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

938:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
939:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
940:     CsrMatrix *matrixT= new CsrMatrix;
941:     matrixT->num_rows = A->cmap->n;
942:     matrixT->num_cols = A->rmap->n;
943:     matrixT->num_entries = a->nz;
944:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
945:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
946:     matrixT->values = new THRUSTARRAY(a->nz);

948:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
949:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
950:     /* compute the transpose, i.e. the CSC */
951:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
952:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
953:                             A->cmap->n, matrix->num_entries,
954:                             matrix->values->data().get(),
955:                             cusparsestruct->rowoffsets_gpu->data().get(),
956:                             matrix->column_indices->data().get(),
957:                             matrixT->values->data().get(),
958:                             matrixT->column_indices->data().get(),
959:                             matrixT->row_offsets->data().get(),
960:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
961:     matstructT->mat = matrixT;
962:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
963:     CsrMatrix *temp = new CsrMatrix;
964:     CsrMatrix *tempT = new CsrMatrix;

966:     /* First convert HYB to CSR */
967:     temp->num_rows = A->rmap->n;
968:     temp->num_cols = A->cmap->n;
969:     temp->num_entries = a->nz;
970:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
971:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
972:     temp->values = new THRUSTARRAY(a->nz);

974:     stat = cusparse_hyb2csr(cusparsestruct->handle,
975:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
976:                             temp->values->data().get(),
977:                             temp->row_offsets->data().get(),
978:                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);

980:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
981:     tempT->num_rows = A->rmap->n;
982:     tempT->num_cols = A->cmap->n;
983:     tempT->num_entries = a->nz;
984:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
985:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
986:     tempT->values = new THRUSTARRAY(a->nz);

988:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
989:                             temp->num_cols, temp->num_entries,
990:                             temp->values->data().get(),
991:                             temp->row_offsets->data().get(),
992:                             temp->column_indices->data().get(),
993:                             tempT->values->data().get(),
994:                             tempT->column_indices->data().get(),
995:                             tempT->row_offsets->data().get(),
996:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

998:     /* Last, convert CSC to HYB */
999:     cusparseHybMat_t hybMat;
1000:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1001:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1002:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1003:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1004:                             matstructT->descr, tempT->values->data().get(),
1005:                             tempT->row_offsets->data().get(),
1006:                             tempT->column_indices->data().get(),
1007:                             hybMat, 0, partition);CHKERRCUSPARSE(stat);

1009:     /* assign the pointer */
1010:     matstructT->mat = hybMat;
1011:     /* delete temporaries */
1012:     if (tempT) {
1013:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1014:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1015:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1016:       delete (CsrMatrix*) tempT;
1017:     }
1018:     if (temp) {
1019:       if (temp->values) delete (THRUSTARRAY*) temp->values;
1020:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1021:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1022:       delete (CsrMatrix*) temp;
1023:     }
1024:   }
1025:   err  = WaitForGPU();CHKERRCUDA(err);
1026:   PetscLogGpuTimeEnd();
1027:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1028:   /* the compressed row indices is not used for matTranspose */
1029:   matstructT->cprowIndices = NULL;
1030:   /* assign the pointer */
1031:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1032:   return(0);
1033: }

1035: /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1036: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1037: {
1038:   PetscInt                              n = xx->map->n;
1039:   const PetscScalar                     *barray;
1040:   PetscScalar                           *xarray;
1041:   thrust::device_ptr<const PetscScalar> bGPU;
1042:   thrust::device_ptr<PetscScalar>       xGPU;
1043:   cusparseStatus_t                      stat;
1044:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1045:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1046:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1047:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1048:   PetscErrorCode                        ierr;
1049:   cudaError_t                           cerr;

1052:   /* Analyze the matrix and create the transpose ... on the fly */
1053:   if (!loTriFactorT && !upTriFactorT) {
1054:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1055:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1056:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1057:   }

1059:   /* Get the GPU pointers */
1060:   VecCUDAGetArrayWrite(xx,&xarray);
1061:   VecCUDAGetArrayRead(bb,&barray);
1062:   xGPU = thrust::device_pointer_cast(xarray);
1063:   bGPU = thrust::device_pointer_cast(barray);

1065:   PetscLogGpuTimeBegin();
1066:   /* First, reorder with the row permutation */
1067:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1068:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1069:                xGPU);

1071:   /* First, solve U */
1072:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1073:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1074:                         upTriFactorT->csrMat->values->data().get(),
1075:                         upTriFactorT->csrMat->row_offsets->data().get(),
1076:                         upTriFactorT->csrMat->column_indices->data().get(),
1077:                         upTriFactorT->solveInfo,
1078:                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1080:   /* Then, solve L */
1081:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1082:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1083:                         loTriFactorT->csrMat->values->data().get(),
1084:                         loTriFactorT->csrMat->row_offsets->data().get(),
1085:                         loTriFactorT->csrMat->column_indices->data().get(),
1086:                         loTriFactorT->solveInfo,
1087:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1089:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1090:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1091:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1092:                tempGPU->begin());

1094:   /* Copy the temporary to the full solution. */
1095:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1097:   /* restore */
1098:   VecCUDARestoreArrayRead(bb,&barray);
1099:   VecCUDARestoreArrayWrite(xx,&xarray);
1100:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1101:   PetscLogGpuTimeEnd();
1102:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1103:   return(0);
1104: }

1106: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1107: {
1108:   const PetscScalar                 *barray;
1109:   PetscScalar                       *xarray;
1110:   cusparseStatus_t                  stat;
1111:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1112:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1113:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1114:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1115:   PetscErrorCode                    ierr;
1116:   cudaError_t                       cerr;

1119:   /* Analyze the matrix and create the transpose ... on the fly */
1120:   if (!loTriFactorT && !upTriFactorT) {
1121:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1122:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1123:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1124:   }

1126:   /* Get the GPU pointers */
1127:   VecCUDAGetArrayWrite(xx,&xarray);
1128:   VecCUDAGetArrayRead(bb,&barray);

1130:   PetscLogGpuTimeBegin();
1131:   /* First, solve U */
1132:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1133:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1134:                         upTriFactorT->csrMat->values->data().get(),
1135:                         upTriFactorT->csrMat->row_offsets->data().get(),
1136:                         upTriFactorT->csrMat->column_indices->data().get(),
1137:                         upTriFactorT->solveInfo,
1138:                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1140:   /* Then, solve L */
1141:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1142:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1143:                         loTriFactorT->csrMat->values->data().get(),
1144:                         loTriFactorT->csrMat->row_offsets->data().get(),
1145:                         loTriFactorT->csrMat->column_indices->data().get(),
1146:                         loTriFactorT->solveInfo,
1147:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1149:   /* restore */
1150:   VecCUDARestoreArrayRead(bb,&barray);
1151:   VecCUDARestoreArrayWrite(xx,&xarray);
1152:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1153:   PetscLogGpuTimeEnd();
1154:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1155:   return(0);
1156: }

1158: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1159: {
1160:   const PetscScalar                     *barray;
1161:   PetscScalar                           *xarray;
1162:   thrust::device_ptr<const PetscScalar> bGPU;
1163:   thrust::device_ptr<PetscScalar>       xGPU;
1164:   cusparseStatus_t                      stat;
1165:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1166:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1167:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1168:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1169:   PetscErrorCode                        ierr;
1170:   cudaError_t                           cerr;


1174:   /* Get the GPU pointers */
1175:   VecCUDAGetArrayWrite(xx,&xarray);
1176:   VecCUDAGetArrayRead(bb,&barray);
1177:   xGPU = thrust::device_pointer_cast(xarray);
1178:   bGPU = thrust::device_pointer_cast(barray);

1180:   PetscLogGpuTimeBegin();
1181:   /* First, reorder with the row permutation */
1182:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1183:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1184:                tempGPU->begin());

1186:   /* Next, solve L */
1187:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1188:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1189:                         loTriFactor->csrMat->values->data().get(),
1190:                         loTriFactor->csrMat->row_offsets->data().get(),
1191:                         loTriFactor->csrMat->column_indices->data().get(),
1192:                         loTriFactor->solveInfo,
1193:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1195:   /* Then, solve U */
1196:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1197:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1198:                         upTriFactor->csrMat->values->data().get(),
1199:                         upTriFactor->csrMat->row_offsets->data().get(),
1200:                         upTriFactor->csrMat->column_indices->data().get(),
1201:                         upTriFactor->solveInfo,
1202:                         xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1204:   /* Last, reorder with the column permutation */
1205:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1206:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1207:                xGPU);

1209:   VecCUDARestoreArrayRead(bb,&barray);
1210:   VecCUDARestoreArrayWrite(xx,&xarray);
1211:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1212:   PetscLogGpuTimeEnd();
1213:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1214:   return(0);
1215: }

1217: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1218: {
1219:   const PetscScalar                 *barray;
1220:   PetscScalar                       *xarray;
1221:   cusparseStatus_t                  stat;
1222:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1223:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1224:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1225:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1226:   PetscErrorCode                    ierr;
1227:   cudaError_t                       cerr;

1230:   /* Get the GPU pointers */
1231:   VecCUDAGetArrayWrite(xx,&xarray);
1232:   VecCUDAGetArrayRead(bb,&barray);

1234:   PetscLogGpuTimeBegin();
1235:   /* First, solve L */
1236:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1237:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1238:                         loTriFactor->csrMat->values->data().get(),
1239:                         loTriFactor->csrMat->row_offsets->data().get(),
1240:                         loTriFactor->csrMat->column_indices->data().get(),
1241:                         loTriFactor->solveInfo,
1242:                         barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

1244:   /* Next, solve U */
1245:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1246:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1247:                         upTriFactor->csrMat->values->data().get(),
1248:                         upTriFactor->csrMat->row_offsets->data().get(),
1249:                         upTriFactor->csrMat->column_indices->data().get(),
1250:                         upTriFactor->solveInfo,
1251:                         tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

1253:   VecCUDARestoreArrayRead(bb,&barray);
1254:   VecCUDARestoreArrayWrite(xx,&xarray);
1255:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1256:   PetscLogGpuTimeEnd();
1257:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1258:   return(0);
1259: }

1261: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1262: {
1263:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1264:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1265:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1266:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1267:   PetscErrorCode               ierr;
1268:   cusparseStatus_t             stat;
1269:   cudaError_t                  err;

1272:   if (A->boundtocpu) return(0);
1273:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1274:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1275:       /* Copy values only */
1276:       CsrMatrix *mat,*matT;
1277:       mat  = (CsrMatrix*)cusparsestruct->mat->mat;

1279:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1280:       PetscLogGpuTimeBegin();
1281:       mat->values->assign(a->a, a->a+a->nz);
1282:       err  = WaitForGPU();CHKERRCUDA(err);
1283:       PetscLogGpuTimeEnd();
1284:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1285:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);

1287:       /* Update matT when it was built before */
1288:       if (cusparsestruct->matTranspose) {
1289:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1290:         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1291:         PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1292:         PetscLogGpuTimeBegin();
1293:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1294:                                  A->cmap->n, mat->num_entries,
1295:                                  mat->values->data().get(),
1296:                                  cusparsestruct->rowoffsets_gpu->data().get(),
1297:                                  mat->column_indices->data().get(),
1298:                                  matT->values->data().get(),
1299:                                  matT->column_indices->data().get(),
1300:                                  matT->row_offsets->data().get(),
1301:                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1302:         err  = WaitForGPU();CHKERRCUDA(err);
1303:         PetscLogGpuTimeEnd();
1304:         PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1305:       }
1306:     } else {
1307:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1308:       PetscLogGpuTimeBegin();
1309:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1310:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1311:       delete cusparsestruct->workVector;
1312:       delete cusparsestruct->rowoffsets_gpu;
1313:       try {
1314:         if (a->compressedrow.use) {
1315:           m    = a->compressedrow.nrows;
1316:           ii   = a->compressedrow.i;
1317:           ridx = a->compressedrow.rindex;
1318:         } else {
1319:           m    = A->rmap->n;
1320:           ii   = a->i;
1321:           ridx = NULL;
1322:         }
1323:         cusparsestruct->nrows = m;

1325:         /* create cusparse matrix */
1326:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1327:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1328:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1329:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1331:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1332:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1333:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1334:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1335:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1336:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1337:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1339:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1340:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1341:           /* set the matrix */
1342:           CsrMatrix *matrix= new CsrMatrix;
1343:           matrix->num_rows = m;
1344:           matrix->num_cols = A->cmap->n;
1345:           matrix->num_entries = a->nz;
1346:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1347:           matrix->row_offsets->assign(ii, ii + m+1);

1349:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1350:           matrix->column_indices->assign(a->j, a->j+a->nz);

1352:           matrix->values = new THRUSTARRAY(a->nz);
1353:           matrix->values->assign(a->a, a->a+a->nz);

1355:           /* assign the pointer */
1356:           matstruct->mat = matrix;

1358:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1359:           CsrMatrix *matrix= new CsrMatrix;
1360:           matrix->num_rows = m;
1361:           matrix->num_cols = A->cmap->n;
1362:           matrix->num_entries = a->nz;
1363:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1364:           matrix->row_offsets->assign(ii, ii + m+1);

1366:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1367:           matrix->column_indices->assign(a->j, a->j+a->nz);

1369:           matrix->values = new THRUSTARRAY(a->nz);
1370:           matrix->values->assign(a->a, a->a+a->nz);

1372:           cusparseHybMat_t hybMat;
1373:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1374:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1375:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1376:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1377:               matstruct->descr, matrix->values->data().get(),
1378:               matrix->row_offsets->data().get(),
1379:               matrix->column_indices->data().get(),
1380:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1381:           /* assign the pointer */
1382:           matstruct->mat = hybMat;

1384:           if (matrix) {
1385:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1386:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1387:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1388:             delete (CsrMatrix*)matrix;
1389:           }
1390:         }

1392:         /* assign the compressed row indices */
1393:         if (a->compressedrow.use) {
1394:           cusparsestruct->workVector = new THRUSTARRAY(m);
1395:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1396:           matstruct->cprowIndices->assign(ridx,ridx+m);
1397:           tmp = m;
1398:         } else {
1399:           cusparsestruct->workVector = NULL;
1400:           matstruct->cprowIndices    = NULL;
1401:           tmp = 0;
1402:         }
1403:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1405:         /* assign the pointer */
1406:         cusparsestruct->mat = matstruct;
1407:       } catch(char *ex) {
1408:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1409:       }
1410:       err  = WaitForGPU();CHKERRCUDA(err);
1411:       PetscLogGpuTimeEnd();
1412:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1413:       cusparsestruct->nonzerostate = A->nonzerostate;
1414:     }
1415:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1416:   }
1417:   return(0);
1418: }

1420: struct VecCUDAPlusEquals
1421: {
1422:   template <typename Tuple>
1423:   __host__ __device__
1424:   void operator()(Tuple t)
1425:   {
1426:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1427:   }
1428: };

1430: struct VecCUDAEqualsReverse
1431: {
1432:   template <typename Tuple>
1433:   __host__ __device__
1434:   void operator()(Tuple t)
1435:   {
1436:     thrust::get<0>(t) = thrust::get<1>(t);
1437:   }
1438: };

1440: typedef struct {
1441:   PetscBool   cisdense;
1442:   PetscScalar *Bt;
1443:   Mat         X;
1444: } MatMatCusparse;

1446: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1447: {
1449:   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1450:   cudaError_t    cerr;

1453:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1454:   MatDestroy(&mmdata->X);
1455:   PetscFree(data);
1456:   return(0);
1457: }

1459: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1461: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1462: {
1463:   Mat_Product                  *product = C->product;
1464:   Mat                          A,B;
1465:   PetscInt                     m,n,k,blda,clda;
1466:   PetscBool                    flg,biscuda;
1467:   Mat_SeqAIJCUSPARSE           *cusp;
1468:   cusparseStatus_t             stat;
1469:   cusparseOperation_t          opA;
1470:   cusparseMatDescr_t           matA;
1471:   const PetscScalar            *barray;
1472:   PetscScalar                  *carray;
1473:   PetscErrorCode               ierr;
1474:   MatMatCusparse               *mmdata;
1475:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1476:   CsrMatrix                    *csrmat;
1477:   cudaError_t                  cuer;

1480:   MatCheckProduct(C,1);
1481:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1482:   mmdata = (MatMatCusparse*)product->data;
1483:   A    = product->A;
1484:   B    = product->B;
1485:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1486:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1487:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1488:      Instead of silently accepting the wrong answer, I prefer to raise the error */
1489:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1490:   MatSeqAIJCUSPARSECopyToGPU(A);
1491:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1492:   switch (product->type) {
1493:   case MATPRODUCT_AB:
1494:   case MATPRODUCT_PtAP:
1495:     mat = cusp->mat;
1496:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1497:     m   = A->rmap->n;
1498:     k   = A->cmap->n;
1499:     n   = B->cmap->n;
1500:     break;
1501:   case MATPRODUCT_AtB:
1502:     if (!cusp->transgen) {
1503:       mat = cusp->mat;
1504:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1505:     } else {
1506:       MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1507:       mat  = cusp->matTranspose;
1508:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1509:     }
1510:     m = A->cmap->n;
1511:     k = A->rmap->n;
1512:     n = B->cmap->n;
1513:     break;
1514:   case MATPRODUCT_ABt:
1515:   case MATPRODUCT_RARt:
1516:     mat = cusp->mat;
1517:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1518:     m   = A->rmap->n;
1519:     k   = B->cmap->n;
1520:     n   = B->rmap->n;
1521:     break;
1522:   default:
1523:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1524:   }
1525:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1526:   matA   = mat->descr;
1527:   csrmat = (CsrMatrix*)mat->mat;
1528:   /* if the user passed a CPU matrix, copy the data to the GPU */
1529:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1530:   if (!biscuda) {
1531:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1532:   }
1533:   MatDenseCUDAGetArrayRead(B,&barray);
1534:   MatDenseGetLDA(B,&blda);
1535:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1536:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1537:     MatDenseGetLDA(mmdata->X,&clda);
1538:   } else {
1539:     MatDenseCUDAGetArrayWrite(C,&carray);
1540:     MatDenseGetLDA(C,&clda);
1541:   }

1543:   PetscLogGpuTimeBegin();

1545:   /* cusparse spmm does not support transpose on B */
1546:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1547:     cublasHandle_t cublasv2handle;
1548:     cublasStatus_t cerr;

1550:     PetscCUBLASGetHandle(&cublasv2handle);
1551:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1552:                        B->cmap->n,B->rmap->n,
1553:                        &PETSC_CUSPARSE_ONE ,barray,blda,
1554:                        &PETSC_CUSPARSE_ZERO,barray,blda,
1555:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1556:     blda = B->cmap->n;
1557:   }

1559:   /* perform the MatMat operation */
1560:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1561:                            csrmat->num_entries,mat->alpha,matA,
1562:                            csrmat->values->data().get(),
1563:                            csrmat->row_offsets->data().get(),
1564:                            csrmat->column_indices->data().get(),
1565:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1566:                            carray,clda);CHKERRCUSPARSE(stat);
1567:   cuer = WaitForGPU();CHKERRCUDA(cuer);
1568:   PetscLogGpuTimeEnd();
1569:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1570:   MatDenseCUDARestoreArrayRead(B,&barray);
1571:   if (product->type == MATPRODUCT_RARt) {
1572:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1573:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1574:   } else if (product->type == MATPRODUCT_PtAP) {
1575:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1576:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1577:   } else {
1578:     MatDenseCUDARestoreArrayWrite(C,&carray);
1579:   }
1580:   if (mmdata->cisdense) {
1581:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1582:   }
1583:   if (!biscuda) {
1584:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1585:   }
1586:   return(0);
1587: }

1589: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1590: {
1591:   Mat_Product        *product = C->product;
1592:   Mat                A,B;
1593:   PetscInt           m,n;
1594:   PetscBool          cisdense,flg;
1595:   PetscErrorCode     ierr;
1596:   MatMatCusparse     *mmdata;
1597:   Mat_SeqAIJCUSPARSE *cusp;

1600:   MatCheckProduct(C,1);
1601:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1602:   A    = product->A;
1603:   B    = product->B;
1604:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1605:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1606:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1607:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1608:   switch (product->type) {
1609:   case MATPRODUCT_AB:
1610:     m = A->rmap->n;
1611:     n = B->cmap->n;
1612:     break;
1613:   case MATPRODUCT_AtB:
1614:     m = A->cmap->n;
1615:     n = B->cmap->n;
1616:     break;
1617:   case MATPRODUCT_ABt:
1618:     m = A->rmap->n;
1619:     n = B->rmap->n;
1620:     break;
1621:   case MATPRODUCT_PtAP:
1622:     m = B->cmap->n;
1623:     n = B->cmap->n;
1624:     break;
1625:   case MATPRODUCT_RARt:
1626:     m = B->rmap->n;
1627:     n = B->rmap->n;
1628:     break;
1629:   default:
1630:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1631:   }
1632:   MatSetSizes(C,m,n,m,n);
1633:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1634:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1635:   MatSetType(C,MATSEQDENSECUDA);

1637:   /* product data */
1638:   PetscNew(&mmdata);
1639:   mmdata->cisdense = cisdense;
1640:   /* cusparse spmm does not support transpose on B */
1641:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1642:     cudaError_t cerr;

1644:     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1645:   }
1646:   /* for these products we need intermediate storage */
1647:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1648:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1649:     MatSetType(mmdata->X,MATSEQDENSECUDA);
1650:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1651:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1652:     } else {
1653:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1654:     }
1655:   }
1656:   C->product->data    = mmdata;
1657:   C->product->destroy = MatDestroy_MatMatCusparse;

1659:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1660:   return(0);
1661: }

1663: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

1665: /* handles dense B */
1666: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1667: {
1668:   Mat_Product    *product = C->product;

1672:   MatCheckProduct(C,1);
1673:   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1674:   if (product->A->boundtocpu) {
1675:     MatProductSetFromOptions_SeqAIJ_SeqDense(C);
1676:     return(0);
1677:   }
1678:   switch (product->type) {
1679:   case MATPRODUCT_AB:
1680:   case MATPRODUCT_AtB:
1681:   case MATPRODUCT_ABt:
1682:   case MATPRODUCT_PtAP:
1683:   case MATPRODUCT_RARt:
1684:     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1685:   default:
1686:     break;
1687:   }
1688:   return(0);
1689: }

1691: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1692: {

1696:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
1697:   return(0);
1698: }

1700: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1701: {

1705:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
1706:   return(0);
1707: }

1709: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1710: {

1714:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
1715:   return(0);
1716: }

1718: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1719: {

1723:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
1724:   return(0);
1725: }

1727: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1728: {

1732:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
1733:   return(0);
1734: }

1736: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
1737: {
1738:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1739:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1740:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1741:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
1742:   PetscErrorCode               ierr;
1743:   cudaError_t                  cerr;
1744:   cusparseStatus_t             stat;
1745:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1746:   PetscBool                    compressed;

1749:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
1750:   if (!a->nonzerorowcnt) {
1751:     if (!yy) {
1752:       VecSet_SeqCUDA(zz,0);
1753:     }
1754:     return(0);
1755:   }
1756:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1757:   MatSeqAIJCUSPARSECopyToGPU(A);
1758:   if (!trans) {
1759:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1760:   } else {
1761:     if (herm || !cusparsestruct->transgen) {
1762:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
1763:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1764:     } else {
1765:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1766:       if (!matstruct) {
1767:         MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1768:         matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1769:       }
1770:     }
1771:   }
1772:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
1773:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

1775:   try {
1776:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
1777:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
1778:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
1779:     PetscLogGpuTimeBegin();
1780:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1781:       xptr = xarray;
1782:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
1783:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
1784:     } else {
1785:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
1786:       dptr = zarray;
1787:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;

1789:       if (compressed) {
1790:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

1792:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
1793:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1794:                          VecCUDAEqualsReverse());
1795:       }
1796:     }

1798:     /* csr_spmv does y = alpha*Ax + beta*y */
1799:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1800:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1801:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
1802:                                mat->num_rows, mat->num_cols,
1803:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1804:                                mat->values->data().get(), mat->row_offsets->data().get(),
1805:                                mat->column_indices->data().get(), xptr, beta,
1806:                                dptr);CHKERRCUSPARSE(stat);
1807:     } else {
1808:       if (cusparsestruct->nrows) {
1809:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1810:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
1811:                                  matstruct->alpha, matstruct->descr, hybMat,
1812:                                  xptr, beta,
1813:                                  dptr);CHKERRCUSPARSE(stat);
1814:       }
1815:     }
1816:     cerr = WaitForGPU();CHKERRCUDA(cerr);
1817:     PetscLogGpuTimeEnd();

1819:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1820:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
1821:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1822:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
1823:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
1824:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
1825:         }
1826:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1827:         VecSet_SeqCUDA(zz,0);
1828:       }

1830:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
1831:       if (compressed) {
1832:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

1834:         PetscLogGpuTimeBegin();
1835:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1836:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1837:                          VecCUDAPlusEquals());
1838:         cerr = WaitForGPU();CHKERRCUDA(cerr);
1839:         PetscLogGpuTimeEnd();
1840:       }
1841:     } else {
1842:       if (yy && yy != zz) {
1843:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
1844:       }
1845:     }
1846:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
1847:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
1848:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
1849:   } catch(char *ex) {
1850:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1851:   }
1852:   if (yy) {
1853:     PetscLogGpuFlops(2.0*a->nz);
1854:   } else {
1855:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
1856:   }
1857:   return(0);
1858: }

1860: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1861: {

1865:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
1866:   return(0);
1867: }

1869: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1870: {

1874:   MatAssemblyEnd_SeqAIJ(A,mode);
1875:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
1876:   if (A->factortype == MAT_FACTOR_NONE) {
1877:     MatSeqAIJCUSPARSECopyToGPU(A);
1878:   }
1879:   return(0);
1880: }

1882: /* --------------------------------------------------------------------------------*/
1883: /*@
1884:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1885:    (the default parallel PETSc format). This matrix will ultimately pushed down
1886:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1887:    assembly performance the user should preallocate the matrix storage by setting
1888:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1889:    performance during matrix assembly can be increased by more than a factor of 50.

1891:    Collective

1893:    Input Parameters:
1894: +  comm - MPI communicator, set to PETSC_COMM_SELF
1895: .  m - number of rows
1896: .  n - number of columns
1897: .  nz - number of nonzeros per row (same for all rows)
1898: -  nnz - array containing the number of nonzeros in the various rows
1899:          (possibly different for each row) or NULL

1901:    Output Parameter:
1902: .  A - the matrix

1904:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1905:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1906:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1908:    Notes:
1909:    If nnz is given then nz is ignored

1911:    The AIJ format (also called the Yale sparse matrix format or
1912:    compressed row storage), is fully compatible with standard Fortran 77
1913:    storage.  That is, the stored row and column indices can begin at
1914:    either one (as in Fortran) or zero.  See the users' manual for details.

1916:    Specify the preallocated storage with either nz or nnz (not both).
1917:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1918:    allocation.  For large problems you MUST preallocate memory or you
1919:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1921:    By default, this format uses inodes (identical nodes) when possible, to
1922:    improve numerical efficiency of matrix-vector products and solves. We
1923:    search for consecutive rows with the same nonzero structure, thereby
1924:    reusing matrix information to achieve increased efficiency.

1926:    Level: intermediate

1928: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1929: @*/
1930: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1931: {

1935:   MatCreate(comm,A);
1936:   MatSetSizes(*A,m,n,m,n);
1937:   MatSetType(*A,MATSEQAIJCUSPARSE);
1938:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1939:   return(0);
1940: }

1942: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1943: {

1947:   if (A->factortype == MAT_FACTOR_NONE) {
1948:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1949:   } else {
1950:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1951:   }
1952:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
1953:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
1954:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
1955:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
1956:   MatDestroy_SeqAIJ(A);
1957:   return(0);
1958: }

1960: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1961: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
1962: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1963: {

1967:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1968:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
1969:   return(0);
1970: }

1972: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
1973: {

1977:   if (A->factortype != MAT_FACTOR_NONE) return(0);
1978:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
1979:      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1980:      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
1981:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
1982:            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1983:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
1984:   /* TODO: add support for this? */
1985:   if (flg) {
1986:     A->ops->mult                      = MatMult_SeqAIJ;
1987:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
1988:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
1989:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
1990:     A->ops->multhermitiantranspose    = NULL;
1991:     A->ops->multhermitiantransposeadd = NULL;
1992:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
1993:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
1994:   } else {
1995:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
1996:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
1997:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
1998:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
1999:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2000:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2001:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2002:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2003:   }
2004:   A->boundtocpu = flg;
2005:   return(0);
2006: }

2008: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2009: {
2010:   PetscErrorCode   ierr;
2011:   cusparseStatus_t stat;
2012:   Mat              B;

2015:   if (reuse == MAT_INITIAL_MATRIX) {
2016:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
2017:   } else if (reuse == MAT_REUSE_MATRIX) {
2018:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2019:   }
2020:   B = *newmat;

2022:   PetscFree(B->defaultvectype);
2023:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

2025:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2026:     if (B->factortype == MAT_FACTOR_NONE) {
2027:       Mat_SeqAIJCUSPARSE *spptr;

2029:       PetscNew(&spptr);
2030:       spptr->format = MAT_CUSPARSE_CSR;
2031:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2032:       B->spptr = spptr;
2033:     } else {
2034:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2036:       PetscNew(&spptr);
2037:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2038:       B->spptr = spptr;
2039:     }
2040:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2041:   }
2042:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2043:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2044:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2045:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2046:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

2048:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2049:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2050:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2051:   return(0);
2052: }

2054: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2055: {

2059:   if (!PetscCUDAInitialized) {PetscCUDAInitializeLazily();}
2060:   MatCreate_SeqAIJ(B);
2061:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2062:   return(0);
2063: }

2065: /*MC
2066:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

2068:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
2069:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2070:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

2072:    Options Database Keys:
2073: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2074: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2075: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

2077:   Level: beginner

2079: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2080: M*/

2082: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


2085: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2086: {

2090:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2091:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2092:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2093:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2094:   return(0);
2095: }

2097: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2098: {
2099:   PetscErrorCode   ierr;
2100:   cusparseStatus_t stat;
2101:   cusparseHandle_t handle;

2104:   if (*cusparsestruct) {
2105:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2106:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2107:     delete (*cusparsestruct)->workVector;
2108:     delete (*cusparsestruct)->rowoffsets_gpu;
2109:     if (handle = (*cusparsestruct)->handle) {
2110:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2111:     }
2112:     PetscFree(*cusparsestruct);
2113:   }
2114:   return(0);
2115: }

2117: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2118: {
2120:   if (*mat) {
2121:     delete (*mat)->values;
2122:     delete (*mat)->column_indices;
2123:     delete (*mat)->row_offsets;
2124:     delete *mat;
2125:     *mat = 0;
2126:   }
2127:   return(0);
2128: }

2130: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2131: {
2132:   cusparseStatus_t stat;
2133:   PetscErrorCode   ierr;

2136:   if (*trifactor) {
2137:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2138:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2139:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
2140:     delete *trifactor;
2141:     *trifactor = 0;
2142:   }
2143:   return(0);
2144: }

2146: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2147: {
2148:   CsrMatrix        *mat;
2149:   cusparseStatus_t stat;
2150:   cudaError_t      err;

2153:   if (*matstruct) {
2154:     if ((*matstruct)->mat) {
2155:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2156:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2157:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2158:       } else {
2159:         mat = (CsrMatrix*)(*matstruct)->mat;
2160:         CsrMatrix_Destroy(&mat);
2161:       }
2162:     }
2163:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2164:     delete (*matstruct)->cprowIndices;
2165:     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
2166:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2167:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2168:     delete *matstruct;
2169:     *matstruct = 0;
2170:   }
2171:   return(0);
2172: }

2174: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2175: {

2179:   if (*trifactors) {
2180:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2181:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2182:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2183:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2184:     delete (*trifactors)->rpermIndices;
2185:     delete (*trifactors)->cpermIndices;
2186:     delete (*trifactors)->workVector;
2187:     (*trifactors)->rpermIndices = 0;
2188:     (*trifactors)->cpermIndices = 0;
2189:     (*trifactors)->workVector = 0;
2190:   }
2191:   return(0);
2192: }

2194: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2195: {
2196:   PetscErrorCode   ierr;
2197:   cusparseHandle_t handle;
2198:   cusparseStatus_t stat;

2201:   if (*trifactors) {
2202:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2203:     if (handle = (*trifactors)->handle) {
2204:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2205:     }
2206:     PetscFree(*trifactors);
2207:   }
2208:   return(0);
2209: }