Actual source code: aijcusparse.cu

petsc-master 2020-05-31
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10:  #include <../src/mat/impls/aij/seq/aij.h>
 11:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12:  #include <../src/vec/vec/impls/dvecimpl.h>
 13:  #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 /* Enum-name table for MatCUSPARSEStorageFormat in the layout PetscOptionsEnum()
    expects: the value names ("CSR","ELL","HYB"), then the enum type name, then
    the option prefix, then a NULL terminator. */
 17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 37: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 38: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 39: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 40: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 41: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 42: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 /* MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU
    context (A->spptr) and directs the matrix's cuSPARSE handle to issue its
    work on that stream.
    NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE with a live
    cusparse handle -- no NULL checks are performed here; confirm callers
    guarantee this. */
 44: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 45: {
 46:   cusparseStatus_t   stat;
 47:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 50:   cusparsestruct->stream = stream;
 51:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 52:   return(0);
 53: }

 /* MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle in the
    matrix's GPU context.  If a different handle is already held it is
    destroyed first.  The handle is then switched to device pointer mode, i.e.
    scalar arguments passed to cuSPARSE calls are expected to reside in GPU
    memory. */
 55: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 56: {
 57:   cusparseStatus_t   stat;
 58:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 61:   if (cusparsestruct->handle != handle) {
 62:     if (cusparsestruct->handle) {
 63:       /* destroy the handle we owned before adopting the caller's */
 64:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
 65:     }
 66:     cusparsestruct->handle = handle;
 67:   }
 68:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
 69:   return(0);
 70: }

 /* MatCUSPARSEClearHandle - drops the matrix's reference to its cuSPARSE
    handle WITHOUT calling cusparseDestroy().  Presumably the handle is owned
    by whoever installed it via MatCUSPARSESetHandle() -- TODO confirm; if the
    handle was created internally this would leak it. */
 71: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 72: {
 73:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 76:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
 77:   return(0);
 78: }

 /* MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the
    solver package backing factored matrices of this type (queried through the
    "MatFactorGetSolverType_C" composed function). */
 80: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 81: {
 83:   *type = MATSOLVERCUSPARSE;
 84:   return(0);
 85: }

 87: /*MC
 88:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 89:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 90:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
 91:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 92:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 93:   algorithms are not recommended. This class does NOT support direct solver operations.

 95:   Level: beginner

 97: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 98: M*/

/* MatGetFactor_seqaijcusparse_cusparse - factory routine for MATSOLVERCUSPARSE:
   creates an empty square SEQAIJCUSPARSE matrix B (same local size as A) and
   wires up the symbolic-factorization entry points for the requested factor
   type.  LU/ILU/ILUDT and Cholesky/ICC are supported; any other factor type
   is rejected with PETSC_ERR_SUP. */
100: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
101: {
103:   PetscInt       n = A->rmap->n;

106:   MatCreate(PetscObjectComm((PetscObject)A),B);
107:   (*B)->factortype = ftype;
108:   MatSetSizes(*B,n,n,n,n);
109:   MatSetType(*B,MATSEQAIJCUSPARSE);

111:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
112:     MatSetBlockSizesFromMats(*B,A,A);
113:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
115:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
116:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
117:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
118:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

     /* MAT_SKIP_ALLOCATION: actual storage is created later by the symbolic factorization */
120:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
121:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
122:   return(0);
123: }

/* MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(): records the requested GPU storage format in the
   matrix's Mat_SeqAIJCUSPARSE context.  A sequential matrix has a single
   format field, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL act identically
   here; any other operation is rejected. */
125: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
126: {
127:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

130:   switch (op) {
131:   case MAT_CUSPARSE_MULT:
132:     cusparsestruct->format = format;
133:     break;
134:   case MAT_CUSPARSE_ALL:
135:     cusparsestruct->format = format;
136:     break;
137:   default:
138:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
139:   }
140:   return(0);
141: }

143: /*@
144:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
145:    operation. Only the MatMult operation can use different GPU storage formats
146:    for MPIAIJCUSPARSE matrices.
147:    Not Collective

149:    Input Parameters:
150: +  A - Matrix of type SEQAIJCUSPARSE
151: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
152: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

 154:    Output Parameter: None

156:    Level: intermediate

158: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
159: @*/
/* Public entry point (documented in the manual-page comment above): dispatches
   to the matrix type's "MatCUSPARSESetFormat_C" method if one is composed;
   PetscTryMethod() makes this a no-op for non-CUSPARSE matrix types. */
160: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
161: {

166:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
167:   return(0);
168: }

/* MatSetFromOptions_SeqAIJCUSPARSE - processes the command-line options that
   select the GPU storage format:
     -mat_cusparse_mult_storage_format : SpMV-only format (offered only for
                                         non-factored matrices)
     -mat_cusparse_storage_format      : format for SpMV and triangular solve
   Each option, when present, is applied through MatCUSPARSESetFormat(). */
170: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
171: {
172:   PetscErrorCode           ierr;
173:   MatCUSPARSEStorageFormat format;
174:   PetscBool                flg;
175:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

178:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
179:   if (A->factortype==MAT_FACTOR_NONE) {
180:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
181:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
182:     if (flg) {
183:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
184:     }
185:   }
186:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
187:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
188:   if (flg) {
189:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
190:   }
191:   PetscOptionsTail();
192:   return(0);
193: }

/* MatILUFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic ILU to the CPU
   SeqAIJ implementation, then swaps in the CUSPARSE numeric-factorization
   routine so the subsequent numeric phase uploads the factors to the GPU. */
195: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
196: {

200:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
201:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
202:   return(0);
203: }

/* MatLUFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic (full) LU to the
   CPU SeqAIJ implementation and installs the CUSPARSE numeric routine. */
205: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
206: {

210:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
211:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
212:   return(0);
213: }

/* MatICCFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic ICC to the CPU
   SeqAIJ implementation and installs the CUSPARSE Cholesky numeric routine. */
215: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
216: {

220:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
221:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
222:   return(0);
223: }

/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic Cholesky
   to the CPU SeqAIJ implementation and installs the CUSPARSE numeric routine. */
225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226: {

230:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
231:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232:   return(0);
233: }

/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower
   triangular factor L from the combined LU/ILU factor stored on the CPU in A
   (a Mat_SeqAIJ), uploads it to the GPU as a CSR matrix with 32-bit indices,
   and runs the cuSPARSE triangular-solve analysis on it.  Only runs when the
   factor currently lives on the CPU side (offload mask UNALLOCATED or CPU).
   NOTE(review): assumes row i's strictly-lower entries occupy a->j/a->a in
   [ai[i], ai[i+1]) -- the layout produced by MatLUFactorNumeric_SeqAIJ;
   confirm against that routine. */
235: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
236: {
237:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
238:   PetscInt                          n = A->rmap->n;
239:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
240:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
241:   cusparseStatus_t                  stat;
242:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
243:   const MatScalar                   *aa = a->a,*v;
244:   PetscInt                          *AiLo, *AjLo;
245:   PetscScalar                       *AALo;
246:   PetscInt                          i,nz, nzLower, offset, rowOffset;
247:   PetscErrorCode                    ierr;
248:   cudaError_t                       cerr;

251:   if (!n) return(0);
252:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
253:     try {
254:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
255:       nzLower=n+ai[n]-ai[1];

257:       /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host staging buffers for the host-to-device copy */
258:       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
259:       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
260:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

262:       /* Fill the lower triangular matrix */
263:       AiLo[0]  = (PetscInt) 0;
264:       AiLo[n]  = nzLower;
265:       AjLo[0]  = (PetscInt) 0;
266:       AALo[0]  = (MatScalar) 1.0;
267:       v        = aa;
268:       vi       = aj;
269:       offset   = 1;
270:       rowOffset= 1;
271:       for (i=1; i<n; i++) {
272:         nz = ai[i+1] - ai[i];
273:         /* additional 1 for the term on the diagonal */
274:         AiLo[i]    = rowOffset;
275:         rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
277:         PetscArraycpy(&(AjLo[offset]), vi, nz);
278:         PetscArraycpy(&(AALo[offset]), v, nz);

280:         offset      += nz;
281:         AjLo[offset] = (PetscInt) i;
282:         AALo[offset] = (MatScalar) 1.0;
283:         offset      += 1;

285:         v  += nz;
286:         vi += nz;
287:       }

289:       /* allocate space for the triangular factor information */
290:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

292:       /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
293:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
294:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
295:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
296:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
297:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

299:       /* Create the solve analysis information */
300:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

302:       /* set the operation */
303:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

305:       /* set the matrix */
306:       loTriFactor->csrMat = new CsrMatrix;
307:       loTriFactor->csrMat->num_rows = n;
308:       loTriFactor->csrMat->num_cols = n;
309:       loTriFactor->csrMat->num_entries = nzLower;

      /* device-side CSR arrays; assign() performs the host-to-device transfer */
311:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

314:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

317:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

320:       /* perform the solve analysis */
321:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

326:       /* assign the pointer. Is this really necessary? */
327:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* staging buffers are no longer needed once the device copies exist */
329:       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
330:       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
331:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
332:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
333:     } catch(char *ex) {
334:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335:     }
336:   }
337:   return(0);
338: }

/* MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular
   factor U (non-unit diagonal) from the combined LU/ILU factor stored on the
   CPU in A, using a->diag to walk the U part of each row backwards, uploads
   it to the GPU as 32-bit-index CSR, and runs the cuSPARSE triangular-solve
   analysis.  Only runs when the factor lives on the CPU side.
   NOTE(review): the diagonal is written as 1./v[nz], i.e. the stored entry at
   adiag[i] is treated as the reciprocal of U's diagonal -- matches the
   storage of MatLUFactorNumeric_SeqAIJ; confirm against that routine. */
340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343:   PetscInt                          n = A->rmap->n;
344:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346:   cusparseStatus_t                  stat;
347:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348:   const MatScalar                   *aa = a->a,*v;
349:   PetscInt                          *AiUp, *AjUp;
350:   PetscScalar                       *AAUp;
351:   PetscInt                          i,nz, nzUpper, offset;
352:   PetscErrorCode                    ierr;
353:   cudaError_t                       cerr;

356:   if (!n) return(0);
357:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
358:     try {
359:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with i here, so this difference counts all U entries */
360:       nzUpper = adiag[0]-adiag[n];

362:       /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host staging buffers for the host-to-device copy */
363:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
364:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
365:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

367:       /* Fill the upper triangular matrix, walking rows from last to first */
368:       AiUp[0]=(PetscInt) 0;
369:       AiUp[n]=nzUpper;
370:       offset = nzUpper;
371:       for (i=n-1; i>=0; i--) {
372:         v  = aa + adiag[i+1] + 1;
373:         vi = aj + adiag[i+1] + 1;

375:         /* number of elements NOT on the diagonal */
376:         nz = adiag[i] - adiag[i+1]-1;

378:         /* decrement the offset */
379:         offset -= (nz+1);

381:         /* first, set the diagonal elements */
382:         AjUp[offset] = (PetscInt) i;
383:         AAUp[offset] = (MatScalar)1./v[nz];
384:         AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
386:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
387:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
388:       }

390:       /* allocate space for the triangular factor information */
391:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

393:       /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
394:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
395:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
396:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
397:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
398:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

400:       /* Create the solve analysis information */
401:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

403:       /* set the operation */
404:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

406:       /* set the matrix */
407:       upTriFactor->csrMat = new CsrMatrix;
408:       upTriFactor->csrMat->num_rows = n;
409:       upTriFactor->csrMat->num_cols = n;
410:       upTriFactor->csrMat->num_entries = nzUpper;

      /* device-side CSR arrays; assign() performs the host-to-device transfer */
412:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
413:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

415:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
416:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

418:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
419:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

421:       /* perform the solve analysis */
422:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
423:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
424:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
425:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

427:       /* assign the pointer. Is this really necessary? */
428:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging buffers are no longer needed once the device copies exist */
430:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
431:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
432:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
433:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
434:     } catch(char *ex) {
435:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
436:     }
437:   }
438:   return(0);
439: }

/* MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - resets any existing GPU
   triangular-factor data, rebuilds and uploads the L and U factors, allocates
   the device work vector used between the two solves, and uploads the row and
   column permutation indices when they are not identities (the identity case
   uses the NaturalOrdering solve path and needs no index arrays on the GPU). */
441: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
442: {
443:   PetscErrorCode               ierr;
444:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
445:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
446:   IS                           isrow = a->row,iscol = a->icol;
447:   PetscBool                    row_identity,col_identity;
448:   const PetscInt               *r,*c;
449:   PetscInt                     n = A->rmap->n;

452:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
453:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
454:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

   /* temporary device vector holding the intermediate result between the L and U solves */
456:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
457:   cusparseTriFactors->nnz=a->nz;

459:   A->offloadmask = PETSC_OFFLOAD_BOTH;
460:   /* lower triangular indices */
461:   ISGetIndices(isrow,&r);
462:   ISIdentity(isrow,&row_identity);
463:   if (!row_identity) {
464:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
465:     cusparseTriFactors->rpermIndices->assign(r, r+n);
466:   }
467:   ISRestoreIndices(isrow,&r);

469:   /* upper triangular indices */
470:   ISGetIndices(iscol,&c);
471:   ISIdentity(iscol,&col_identity);
472:   if (!col_identity) {
473:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
474:     cusparseTriFactors->cpermIndices->assign(c, c+n);
475:   }

   /* log only the permutation arrays actually transferred */
477:   if (!row_identity && !col_identity) {
478:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
479:   } else if(!row_identity) {
480:     PetscLogCpuToGpu(n*sizeof(PetscInt));
481:   } else if(!col_identity) {
482:     PetscLogCpuToGpu(n*sizeof(PetscInt));
483:   }

485:   ISRestoreIndices(iscol,&c);
486:   return(0);
487: }

/* MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular factors
   for an ICC/Cholesky factorization.  The upper factor U is assembled from
   the CPU factor; the "lower" solve reuses U's sparsity pattern with a
   TRANSPOSE operation (L = U^T), using a separately scaled value array AALo.
   Both factors share AiUp/AjUp as their CSR structure.
   NOTE(review): A->data is read both as Mat_SeqAIJ (for nz) and as
   Mat_SeqSBAIJ (for i/j/a); this relies on the Cholesky factor's internal
   storage layout -- confirm against MatCholeskyFactorNumeric_SeqAIJ. */
489: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
490: {
491:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
492:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
493:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
494:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
495:   cusparseStatus_t                  stat;
496:   PetscErrorCode                    ierr;
497:   cudaError_t                       cerr;
498:   PetscInt                          *AiUp, *AjUp;
499:   PetscScalar                       *AAUp;
500:   PetscScalar                       *AALo;
501:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
502:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
503:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
504:   const MatScalar                   *aa = b->a,*v;

507:   if (!n) return(0);
508:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
509:     try {
510:       /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo holds the transposed-solve values */
511:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
512:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
513:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
514:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

516:       /* Fill the upper triangular matrix */
517:       AiUp[0]=(PetscInt) 0;
518:       AiUp[n]=nzUpper;
519:       offset = 0;
520:       for (i=0; i<n; i++) {
521:         /* set the pointers */
522:         v  = aa + ai[i];
523:         vj = aj + ai[i];
524:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

526:         /* first, set the diagonal elements */
        /* NOTE(review): v[nz] is the stored diagonal; both factors receive its reciprocal */
527:         AjUp[offset] = (PetscInt) i;
528:         AAUp[offset] = (MatScalar)1.0/v[nz];
529:         AiUp[i]      = offset;
530:         AALo[offset] = (MatScalar)1.0/v[nz];

532:         offset+=1;
533:         if (nz>0) {
534:           PetscArraycpy(&(AjUp[offset]), vj, nz);
535:           PetscArraycpy(&(AAUp[offset]), v, nz);
          /* off-diagonals are negated; AALo additionally divides by the diagonal */
536:           for (j=offset; j<offset+nz; j++) {
537:             AAUp[j] = -AAUp[j];
538:             AALo[j] = AAUp[j]/v[nz];
539:           }
540:           offset+=nz;
541:         }
542:       }

544:       /* allocate space for the triangular factor information */
545:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

547:       /* Create the matrix description: upper fill, unit diagonal (stored diagonal ignored by the solve) */
548:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
549:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
550:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
551:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
552:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

554:       /* Create the solve analysis information */
555:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

557:       /* set the operation */
558:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

560:       /* set the matrix */
561:       upTriFactor->csrMat = new CsrMatrix;
562:       upTriFactor->csrMat->num_rows = A->rmap->n;
563:       upTriFactor->csrMat->num_cols = A->cmap->n;
564:       upTriFactor->csrMat->num_entries = a->nz;

566:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
567:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

569:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
570:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

572:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
573:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

575:       /* perform the solve analysis */
576:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
577:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
578:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
579:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

581:       /* assign the pointer. Is this really necessary? */
582:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

584:       /* allocate space for the triangular factor information */
585:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

587:       /* Create the matrix description: UPPER fill + TRANSPOSE op below means
          the lower solve is done as U^T, reusing U's sparsity pattern */
588:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
589:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
590:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
591:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
592:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

594:       /* Create the solve analysis information */
595:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

597:       /* set the operation */
598:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

600:       /* set the matrix */
601:       loTriFactor->csrMat = new CsrMatrix;
602:       loTriFactor->csrMat->num_rows = A->rmap->n;
603:       loTriFactor->csrMat->num_cols = A->cmap->n;
604:       loTriFactor->csrMat->num_entries = a->nz;

      /* same structure as U, different (diagonal-scaled) values */
606:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
607:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

609:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
610:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

612:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
613:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
614:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

616:       /* perform the solve analysis */
617:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
618:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
619:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
620:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

622:       /* assign the pointer. Is this really necessary? */
623:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

625:       A->offloadmask = PETSC_OFFLOAD_BOTH;
626:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
629:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
630:     } catch(char *ex) {
631:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
632:     }
633:   }
634:   return(0);
635: }

/* MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - resets any existing GPU
   triangular-factor data, rebuilds and uploads the ICC factors, allocates the
   device work vector, and -- when the ordering is not the identity -- uploads
   both the permutation and its inverse so the solve can permute the right-hand
   side in and the solution out.  nnz counts both triangular factors, sharing
   the n diagonal entries. */
637: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
638: {
639:   PetscErrorCode               ierr;
640:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
641:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
642:   IS                           ip = a->row;
643:   const PetscInt               *rip;
644:   PetscBool                    perm_identity;
645:   PetscInt                     n = A->rmap->n;

648:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
649:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
650:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
651:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

653:   /* lower triangular indices */
654:   ISGetIndices(ip,&rip);
655:   ISIdentity(ip,&perm_identity);
656:   if (!perm_identity) {
657:     IS             iip;
658:     const PetscInt *irip;

     /* rperm gets the permutation, cperm its inverse */
660:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
661:     ISGetIndices(iip,&irip);
662:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
663:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
664:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
665:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
666:     ISRestoreIndices(iip,&irip);
667:     ISDestroy(&iip);
668:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
669:   }
670:   ISRestoreIndices(ip,&rip);
671:   return(0);
672: }

/* MatLUFactorNumeric_SeqAIJCUSPARSE - performs the numeric LU on the CPU via
   MatLUFactorNumeric_SeqAIJ, selects the MatSolve variant (the NaturalOrdering
   fast path when both row and column permutations are identities), and then
   copies the triangular factors to the GPU. */
674: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
675: {
676:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
677:   IS             isrow = b->row,iscol = b->col;
678:   PetscBool      row_identity,col_identity;

682:   MatLUFactorNumeric_SeqAIJ(B,A,info);
683:   B->offloadmask = PETSC_OFFLOAD_CPU;
684:   /* determine which version of MatSolve needs to be used. */
685:   ISIdentity(isrow,&row_identity);
686:   ISIdentity(iscol,&col_identity);
687:   if (row_identity && col_identity) {
688:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
689:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
690:     B->ops->matsolve = NULL;
691:     B->ops->matsolvetranspose = NULL;
692:   } else {
693:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
694:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
695:     B->ops->matsolve = NULL;
696:     B->ops->matsolvetranspose = NULL;
697:   }

699:   /* get the triangular factors */
700:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
701:   return(0);
702: }

/* MatCholeskyFactorNumeric_SeqAIJCUSPARSE - performs the numeric Cholesky/ICC
   on the CPU via MatCholeskyFactorNumeric_SeqAIJ, selects the MatSolve variant
   (NaturalOrdering fast path when the permutation is the identity), and then
   copies the triangular factors to the GPU. */
704: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
705: {
706:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
707:   IS             ip = b->row;
708:   PetscBool      perm_identity;

712:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
713:   B->offloadmask = PETSC_OFFLOAD_CPU;
714:   /* determine which version of MatSolve needs to be used. */
715:   ISIdentity(ip,&perm_identity);
716:   if (perm_identity) {
717:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
718:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
719:     B->ops->matsolve = NULL;
720:     B->ops->matsolvetranspose = NULL;
721:   } else {
722:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
723:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
724:     B->ops->matsolve = NULL;
725:     B->ops->matsolvetranspose = NULL;
726:   }

728:   /* get the triangular factors */
729:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
730:   return(0);
731: }

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit device-resident
   transposes (via csr2csc) of the lower and upper triangular factors and runs
   the cusparse solve analysis on them.  The MatSolveTranspose_* routines then
   solve with these structures using CUSPARSE_OPERATION_NON_TRANSPOSE.

   Results are published in cusparseTriFactors->loTriFactorPtrTranspose and
   ->upTriFactorPtrTranspose.  NOTE(review): callers only invoke this when both
   transpose pointers are NULL; calling it again would leak the structures
   allocated here -- confirm the guard at every call site.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; /* NOTE(review): dead initializer -- overwritten below before any use */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; /* NOTE(review): dead initializer -- overwritten below before any use */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;


  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor; the fill
     mode is flipped because the transpose of a lower-triangular matrix is
     upper-triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op stays NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor.
     NOTE(review): num_rows/num_cols are copied un-swapped, which assumes the
     factor is square -- confirm */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC
     (the CSC of a matrix is the CSR of its transpose) */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed lower factor on the matrix (loTriFactorT is only a
     local copy of this pointer) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, again with
     the fill mode flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op stays NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor.
     NOTE(review): same square-factor assumption as above */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed upper factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds a device-resident copy of
   A^T in the storage format chosen for A (CSR, ELL, or HYB) so MatMultTranspose
   can run as an untransposed SpMV on the explicit transpose.

   The result is published in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   Fix over the previous version: in the ELL/HYB branch the temporary transpose
   tempT was sized with the UN-swapped dimensions (rmap->n rows), so its
   row_offsets array had rmap->n+1 entries while cusparse_csr2csc writes
   cmap->n+1 offsets -- a device buffer overrun for rectangular matrices.  The
   dimensions are now swapped, matching the CSR branch, and csr2hyb receives
   the transpose's dimensions as well.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;


  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta; these live in device memory because the handle is put
     into CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* the transpose of an m x n matrix is n x m */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* cache the full (uncompressed) row offsets on the GPU; these are also
       reused by the values-only update path in MatSeqAIJCUSPARSECopyToGPU */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).  The transpose is
       cmap->n x rmap->n, so the dimensions are swapped relative to 'temp'; in
       particular row_offsets must have cmap->n+1 entries or csr2csc would
       write past the end for rectangular matrices. */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB; the dimensions passed are those of the
       transpose (cmap->n rows by rmap->n columns) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

    /* delete the temporary CSR/CSC matrices */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* publish the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}

/* Solve A^T x = b with row/column permutations, using explicitly stored
   transposes of the triangular factors (built lazily on first use).
   Question for a follow-up: could op(A)=CUSPARSE_OPERATION_TRANSPOSE in
   MatSolve_SeqAIJCUSPARSE replace the explicit transposes? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  /* Build the transposed factors on the fly if they do not exist yet */
  if (!loT && !upT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for b and x */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* Gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, fs->rpermIndices->end()),
               xGPU);

  /* Solve with the transposed upper factor first ... */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        xarray, work->data().get());CHKERRCUSPARSE(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Gather through the column permutation into the work vector (this step
     cannot be done in place) ... */
  thrust::copy(thrust::make_permutation_iterator(xGPU, fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, fs->cpermIndices->end()),
               work->begin());

  /* ... and copy the permuted result back into x */
  thrust::copy(work->begin(), work->end(), xGPU);

  /* restore the vectors */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);
  return(0);
}

/* Transpose-solve with factors in natural ordering: no permutations, just the
   two triangular solves on the explicitly stored transposed factors. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  /* Build the transposed factors on the fly if they do not exist yet */
  if (!loT && !upT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for b and x */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* Solve with the transposed upper factor first ... */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUSPARSE(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vectors */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);
  return(0);
}

/* Solve A x = b on the GPU: gather b through the row permutation, do the
   lower then upper triangular solves, then gather the result through the
   column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                           *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;


  /* Raw device pointers for b and x */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* Gather b through the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()),
               work->begin());

  /* Solve with the lower factor ... */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* ... then with the upper factor */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        xarray, work->data().get());CHKERRCUSPARSE(stat);

  /* Gather through the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);
  return(0);
}

/* Solve A x = b with factors in natural ordering: no permutations, just the
   forward (lower) and backward (upper) triangular solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  /* Raw device pointers for b and x */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* Solve with the lower factor ... */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUSPARSE(stat);

  /* ... then with the upper factor */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);
  return(0);
}

1217: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1218: {
1219:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1220:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1221:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1222:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1223:   PetscErrorCode               ierr;
1224:   cusparseStatus_t             stat;
1225:   cudaError_t                  err;

1228:   if (A->boundtocpu) return(0);
1229:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1230:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1231:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1232:       /* Copy values only */
1233:       CsrMatrix *mat,*matT;
1234:       mat  = (CsrMatrix*)cusparsestruct->mat->mat;
1235:       mat->values->assign(a->a, a->a+a->nz);
1236:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));

1238:       /* Update matT when it was built before */
1239:       if (cusparsestruct->matTranspose) {
1240:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1241:         matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1242:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1243:                                  A->cmap->n, mat->num_entries,
1244:                                  mat->values->data().get(),
1245:                                  cusparsestruct->rowoffsets_gpu->data().get(),
1246:                                  mat->column_indices->data().get(),
1247:                                  matT->values->data().get(),
1248:                                  matT->column_indices->data().get(),
1249:                                  matT->row_offsets->data().get(),
1250:                                  CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1251:       }
1252:     } else {
1253:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1254:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1255:       delete cusparsestruct->workVector;
1256:       delete cusparsestruct->rowoffsets_gpu;
1257:       try {
1258:         if (a->compressedrow.use) {
1259:           m    = a->compressedrow.nrows;
1260:           ii   = a->compressedrow.i;
1261:           ridx = a->compressedrow.rindex;
1262:         } else {
1263:           m    = A->rmap->n;
1264:           ii   = a->i;
1265:           ridx = NULL; /* In this case, one should not dereference ridx */
1266:         }
1267:         cusparsestruct->nrows = m;

1269:         /* allocate space for the triangular factor information */
1270:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1271:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1272:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1273:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1275:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1276:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1277:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1278:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1279:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1280:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1281:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1283:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1284:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1285:           /* set the matrix */
1286:           CsrMatrix *matrix= new CsrMatrix;
1287:           matrix->num_rows = m;
1288:           matrix->num_cols = A->cmap->n;
1289:           matrix->num_entries = a->nz;
1290:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1291:           matrix->row_offsets->assign(ii, ii + m+1);

1293:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1294:           matrix->column_indices->assign(a->j, a->j+a->nz);

1296:           matrix->values = new THRUSTARRAY(a->nz);
1297:           matrix->values->assign(a->a, a->a+a->nz);

1299:           /* assign the pointer */
1300:           matstruct->mat = matrix;

1302:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1303:           CsrMatrix *matrix= new CsrMatrix;
1304:           matrix->num_rows = m;
1305:           matrix->num_cols = A->cmap->n;
1306:           matrix->num_entries = a->nz;
1307:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1308:           matrix->row_offsets->assign(ii, ii + m+1);

1310:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1311:           matrix->column_indices->assign(a->j, a->j+a->nz);

1313:           matrix->values = new THRUSTARRAY(a->nz);
1314:           matrix->values->assign(a->a, a->a+a->nz);

1316:           cusparseHybMat_t hybMat;
1317:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1318:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1319:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1320:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1321:               matstruct->descr, matrix->values->data().get(),
1322:               matrix->row_offsets->data().get(),
1323:               matrix->column_indices->data().get(),
1324:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1325:           /* assign the pointer */
1326:           matstruct->mat = hybMat;

1328:           if (matrix) {
1329:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1330:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1331:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1332:             delete (CsrMatrix*)matrix;
1333:           }
1334:         }

1336:         /* assign the compressed row indices */
1337:         if (a->compressedrow.use) {
1338:           cusparsestruct->workVector = new THRUSTARRAY(m);
1339:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1340:           matstruct->cprowIndices->assign(ridx,ridx+m);
1341:           tmp = m;
1342:         } else {
1343:           cusparsestruct->workVector = NULL;
1344:           matstruct->cprowIndices    = NULL;
1345:           tmp = 0;
1346:         }
1347:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1349:         /* assign the pointer */
1350:         cusparsestruct->mat = matstruct;
1351:       } catch(char *ex) {
1352:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1353:       }
1354:       cusparsestruct->nonzerostate = A->nonzerostate;
1355:     }
1356:     err  = WaitForGPU();CHKERRCUDA(err);
1357:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1358:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1359:   }
1360:   return(0);
1361: }

/* Thrust functor used to scatter-add SpMV results stored in a compressed work
   vector back into a full-length output vector: for each tuple t = (src, dst),
   accumulate the source element into the destination element in place. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};

/* Per-product workspace attached to C->product->data for
   MATSEQAIJCUSPARSE (sparse) times MATSEQDENSECUDA (dense) products. */
1373: typedef struct {
1374:   PetscBool   cisdense;  /* PETSC_TRUE if C was MATSEQDENSE (CPU) at symbolic time; numeric phase converts C back to MATSEQDENSE */
1375:   PetscScalar *Bt;       /* device buffer holding B^T for ABt/RARt, since cusparse spmm does not support transposing B */
1376:   Mat         X;         /* intermediate dense GPU matrix used by the RARt/PtAP two-step products */
1377: } MatMatCusparse;

/* Destructor for the MatMatCusparse product data: frees the device B^T buffer,
   the intermediate dense matrix X, and the struct itself. Installed as
   C->product->destroy by the symbolic phase. */
1379: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1380: {
1382:   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1383:   cudaError_t    cerr;

1386:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so this is safe when Bt was never allocated */
1387:   MatDestroy(&mmdata->X);
1388:   PetscFree(data);
1389:   return(0);
1390: }

1392: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of the sparse (MATSEQAIJCUSPARSE) times dense product on the GPU.
   Supports AB, AtB, ABt, PtAP and RARt; the core operation is a single
   cusparse csrmm (sparse * dense -> dense). PtAP/RARt compute X = A*P (resp.
   A*R^T) here and finish with a dense-dense multiply. */
1394: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1395: {
1396:   Mat_Product                  *product = C->product;
1397:   Mat                          A,B;
1398:   PetscInt                     m,n,k,blda,clda;
1399:   PetscBool                    flg,biscuda;
1400:   Mat_SeqAIJCUSPARSE           *cusp;
1401:   cusparseStatus_t             stat;
1402:   cusparseOperation_t          opA;
1403:   cusparseMatDescr_t           matA;
1404:   const PetscScalar            *barray;
1405:   PetscScalar                  *carray;
1406:   PetscErrorCode               ierr;
1407:   MatMatCusparse               *mmdata;
1408:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1409:   CsrMatrix                    *csrmat;

1412:   MatCheckProduct(C,1);
1413:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1414:   mmdata = (MatMatCusparse*)product->data;
1415:   A    = product->A;
1416:   B    = product->B;
1417:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1418:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1419:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1420:      Instead of silently accepting the wrong answer, I prefer to raise the error */
1421:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1422:   MatSeqAIJCUSPARSECopyToGPU(A);
1423:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
      /* Select the CSR data (A or its explicitly stored transpose) and the
         m x k times k x n dimensions of the csrmm call for each product type.
         AtB uses the stored transpose so opA stays NON_TRANSPOSE. */
1424:   switch (product->type) {
1425:   case MATPRODUCT_AB:
1426:   case MATPRODUCT_PtAP:
1427:     mat = cusp->mat;
1428:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1429:     m   = A->rmap->n;
1430:     k   = A->cmap->n;
1431:     n   = B->cmap->n;
1432:     break;
1433:   case MATPRODUCT_AtB:
1434:     if (!cusp->matTranspose) {
1435:       MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1436:     }
1437:     mat = cusp->matTranspose;
1438:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1439:     m   = A->cmap->n;
1440:     k   = A->rmap->n;
1441:     n   = B->cmap->n;
1442:     break;
1443:   case MATPRODUCT_ABt:
1444:   case MATPRODUCT_RARt:
1445:     mat = cusp->mat;
1446:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1447:     m   = A->rmap->n;
1448:     k   = B->cmap->n;
1449:     n   = B->rmap->n;
1450:     break;
1451:   default:
1452:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1453:   }
1454:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1455:   matA   = mat->descr;
1456:   csrmat = (CsrMatrix*)mat->mat;
1457:   /* if the user passed a CPU matrix, copy the data to the GPU */
1458:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1459:   if (!biscuda) {
1460:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1461:   }
1462:   MatDenseCUDAGetArrayRead(B,&barray);
1463:   MatDenseGetLDA(B,&blda);
1464:   /* cusparse spmm does not support transpose on B */
1465:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1466:     cublasHandle_t cublasv2handle;
1467:     cublasStatus_t cerr;

1469:     PetscCUBLASGetHandle(&cublasv2handle);
      /* explicitly transpose B into the preallocated mmdata->Bt with geam:
         Bt = 1.0*B^T + 0.0*B^T */
1470:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1471:                        B->cmap->n,B->rmap->n,
1472:                        &PETSC_CUSPARSE_ONE ,barray,blda,
1473:                        &PETSC_CUSPARSE_ZERO,barray,blda,
1474:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1475:     blda = B->cmap->n;
1476:   }
      /* RARt/PtAP write the sparse-dense result into the intermediate X;
         the other products write directly into C */
1477:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1478:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1479:     MatDenseGetLDA(mmdata->X,&clda);
1480:   } else {
1481:     MatDenseCUDAGetArrayWrite(C,&carray);
1482:     MatDenseGetLDA(C,&clda);
1483:   }

1485:   /* perform the MatMat operation */
1486:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1487:                            csrmat->num_entries,mat->alpha,matA,
1488:                            csrmat->values->data().get(),
1489:                            csrmat->row_offsets->data().get(),
1490:                            csrmat->column_indices->data().get(),
1491:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1492:                            carray,clda);CHKERRCUSPARSE(stat);

1494:   MatDenseCUDARestoreArrayRead(B,&barray);
      /* finish the two-step products with a dense-dense multiply:
         RARt: C = B * X (X = A*R^T);  PtAP: C = B^T * X (X = A*P) */
1495:   if (product->type == MATPRODUCT_RARt) {
1496:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1497:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1498:   } else if (product->type == MATPRODUCT_PtAP) {
1499:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1500:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1501:   } else {
1502:     MatDenseCUDARestoreArrayWrite(C,&carray);
1503:   }
      /* undo the temporary CPU->GPU type conversions done above */
1504:   if (mmdata->cisdense) {
1505:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1506:   }
1507:   if (!biscuda) {
1508:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1509:   }
1510:   return(0);
1511: }

/* Symbolic phase of the sparse (MATSEQAIJCUSPARSE) times dense product: sets the
   sizes and type of C, allocates the per-product workspace (MatMatCusparse), and
   installs the numeric-phase callback. Only the CSR storage format is supported. */
1513: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1514: {
1515:   Mat_Product        *product = C->product;
1516:   Mat                A,B;
1517:   PetscInt           m,n;
1518:   PetscBool          cisdense,flg;
1519:   PetscErrorCode     ierr;
1520:   MatMatCusparse     *mmdata;
1521:   Mat_SeqAIJCUSPARSE *cusp;

1524:   MatCheckProduct(C,1);
1525:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1526:   A    = product->A;
1527:   B    = product->B;
1528:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1529:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1530:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1531:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
      /* result dimensions of C for each supported product type */
1532:   switch (product->type) {
1533:   case MATPRODUCT_AB:
1534:     m = A->rmap->n;
1535:     n = B->cmap->n;
1536:     break;
1537:   case MATPRODUCT_AtB:
1538:     m = A->cmap->n;
1539:     n = B->cmap->n;
1540:     break;
1541:   case MATPRODUCT_ABt:
1542:     m = A->rmap->n;
1543:     n = B->rmap->n;
1544:     break;
1545:   case MATPRODUCT_PtAP:
1546:     m = B->cmap->n;
1547:     n = B->cmap->n;
1548:     break;
1549:   case MATPRODUCT_RARt:
1550:     m = B->rmap->n;
1551:     n = B->rmap->n;
1552:     break;
1553:   default:
1554:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1555:   }
1556:   MatSetSizes(C,m,n,m,n);
1557:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1558:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1559:   MatSetType(C,MATSEQDENSECUDA);

1561:   /* product data */
1562:   PetscNew(&mmdata);
1563:   mmdata->cisdense = cisdense;
1564:   /* cusparse spmm does not support transpose on B */
1565:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1566:     cudaError_t cerr;

        /* device buffer that will hold the explicit transpose of B */
1568:     cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1569:   }
1570:   /* for these products we need intermediate storage */
1571:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1572:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1573:     MatSetType(mmdata->X,MATSEQDENSECUDA);
1574:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1575:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1576:     } else {
1577:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1578:     }
1579:   }
1580:   C->product->data    = mmdata;
1581:   C->product->destroy = MatDestroy_MatMatCusparse;

1583:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1584:   return(0);
1585: }

1587: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

1589: /* handles dense B */
/* Dispatch for MatProductSetFromOptions on a MATSEQAIJCUSPARSE A: installs the
   GPU symbolic callback for the supported product types. Falls back to the CPU
   SeqAIJ*SeqDense path when A is bound to the CPU. */
1590: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1591: {
1592:   Mat_Product    *product = C->product;

1596:   MatCheckProduct(C,1);
1597:   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1598:   if (product->A->boundtocpu) {
1599:     MatProductSetFromOptions_SeqAIJ_SeqDense(C);
1600:     return(0);
1601:   }
1602:   switch (product->type) {
1603:   case MATPRODUCT_AB:
1604:   case MATPRODUCT_AtB:
1605:   case MATPRODUCT_ABt:
1606:   case MATPRODUCT_PtAP:
1607:   case MATPRODUCT_RARt:
        /* note: falls through into default, which only breaks; any other
           product type leaves C->ops->productsymbolic unset */
1608:     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1609:   default:
1610:     break;
1611:   }
1612:   return(0);
1613: }

/* yy = A*xx on the GPU; thin wrapper over MatMultAdd with a NULL add vector. */
1615: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1616: {

1620:   MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1621:   return(0);
1622: }

/* yy = A^T * xx on the GPU. Uses the explicitly stored transpose (generated on
   first use) so the cusparse call itself runs in NON_TRANSPOSE mode. */
1624: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1625: {
1626:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1627:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1628:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1629:   const PetscScalar            *xarray;
1630:   PetscScalar                  *yarray;
1631:   PetscErrorCode               ierr;
1632:   cudaError_t                  cerr;
1633:   cusparseStatus_t             stat;

1636:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1637:   MatSeqAIJCUSPARSECopyToGPU(A);
1638:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1639:   if (!matstructT) {
1640:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1641:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1642:   }
1643:   VecCUDAGetArrayRead(xx,&xarray);
1644:   VecCUDAGetArrayWrite(yy,&yarray);
1645:   if (yy->map->n) {
1646:     PetscInt                     n = yy->map->n;
1647:     cudaError_t                  err;
1648:     err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1649:   }

1651:   PetscLogGpuTimeBegin();
1652:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1653:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
        /* y = alpha*A^T*x + beta_zero*y with the stored transpose in CSR form */
1654:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1655:                              mat->num_rows, mat->num_cols,
1656:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1657:                              mat->values->data().get(), mat->row_offsets->data().get(),
1658:                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1659:                              yarray);CHKERRCUSPARSE(stat);
1660:   } else {
        /* NOTE(review): unlike MatMultTransposeAdd below, this HYB/ELL branch
           does not guard on cusparsestruct->nrows — confirm whether the
           zero-row case can reach here */
1661:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1662:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1663:                              matstructT->alpha, matstructT->descr, hybMat,
1664:                              xarray, matstructT->beta_zero,
1665:                              yarray);CHKERRCUSPARSE(stat);
1666:   }
1667:   VecCUDARestoreArrayRead(xx,&xarray);
1668:   VecCUDARestoreArrayWrite(yy,&yarray);
1669:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1670:   PetscLogGpuTimeEnd();
1671:   PetscLogGpuFlops(2.0*a->nz);
1672:   return(0);
1673: }


/* zz = A*xx + yy on the GPU (MatMult when yy is NULL). When A uses compressed
   row storage (all-zero rows dropped), the SpMV result is computed into a
   short work vector and then scatter-added into the full-length zz using the
   stored compressed-row indices. */
1676: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1677: {
1678:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1679:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1680:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1681:   const PetscScalar            *xarray;
1682:   PetscScalar                  *zarray,*dptr,*beta;
1683:   PetscErrorCode               ierr;
1684:   cudaError_t                  cerr;
1685:   cusparseStatus_t             stat;
1686:   PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

1689:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1690:   MatSeqAIJCUSPARSECopyToGPU(A);
1691:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1692:   try {
1693:     VecCUDAGetArrayRead(xx,&xarray);

1695:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
1696:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

1698:     dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
        /* beta_one accumulates directly into zz only in the in-place, non-compressed case;
           otherwise the result is overwritten (beta_zero) and yy is added later */
1699:     beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

1701:     PetscLogGpuTimeBegin();
1702:     /* csr_spmv does y = alpha*Ax + beta*y */
1703:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1704:       /* here we need to be careful to set the number of rows in the multiply to the
1705:          number of compressed rows in the matrix ... which is equivalent to the
1706:          size of the workVector */
1707:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1708:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1709:                                mat->num_rows, mat->num_cols,
1710:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1711:                                mat->values->data().get(), mat->row_offsets->data().get(),
1712:                                mat->column_indices->data().get(), xarray, beta,
1713:                                dptr);CHKERRCUSPARSE(stat);
1714:     } else {
1715:       if (cusparsestruct->nrows) {
1716:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1717:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1718:                                  matstruct->alpha, matstruct->descr, hybMat,
1719:                                  xarray, beta,
1720:                                  dptr);CHKERRCUSPARSE(stat);
1721:       }
1722:     }
1723:     PetscLogGpuTimeEnd();

1725:     if (yy) { /* MatMultAdd: zz = A*xx + yy */
1726:       if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1727:         VecCopy_SeqCUDA(yy,zz); /* zz = yy */
1728:       } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
1729:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
1730:       }
1731:     } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1732:       VecSet_SeqCUDA(zz,0);
1733:     }

1735:     /* ScatterAdd the result from work vector into the full vector when A is compressed */
1736:     PetscLogGpuTimeBegin();
1737:     if (compressed) {
          /* zz[cprowIndices[i]] += workVector[i] via a zip of the work vector
             with a permutation iterator over zz */
1738:       thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
1739:       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1740:                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1741:                        VecCUDAPlusEquals());
1742:     }
1743:     PetscLogGpuTimeEnd();
1744:     VecCUDARestoreArrayRead(xx,&xarray);
1745:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
1746:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
1747:   } catch(char *ex) {
1748:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1749:   }
1750:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1751:   PetscLogGpuFlops(2.0*a->nz);
1752:   return(0);
1753: }

/* zz = A^T*xx + yy on the GPU, using the explicitly stored transpose of A
   (generated on first use). The transpose is stored non-compressed, so no
   scatter-add step is needed here. */
1755: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1756: {
1757:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1758:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1759:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1760:   const PetscScalar               *xarray;
1761:   PetscScalar                     *zarray,*beta;
1762:   PetscErrorCode                  ierr;
1763:   cudaError_t                     cerr;
1764:   cusparseStatus_t                stat;

1767:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1768:   MatSeqAIJCUSPARSECopyToGPU(A);
1769:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1770:   if (!matstructT) {
1771:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1772:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1773:   }

1775:   /* Note unlike Mat, MatTranspose uses non-compressed row storage */
1776:   try {
        /* zz = yy first; then either accumulate in place (beta_one when yy == zz)
           or overwrite and add yy afterwards */
1777:     VecCopy_SeqCUDA(yy,zz);
1778:     VecCUDAGetArrayRead(xx,&xarray);
1779:     VecCUDAGetArray(zz,&zarray);
1780:     beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

1782:     PetscLogGpuTimeBegin();
1783:     /* multiply add with matrix transpose */
1784:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1785:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1786:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1787:                                mat->num_rows, mat->num_cols,
1788:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1789:                                mat->values->data().get(), mat->row_offsets->data().get(),
1790:                                mat->column_indices->data().get(), xarray, beta,
1791:                                zarray);CHKERRCUSPARSE(stat);
1792:     } else {
1793:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1794:       if (cusparsestruct->nrows) {
1795:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1796:                                  matstructT->alpha, matstructT->descr, hybMat,
1797:                                  xarray, beta,
1798:                                  zarray);CHKERRCUSPARSE(stat);
1799:       }
1800:     }
1801:     PetscLogGpuTimeEnd();

1803:     if (zz != yy) {VecAXPY_SeqCUDA(zz,1.0,yy);}
1804:     VecCUDARestoreArrayRead(xx,&xarray);
1805:     VecCUDARestoreArray(zz,&zarray);
1806:   } catch(char *ex) {
1807:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1808:   }
1809:   cerr = WaitForGPU();CHKERRCUDA(cerr);
1810:   PetscLogGpuFlops(2.0*a->nz);
1811:   return(0);
1812: }

/* Assembly-end hook: runs the standard SeqAIJ assembly, then (for final
   assembly of an unfactored, GPU-enabled matrix) pushes the data to the GPU
   and re-installs the CUSPARSE mult operations.
   NOTE(review): the ops are re-set unconditionally after MatAssemblyEnd_SeqAIJ —
   presumably because the SeqAIJ assembly may replace them; confirm. */
1814: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1815: {

1819:   MatAssemblyEnd_SeqAIJ(A,mode);
1820:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
1821:   if (A->factortype == MAT_FACTOR_NONE) {
1822:     MatSeqAIJCUSPARSECopyToGPU(A);
1823:   }
1824:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1825:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1826:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1827:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1828:   return(0);
1829: }

1831: /* --------------------------------------------------------------------------------*/
1832: /*@
1833:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1834:    (the default parallel PETSc format). This matrix will ultimately be pushed down
1835:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1836:    assembly performance the user should preallocate the matrix storage by setting
1837:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1838:    performance during matrix assembly can be increased by more than a factor of 50.

1840:    Collective

1842:    Input Parameters:
1843: +  comm - MPI communicator, set to PETSC_COMM_SELF
1844: .  m - number of rows
1845: .  n - number of columns
1846: .  nz - number of nonzeros per row (same for all rows)
1847: -  nnz - array containing the number of nonzeros in the various rows
1848:          (possibly different for each row) or NULL

1850:    Output Parameter:
1851: .  A - the matrix

1853:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1854:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1855:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1857:    Notes:
1858:    If nnz is given then nz is ignored

1860:    The AIJ format (also called the Yale sparse matrix format or
1861:    compressed row storage), is fully compatible with standard Fortran 77
1862:    storage.  That is, the stored row and column indices can begin at
1863:    either one (as in Fortran) or zero.  See the users' manual for details.

1865:    Specify the preallocated storage with either nz or nnz (not both).
1866:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1867:    allocation.  For large problems you MUST preallocate memory or you
1868:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1870:    By default, this format uses inodes (identical nodes) when possible, to
1871:    improve numerical efficiency of matrix-vector products and solves. We
1872:    search for consecutive rows with the same nonzero structure, thereby
1873:    reusing matrix information to achieve increased efficiency.

1875:    Level: intermediate

1877: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1878: @*/
/* See the manual-page comment above for the full contract: creates a sequential
   AIJ matrix of type MATSEQAIJCUSPARSE with the given sizes and preallocation. */
1879: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1880: {

1884:   MatCreate(comm,A);
1885:   MatSetSizes(*A,m,n,m,n);
1886:   MatSetType(*A,MATSEQAIJCUSPARSE);
1887:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1888:   return(0);
1889: }

/* Destructor: frees the GPU-side data (either the mult structures or, for
   factored matrices, the triangular-factor structures), removes the composed
   function pointers, then falls through to the SeqAIJ destructor. */
1891: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1892: {

1896:   if (A->factortype == MAT_FACTOR_NONE) {
1897:     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1898:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1899:     }
1900:   } else {
1901:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1902:   }
      /* composing NULL removes the previously composed functions */
1903:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
1904:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
1905:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
1906:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
1907:   MatDestroy_SeqAIJ(A);
1908:   return(0);
1909: }

1911: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1912: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate: copy via the SeqAIJ duplicate, then convert the copy in place
   back to MATSEQAIJCUSPARSE so it regains the GPU data structures. */
1913: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1914: {

1918:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1919:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
1920:   return(0);
1921: }

/* Switch the matrix operations between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations depending on flg, and record the binding in A->boundtocpu. */
1923: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
1924: {
1926:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
1927:      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1928:      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
1929:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
1930:            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1931:   /* We need to take care of function composition also */
1932:   if (A->offloadmask == PETSC_OFFLOAD_GPU) return(0);
1933:   if (flg) {
        /* bind to CPU: install the plain SeqAIJ operations */
1934:     A->ops->mult             = MatMult_SeqAIJ;
1935:     A->ops->multadd          = MatMultAdd_SeqAIJ;
1936:     A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
1937:     A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
1938:     A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
1939:     A->ops->duplicate        = MatDuplicate_SeqAIJ;
1940:   } else {
        /* unbind: restore the CUSPARSE (GPU) operations */
1941:     A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1942:     A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1943:     A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1944:     A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1945:     A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1946:     A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1947:     A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1948:   }
1949:   A->boundtocpu = flg;
1950:   return(0);
1951: }

/*
  Converts a MATSEQAIJ matrix to MATSEQAIJCUSPARSE in place.  Only
  MAT_INPLACE_MATRIX reuse is supported.  Allocates the GPU-side bookkeeping
  structure hung off B->spptr -- a Mat_SeqAIJCUSPARSE for a regular matrix or
  a Mat_SeqAIJCUSPARSETriFactors for a factored one -- creates the cusparse
  library handle, and installs the CUSPARSE implementations of the matrix
  operations.  No matrix data is copied to the GPU here; that happens lazily
  (offloadmask is set to PETSC_OFFLOAD_UNALLOCATED).
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  if (reuse != MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet supported");
  /* vectors created from this matrix (MatCreateVecs() etc.) must be CUDA vectors */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECCUDA,&B->defaultvectype);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle         = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* replace the CPU (SeqAIJ) operations with their CUSPARSE counterparts */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
  PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
  return(0);
}

2013: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2014: {

2018:   MatCreate_SeqAIJ(B);
2019:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2020:   return(0);
2021: }

2023: /*MC
2024:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

2026:    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2027:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2028:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

2030:    Options Database Keys:
2031: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2032: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2033: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

2035:   Level: beginner

2037: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2038: M*/

2040: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


2043: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2044: {

2048:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2049:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2050:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2051:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2052:   return(0);
2053: }


2056: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2057: {
2058:   cusparseStatus_t stat;
2059:   cusparseHandle_t handle;

2062:   if (*cusparsestruct) {
2063:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2064:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2065:     delete (*cusparsestruct)->workVector;
2066:     delete (*cusparsestruct)->rowoffsets_gpu;
2067:     if (handle = (*cusparsestruct)->handle) {
2068:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2069:     }
2070:     delete *cusparsestruct;
2071:     *cusparsestruct = 0;
2072:   }
2073:   return(0);
2074: }

2076: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2077: {
2079:   if (*mat) {
2080:     delete (*mat)->values;
2081:     delete (*mat)->column_indices;
2082:     delete (*mat)->row_offsets;
2083:     delete *mat;
2084:     *mat = 0;
2085:   }
2086:   return(0);
2087: }

/*
  Frees one triangular-factor structure: its cusparse matrix descriptor, the
  solve-analysis info object, and the CSR storage of the factor.  No-op when
  *trifactor is NULL; the pointer is zeroed so repeated calls are safe.
  (This is a C++ overload of the name below that takes a MultStruct.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    CsrMatrix_Destroy(&(*trifactor)->csrMat);
    delete *trifactor;
    *trifactor = 0;
  }
  return(0);
}

/*
  Frees one SpMV structure, dispatching on the storage format: ELL/HYB
  matrices are released through cusparseDestroyHybMat(), CSR matrices through
  CsrMatrix_Destroy().  Also frees the matrix descriptor, the compressed-row
  index vector, and the three device-resident scalar constants (alpha,
  beta_zero, beta_one) used by the SpMV calls.  No-op when *matstruct is
  NULL; the pointer is zeroed so repeated calls are safe.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix object */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* the scalar constants live in device memory (cudaMalloc'ed), hence cudaFree */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  return(0);
}

2133: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2134: {
2136:   if (*trifactors) {
2137:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2138:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2139:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2140:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2141:     delete (*trifactors)->rpermIndices;
2142:     delete (*trifactors)->cpermIndices;
2143:     delete (*trifactors)->workVector;
2144:     (*trifactors)->rpermIndices = 0;
2145:     (*trifactors)->cpermIndices = 0;
2146:     (*trifactors)->workVector = 0;
2147:   }
2148:   return(0);
2149: }

2151: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2152: {
2153:   cusparseHandle_t handle;
2154:   cusparseStatus_t stat;

2157:   if (*trifactors) {
2158:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2159:     if (handle = (*trifactors)->handle) {
2160:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2161:     }
2162:     delete *trifactors;
2163:     *trifactors = 0;
2164:   }
2165:   return(0);
2166: }