/*
  Source listing: aijcusparse.cu (petsc-master, 2019-09-15).
  Report typos and errors to the PETSc developers.
*/
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9:  #include <../src/mat/impls/aij/seq/aij.h>
 10:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 11:  #include <../src/vec/vec/impls/dvecimpl.h>
 12:  #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 16: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 18: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 19: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 20: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 22: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 23: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 24: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 30: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 31: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 32: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 33: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 34: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 36: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 38: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 39: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 40: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 42: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 43: {
 44:   cusparseStatus_t   stat;
 45:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 48:   cusparsestruct->stream = stream;
 49:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 50:   return(0);
 51: }

 53: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 54: {
 55:   cusparseStatus_t   stat;
 56:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 59:   if (cusparsestruct->handle != handle) {
 60:     if (cusparsestruct->handle) {
 61:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 62:     }
 63:     cusparsestruct->handle = handle;
 64:   }
 65:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 66:   return(0);
 67: }

 69: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 70: {
 71:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 73:   if (cusparsestruct->handle)
 74:     cusparsestruct->handle = 0;
 75:   return(0);
 76: }

 78: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 79: {
 81:   *type = MATSOLVERCUSPARSE;
 82:   return(0);
 83: }

 85: /*MC
 86:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 87:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 88:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 89:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 90:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 91:   algorithms are not recommended. This class does NOT support direct solver operations.

 93:   Level: beginner

 95: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 96: M*/

 98: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
 99: {
101:   PetscInt       n = A->rmap->n;

104:   MatCreate(PetscObjectComm((PetscObject)A),B);
105:   (*B)->factortype = ftype;
106:   MatSetSizes(*B,n,n,n,n);
107:   MatSetType(*B,MATSEQAIJCUSPARSE);

109:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
110:     MatSetBlockSizesFromMats(*B,A,A);
111:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
112:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
113:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
114:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
115:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
116:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

118:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
119:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
120:   return(0);
121: }

/* Type-specific implementation of MatCUSPARSESetFormat(); for a sequential
   matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored
   format, so the two cases share one body. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

141: /*@
142:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
143:    operation. Only the MatMult operation can use different GPU storage formats
144:    for MPIAIJCUSPARSE matrices.
145:    Not Collective

147:    Input Parameters:
148: +  A - Matrix of type SEQAIJCUSPARSE
149: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
150: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

152:    Output Parameter:

154:    Level: intermediate

156: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157: @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_mult_storage_format (SpMV only, unfactored matrices)
   and -mat_cusparse_storage_format (SpMV and triangular solve) options. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Delegates the symbolic ILU to the CPU AIJ implementation, then routes the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Delegates the symbolic LU to the CPU AIJ implementation, then routes the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Delegates the symbolic ICC to the CPU AIJ implementation, then routes the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Delegates the symbolic Cholesky to the CPU AIJ implementation, then routes
   the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds, on the GPU, the unit-diagonal lower triangular factor of an ILU/LU
   factorization from the host AIJ arrays of A.  The CSR data (strict lower
   triangle of each row plus an explicit 1.0 diagonal) is assembled in pinned
   host memory, copied into thrust device arrays, and a cuSPARSE
   triangular-solve analysis is performed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* number of nonzeros in the lower triangle including 1's on the
         diagonal; row 0 contributes only its diagonal entry */
      nzLower=n+ai[n]-ai[1];

      /* allocate pinned host buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* matrix descriptor: zero-based, triangular, lower, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* copy the CSR arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds, on the GPU, the non-unit upper triangular factor of an ILU/LU
   factorization from the host AIJ arrays of A.  The rows are traversed
   backwards using a->diag; the stored diagonal entry is inverted (the AIJ
   factor stores 1/pivot semantics via v[nz]).  CSR data is staged in pinned
   host memory, copied to thrust device arrays, and analyzed by cuSPARSE. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* number of nonzeros in the upper triangular part (adiag runs backwards) */
      nzUpper = adiag[0]-adiag[n];

      /* allocate pinned host buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* matrix descriptor: zero-based, triangular, upper, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* copy the CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both triangular ILU factors on the GPU, allocates the work vector,
   and uploads the row/column permutations when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: upload the column permutation only when non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log exactly what was transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds both triangular factors of an ICC/Cholesky factorization on the GPU.
   Only the upper triangle is assembled on the host (AAUp); the "lower" factor
   reuses the same sparsity with scaled values (AALo) and is solved with
   CUSPARSE_OPERATION_TRANSPOSE on upper-stored data, which is why its fill
   mode is deliberately CUSPARSE_FILL_MODE_UPPER. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* allocate pinned host buffers; upper and lower share the sparsity pattern */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* upper factor descriptor: zero-based, triangular, upper, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* copy the upper-factor CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* lower factor descriptor: upper-stored data solved with the TRANSPOSE
         operation below, hence FILL_MODE_UPPER and non-unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* copy the lower-factor CSR arrays to the device (same pattern, AALo values) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU, allocates the work vector,
   and uploads the (symmetric) permutation when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangles stored, diagonal counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* the same permutation is used for rows and columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Performs the numeric LU on the CPU, selects the (natural-ordering or
   permuted) GPU solve kernels, then copies/analyzes the factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* identity orderings on both sides permit the permutation-free solves */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Performs the numeric Cholesky on the CPU, selects the (natural-ordering or
   permuted) GPU solve kernels, then copies/analyzes the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity permutation permits the permutation-free solves */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds the data needed to solve with the TRANSPOSE of the triangular
   factors: for each of the lower and upper factors the CSR storage is
   converted to CSC (i.e. the CSR representation of the transpose) with
   cusparse_csr2csc, and the cusparse solve analysis is performed on the
   result.  The new structures are cached in A->spptr as
   loTriFactorPtrTranspose / upTriFactorPtrTranspose, so the transposed
   solves can use CUSPARSE_OPERATION_NON_TRANSPOSE on them. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the two initializers below are dead reads -- both pointers
     are unconditionally overwritten by 'new' before first use; they serve
     only as scratch locals here. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;


  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposing swaps the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so the solve
     itself uses NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     again the fill mode is flipped by the transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}

/* Builds and caches (in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose) the
   GPU multiply data for A^T, used by MatMultTranspose.

   For CSR storage the transpose is obtained directly with cusparse_csr2csc
   (after expanding the compressed row offsets to full length).  For ELL/HYB
   storage the matrix is first converted HYB->CSR, transposed via csr2csc,
   and converted back CSR->HYB.

   Fix versus the previous revision: the expanded row-offset array
   (rowoffsets_gpu) allocated in the CSR branch was never freed, leaking
   (A->rmap->n+1)*sizeof(int) of device memory every time the transpose was
   (re)generated.  It is now deleted once csr2csc has consumed it. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;


  /* allocate space for the transpose multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars for the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rows not present in cprow are empty and repeat the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* the expanded row offsets were only needed as csr2csc input; free them
       here -- previously they were leaked on every regeneration */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored with every row
     present, so the compressed-row index list is simply 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}

/* Solves A^T x = b on the GPU using the explicitly transposed triangular
   factors (built lazily via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve),
   for factorizations with non-identity row/column permutations.

   NOTE(review): for a transpose solve one would expect the COLUMN
   permutation to be applied to b on entry and the ROW permutation to x on
   exit (A^T = Pc U^T L^T Pr); this routine applies rperm first and cperm
   last, mirroring MatSolve_SeqAIJCUSPARSE.  Confirm against a case with
   distinct, non-natural row/column orderings. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation: x = b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU = (U^T)^{-1} x (the transposed factor is stored
     explicitly, so solveOp is NON_TRANSPOSE) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: x = (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}

/* Solves A^T x = b on the GPU for factorizations with identity row/column
   permutations: no reordering is needed, just the two triangular solves with
   the explicitly transposed factors (built lazily on first use). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  /* build the transposed factors on the fly if they do not exist yet */
  if (!upT && !loT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* device pointers: b is read-only, x is write-only */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* solve with the transposed upper factor: work = (U^T)^{-1} b */
  status = cusparse_solve(triFactors->handle, upT->solveOp,
                          upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                          upT->csrMat->values->data().get(),
                          upT->csrMat->row_offsets->data().get(),
                          upT->csrMat->column_indices->data().get(),
                          upT->solveInfo,
                          barray, work->data().get());CHKERRCUDA(status);

  /* solve with the transposed lower factor: x = (L^T)^{-1} work */
  status = cusparse_solve(triFactors->handle, loT->solveOp,
                          loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                          loT->csrMat->values->data().get(),
                          loT->csrMat->row_offsets->data().get(),
                          loT->csrMat->column_indices->data().get(),
                          loT->solveInfo,
                          work->data().get(), xarray);CHKERRCUDA(status);

  /* release the vector arrays, wait for the device, and log the flops */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*triFactors->nnz - A->cmap->n);
  return(0);
}

/* Solves A x = b on the GPU for factorizations with non-identity row/column
   permutations: permute b with the row ordering, forward/backward solve with
   L and U, then apply the column ordering to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;


  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation: x = b[rperm].
     (Both permutation iterators use bGPU as the base; the copy length comes
     from the distance between the index iterators, so this is equivalent to
     the bGPU+n form used in MatSolveTranspose_SeqAIJCUSPARSE.) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU = L^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x = U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}

/* Solves A x = b on the GPU for factorizations with identity row/column
   permutations: a forward solve with L followed by a backward solve with U,
   with no reordering of b or x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  /* device pointers: b is read-only, x is write-only */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);

  PetscLogGpuTimeBegin();
  /* forward solve: work = L^{-1} b */
  status = cusparse_solve(triFactors->handle, lo->solveOp,
                          lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                          lo->csrMat->values->data().get(),
                          lo->csrMat->row_offsets->data().get(),
                          lo->csrMat->column_indices->data().get(),
                          lo->solveInfo,
                          barray, work->data().get());CHKERRCUDA(status);

  /* backward solve: x = U^{-1} work */
  status = cusparse_solve(triFactors->handle, up->solveOp,
                          up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                          up->csrMat->values->data().get(),
                          up->csrMat->row_offsets->data().get(),
                          up->csrMat->column_indices->data().get(),
                          up->solveInfo,
                          work->data().get(), xarray);CHKERRCUDA(status);

  /* release the vector arrays, wait for the device, and log the flops */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*triFactors->nnz - A->cmap->n);
  return(0);
}

1212: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1213: {
1214:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1215:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1216:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1217:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1218:   PetscErrorCode               ierr;
1219:   cusparseStatus_t             stat;
1220:   cudaError_t                  err;

1223:   if (A->pinnedtocpu) return(0);
1224:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1225:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1226:     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1227:       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1228:       /* copy values only */
1229:       matrix->values->assign(a->a, a->a+a->nz);
1230:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1231:     } else {
1232:       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1233:       try {
1234:         cusparsestruct->nonzerorow=0;
1235:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1237:         if (a->compressedrow.use) {
1238:           m    = a->compressedrow.nrows;
1239:           ii   = a->compressedrow.i;
1240:           ridx = a->compressedrow.rindex;
1241:         } else {
1242:           /* Forcing compressed row on the GPU */
1243:           int k=0;
1244:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1245:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1246:           ii[0]=0;
1247:           for (int j = 0; j<m; j++) {
1248:             if ((a->i[j+1]-a->i[j])>0) {
1249:               ii[k]  = a->i[j];
1250:               ridx[k]= j;
1251:               k++;
1252:             }
1253:           }
1254:           ii[cusparsestruct->nonzerorow] = a->nz;
1255:           m = cusparsestruct->nonzerorow;
1256:         }

1258:         /* allocate space for the triangular factor information */
1259:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1260:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1261:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1262:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1264:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1265:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1266:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1267:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1268:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1269:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1270:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1272:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1273:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1274:           /* set the matrix */
1275:           CsrMatrix *matrix= new CsrMatrix;
1276:           matrix->num_rows = m;
1277:           matrix->num_cols = A->cmap->n;
1278:           matrix->num_entries = a->nz;
1279:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1280:           matrix->row_offsets->assign(ii, ii + m+1);

1282:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1283:           matrix->column_indices->assign(a->j, a->j+a->nz);

1285:           matrix->values = new THRUSTARRAY(a->nz);
1286:           matrix->values->assign(a->a, a->a+a->nz);

1288:           /* assign the pointer */
1289:           matstruct->mat = matrix;

1291:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1292:           CsrMatrix *matrix= new CsrMatrix;
1293:           matrix->num_rows = m;
1294:           matrix->num_cols = A->cmap->n;
1295:           matrix->num_entries = a->nz;
1296:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1297:           matrix->row_offsets->assign(ii, ii + m+1);

1299:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1300:           matrix->column_indices->assign(a->j, a->j+a->nz);

1302:           matrix->values = new THRUSTARRAY(a->nz);
1303:           matrix->values->assign(a->a, a->a+a->nz);

1305:           cusparseHybMat_t hybMat;
1306:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1307:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1310:               matstruct->descr, matrix->values->data().get(),
1311:               matrix->row_offsets->data().get(),
1312:               matrix->column_indices->data().get(),
1313:               hybMat, 0, partition);CHKERRCUDA(stat);
1314:           /* assign the pointer */
1315:           matstruct->mat = hybMat;

1317:           if (matrix) {
1318:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1319:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1320:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1321:             delete (CsrMatrix*)matrix;
1322:           }
1323:         }

1325:         /* assign the compressed row indices */
1326:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1327:         matstruct->cprowIndices->assign(ridx,ridx+m);
1328:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1330:         /* assign the pointer */
1331:         cusparsestruct->mat = matstruct;

1333:         if (!a->compressedrow.use) {
1334:           PetscFree(ii);
1335:           PetscFree(ridx);
1336:         }
1337:         cusparsestruct->workVector = new THRUSTARRAY(m);
1338:       } catch(char *ex) {
1339:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1340:       }
1341:       cusparsestruct->nonzerostate = A->nonzerostate;
1342:     }
1343:     WaitForGPU();CHKERRCUDA(ierr);
1344:     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1345:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1346:   }
1347:   return(0);
1348: }

/* Thrust functor used with zip iterators: given a tuple (src, dst), it
   accumulates src into dst (dst += src).  Used below to scatter-add the
   compressed-row work vector back into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};

/* Computes yy = A*xx by delegating to MatMultAdd with a NULL add-vector,
   so all the storage-format dispatch lives in one place. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{

  MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
  return(0);
}

/* Computes yy = A^T * xx.  Rather than asking cusparse to transpose on the
   fly, an explicit transpose of the matrix (matTranspose) is built lazily and
   cached on first use; the spmv then always runs with
   CUSPARSE_OPERATION_NON_TRANSPOSE against that cached transpose. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  MatSeqAIJCUSPARSECopyToGPU(A);
  matstructT = (Mat_SeqAIJCUSPARSE MultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose-multiply on this matrix: build and cache the explicit transpose */
    MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  VecCUDAGetArrayRead(xx,&xarray);
  VecCUDAGetArrayWrite(yy,&yarray);

  PetscLogGpuTimeBegin();
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR path: plain csrmv on the cached transpose; beta_zero overwrites yy */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB path: the cached transpose was converted to a cusparse hybrid matrix */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  VecCUDARestoreArrayRead(xx,&xarray);
  VecCUDARestoreArrayWrite(yy,&yarray);
  WaitForGPU();CHKERRCUDA(ierr);
  PetscLogGpuTimeEnd();
  /* 2 flops per nonzero (mul+add) minus one add per nonzero row */
  PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
  return(0);
}


/* Computes zz = A*xx + yy (or zz = A*xx when yy is NULL, i.e. when called
   from MatMult).  When the matrix uses compressed-row storage (zero rows
   removed), the spmv writes into an intermediate workVector of length
   "number of nonzero rows" and the result is then scatter-added into the
   full-length zz via cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  MatSeqAIJCUSPARSECopyToGPU(A);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* matrix is "uncompressed" when the workVector length equals the full row count */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    VecCUDAGetArrayRead(xx,&xarray);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      VecCUDAGetArray(zz,&zarray);
    } else {
      VecCUDAGetArrayWrite(zz,&zarray);
    }
    /* dptr: destination of the raw spmv — either zz directly, or the workVector when rows are compressed */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one makes the spmv itself accumulate (zz += A*xx) when zz aliases yy and is written in place */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    PetscLogGpuTimeBegin();
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    PetscLogGpuTimeEnd();

    if (yy) {
      if (dptr != zarray) {
        /* compressed case: seed zz with yy, then scatter-add the spmv result below */
        VecCopy_SeqCUDA(yy,zz);
      } else if (zz != yy) {
        VecAXPY_SeqCUDA(zz,1.0,yy);
      }
    } else if (dptr != zarray) {
      /* pure MatMult with compressed rows: rows not represented must end up zero */
      VecSet_SeqCUDA(zz,0);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    PetscLogGpuTimeBegin();
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* for each compressed row i: zz[cprowIndices[i]] += workVector[i] */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    PetscLogGpuTimeEnd();
    VecCUDARestoreArrayRead(xx,&xarray);
    if (yy && !cmpr) {
      VecCUDARestoreArray(zz,&zarray);
    } else {
      VecCUDARestoreArrayWrite(zz,&zarray);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  PetscLogGpuFlops(2.0*a->nz);
  return(0);
}

/* Computes zz = A^T*xx + yy using the lazily-built cached transpose
   (matTranspose), mirroring the structure of MatMultAdd_SeqAIJCUSPARSE:
   the spmv targets either zz directly or a workVector that is afterwards
   scatter-added into zz. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  MatSeqAIJCUSPARSECopyToGPU(A);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose op on this matrix: build and cache the explicit transpose */
    MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): zz is also copied from yy again below when dptr != zarray;
       this early copy looks redundant with that one — confirm before changing */
    VecCopy_SeqCUDA(yy,zz);
    VecCUDAGetArrayRead(xx,&xarray);
    VecCUDAGetArray(zz,&zarray);
    /* write directly into zz only when the workVector covers all A->cmap->n rows of A^T */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one lets the spmv accumulate in place when zz aliases yy */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    PetscLogGpuTimeBegin();
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    PetscLogGpuTimeEnd();

    if (dptr != zarray) {
      VecCopy_SeqCUDA(yy,zz);
    } else if (zz != yy) {
      VecAXPY_SeqCUDA(zz,1.0,yy);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    PetscLogGpuTimeBegin();
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the iteration bound here is A->cmap->n, while the analogous
         scatter in MatMultAdd uses workVector->size(); in this branch the sizes
         differ — verify the intended bound */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    PetscLogGpuTimeEnd();
    VecCUDARestoreArrayRead(xx,&xarray);
    VecCUDARestoreArray(zz,&zarray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  PetscLogGpuFlops(2.0*a->nz);
  return(0);
}

/* Finishes assembly: runs the host-side AIJ assembly, pushes the data to the
   GPU (final assembly of an unfactored matrix only), and (re)installs the
   CUSPARSE mult kernels in the operations table. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{

  MatAssemblyEnd_SeqAIJ(A,mode);
  if (mode == MAT_FLUSH_ASSEMBLY) return(0); /* nothing to push for a flush */
  if (A->factortype == MAT_FACTOR_NONE) {
    MatSeqAIJCUSPARSECopyToGPU(A);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  return(0);
}

1598: /* --------------------------------------------------------------------------------*/
1599: /*@
1600:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1602:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1603:    assembly performance the user should preallocate the matrix storage by setting
1604:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1605:    performance during matrix assembly can be increased by more than a factor of 50.

1607:    Collective

1609:    Input Parameters:
1610: +  comm - MPI communicator, set to PETSC_COMM_SELF
1611: .  m - number of rows
1612: .  n - number of columns
1613: .  nz - number of nonzeros per row (same for all rows)
1614: -  nnz - array containing the number of nonzeros in the various rows
1615:          (possibly different for each row) or NULL

1617:    Output Parameter:
1618: .  A - the matrix

1620:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1622:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1624:    Notes:
1625:    If nnz is given then nz is ignored

1627:    The AIJ format (also called the Yale sparse matrix format or
1628:    compressed row storage), is fully compatible with standard Fortran 77
1629:    storage.  That is, the stored row and column indices can begin at
1630:    either one (as in Fortran) or zero.  See the users' manual for details.

1632:    Specify the preallocated storage with either nz or nnz (not both).
1633:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1634:    allocation.  For large problems you MUST preallocate memory or you
1635:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1637:    By default, this format uses inodes (identical nodes) when possible, to
1638:    improve numerical efficiency of matrix-vector products and solves. We
1639:    search for consecutive rows with the same nonzero structure, thereby
1640:    reusing matrix information to achieve increased efficiency.

1642:    Level: intermediate

1644: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1645: @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,m,n); /* sequential: local sizes equal global sizes */
  MatSetType(*A,MATSEQAIJCUSPARSE);
  /* preallocate with nz per row, or per-row counts nnz if provided */
  MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
  return(0);
}

/* Destroys the GPU-side data (mult structures for an unfactored matrix,
   triangular-factor structures for a factored one) before handing off to the
   host-side SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  if (A->factortype==MAT_FACTOR_NONE) {
    /* only free GPU data if it was ever allocated */
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
    }
  } else {
    MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
  }
  MatDestroy_SeqAIJ(A);
  return(0);
}

/* Duplicates A into a new SEQAIJCUSPARSE matrix: host-side duplication via
   MatDuplicate_SeqAIJ, then fresh (empty) GPU-side structures — the GPU copy
   is deferred until first use (valid_GPU_matrix is left UNALLOCATED). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  MatDuplicate_SeqAIJ(A,cpvalues,B);
  C    = *B;
  /* CUSPARSE matrices pair with CUDA vectors by default */
  PetscFree(C->defaultvectype);
  PetscStrallocpy(VECCUDA,&C->defaultvectype);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    /* NOTE(review): this zero-assignment is immediately overwritten two lines
       below; it appears to be a harmless dead store */
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
  return(0);
}

/* Converts an existing SeqAIJ matrix in place to SEQAIJCUSPARSE: installs
   empty GPU-side structures (plain mult structures or triangular factors,
   depending on factortype), a cusparse handle, and the CUSPARSE operation
   table.  Actual data transfer to the GPU happens lazily at first use. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  /* CUSPARSE matrices pair with CUDA vectors by default */
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECCUDA,&B->defaultvectype);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
  return(0);
}

/* Type constructor registered for MATSEQAIJCUSPARSE: create a plain SeqAIJ
   matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{

  MatCreate_SeqAIJ(B);
  MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
  return(0);
}

1797: /*MC
1798:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
1801:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1802:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1804:    Options Database Keys:
1805: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1806: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1807: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1809:   Level: beginner

1811: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1812: M*/

1814: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Registers the CUSPARSE solver package for all four factorization kinds
   (LU, Cholesky, ILU, ICC) on SEQAIJCUSPARSE matrices; all route through the
   same factory, MatGetFactor_seqaijcusparse_cusparse. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{

  MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
  MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
  MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
  MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
  return(0);
}


/* Frees all GPU-side data of an unfactored SeqAIJCUSPARSE matrix: the mult
   structure, its cached transpose, the compressed-row work vector, and the
   cusparse handle.  No-op when *cusparsestruct is NULL; resets the pointer
   to 0 on success. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  if (*cusparsestruct) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    /* assignment moved out of the if-condition: same behavior, avoids the
       -Wparentheses "assignment used as truth value" hazard */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  return(0);
}

/* Releases the three thrust device arrays owned by a CsrMatrix and then the
   struct itself; a no-op when *mat is already NULL.  Leaves *mat == 0. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  return(0);
}

/* Overload for a single triangular-factor structure: destroys the cusparse
   matrix descriptor and solve-analysis info (if present), the CSR storage,
   and the struct itself.  No-op when *trifactor is NULL; resets it to 0. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    CsrMatrix_Destroy(&(*trifactor)->csrMat);
    delete *trifactor;
    *trifactor = 0;
  }
  return(0);
}

/* Overload for a mult structure: frees the stored matrix according to its
   storage format (HYB handle for ELL/HYB, CsrMatrix otherwise), the cusparse
   descriptor, the compressed-row index array, and the three device-resident
   scalar constants (alpha, beta_zero, beta_one).  Resets *matstruct to 0. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL is stored as a cusparse hybrid matrix as well (see CopyToGPU) */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta live in device memory (pointer mode DEVICE), hence cudaFree */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  return(0);
}

/* Frees all GPU-side data of a factored SeqAIJCUSPARSE matrix: the four
   triangular-factor structures (lower/upper, plain and transpose), the
   row/column permutation index arrays, the work vector, and the cusparse
   handle.  No-op when *trifactors is NULL; resets the pointer to 0. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  if (*trifactors) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* assignment moved out of the if-condition: same behavior, avoids the
       -Wparentheses "assignment used as truth value" hazard */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  return(0);
}