Actual source code: aijcusparse.cu

petsc-master 2019-07-20
Report Typos and Errors
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8:  #include <../src/mat/impls/aij/seq/aij.h>
  9:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10:  #include <../src/vec/vec/impls/dvecimpl.h>
 11:  #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13:  #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 15: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

 17: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 18: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 19: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 21: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 22: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 23: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 25: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 26: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 27: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 28: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 29: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 30: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 31: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 32: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 33: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

 35: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 36: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 37: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 38: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 39: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 41: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 42: {
 43:   cusparseStatus_t   stat;
 44:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 47:   cusparsestruct->stream = stream;
 48:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
 49:   return(0);
 50: }

 52: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 53: {
 54:   cusparseStatus_t   stat;
 55:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 58:   if (cusparsestruct->handle != handle) {
 59:     if (cusparsestruct->handle) {
 60:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
 61:     }
 62:     cusparsestruct->handle = handle;
 63:   }
 64:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
 65:   return(0);
 66: }

 68: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
 69: {
 70:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
 72:   if (cusparsestruct->handle)
 73:     cusparsestruct->handle = 0;
 74:   return(0);
 75: }

 77: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
 78: {
 80:   *type = MATSOLVERCUSPARSE;
 81:   return(0);
 82: }

 84: /*MC
 85:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 86:   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 87:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
 88:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
 89:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
 90:   algorithms are not recommended. This class does NOT support direct solver operations.

 92:   Level: beginner

 94: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
 95: M*/

 97: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
 98: {
100:   PetscInt       n = A->rmap->n;

103:   MatCreate(PetscObjectComm((PetscObject)A),B);
104:   (*B)->factortype = ftype;
105:   MatSetSizes(*B,n,n,n,n);
106:   MatSetType(*B,MATSEQAIJCUSPARSE);

108:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109:     MatSetBlockSizesFromMats(*B,A,A);
110:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

117:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
118:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
119:   return(0);
120: }

122: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123: {
124:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

127: #if CUDA_VERSION>=4020
128:   switch (op) {
129:   case MAT_CUSPARSE_MULT:
130:     cusparsestruct->format = format;
131:     break;
132:   case MAT_CUSPARSE_ALL:
133:     cusparsestruct->format = format;
134:     break;
135:   default:
136:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137:   }
138: #else
139:   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140: #endif
141:   return(0);
142: }

144: /*@
145:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146:    operation. Only the MatMult operation can use different GPU storage formats
147:    for MPIAIJCUSPARSE matrices.
148:    Not Collective

150:    Input Parameters:
151: +  A - Matrix of type SEQAIJCUSPARSE
152: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

155:    Output Parameter:

157:    Level: intermediate

159: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160: @*/
161: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162: {

167:   PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
168:   return(0);
169: }

171: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172: {
173:   PetscErrorCode           ierr;
174:   MatCUSPARSEStorageFormat format;
175:   PetscBool                flg;
176:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

179:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
180:   if (A->factortype==MAT_FACTOR_NONE) {
181:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
183:     if (flg) {
184:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
185:     }
186:   }
187:   PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188:                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
189:   if (flg) {
190:     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
191:   }
192:   PetscOptionsTail();
193:   return(0);

195: }

197: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198: {

202:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
203:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204:   return(0);
205: }

207: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208: {

212:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
213:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214:   return(0);
215: }

217: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218: {

222:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
223:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224:   return(0);
225: }

227: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228: {

232:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
233:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234:   return(0);
235: }

237: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238: {
239:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240:   PetscInt                          n = A->rmap->n;
241:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243:   cusparseStatus_t                  stat;
244:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245:   const MatScalar                   *aa = a->a,*v;
246:   PetscInt                          *AiLo, *AjLo;
247:   PetscScalar                       *AALo;
248:   PetscInt                          i,nz, nzLower, offset, rowOffset;
249:   PetscErrorCode                    ierr;

252:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253:     try {
254:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255:       nzLower=n+ai[n]-ai[1];

257:       /* Allocate Space for the lower triangular matrix */
258:       cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259:       cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260:       cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

262:       /* Fill the lower triangular matrix */
263:       AiLo[0]  = (PetscInt) 0;
264:       AiLo[n]  = nzLower;
265:       AjLo[0]  = (PetscInt) 0;
266:       AALo[0]  = (MatScalar) 1.0;
267:       v        = aa;
268:       vi       = aj;
269:       offset   = 1;
270:       rowOffset= 1;
271:       for (i=1; i<n; i++) {
272:         nz = ai[i+1] - ai[i];
273:         /* additional 1 for the term on the diagonal */
274:         AiLo[i]    = rowOffset;
275:         rowOffset += nz+1;

277:         PetscArraycpy(&(AjLo[offset]), vi, nz);
278:         PetscArraycpy(&(AALo[offset]), v, nz);

280:         offset      += nz;
281:         AjLo[offset] = (PetscInt) i;
282:         AALo[offset] = (MatScalar) 1.0;
283:         offset      += 1;

285:         v  += nz;
286:         vi += nz;
287:       }

289:       /* allocate space for the triangular factor information */
290:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

292:       /* Create the matrix description */
293:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

299:       /* Create the solve analysis information */
300:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

302:       /* set the operation */
303:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

305:       /* set the matrix */
306:       loTriFactor->csrMat = new CsrMatrix;
307:       loTriFactor->csrMat->num_rows = n;
308:       loTriFactor->csrMat->num_cols = n;
309:       loTriFactor->csrMat->num_entries = nzLower;

311:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

314:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

317:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

320:       /* perform the solve analysis */
321:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

326:       /* assign the pointer. Is this really necessary? */
327:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

329:       cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330:       cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
332:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
333:     } catch(char *ex) {
334:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335:     }
336:   }
337:   return(0);
338: }

340: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341: {
342:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343:   PetscInt                          n = A->rmap->n;
344:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346:   cusparseStatus_t                  stat;
347:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348:   const MatScalar                   *aa = a->a,*v;
349:   PetscInt                          *AiUp, *AjUp;
350:   PetscScalar                       *AAUp;
351:   PetscInt                          i,nz, nzUpper, offset;
352:   PetscErrorCode                    ierr;

355:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356:     try {
357:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358:       nzUpper = adiag[0]-adiag[n];

360:       /* Allocate Space for the upper triangular matrix */
361:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

365:       /* Fill the upper triangular matrix */
366:       AiUp[0]=(PetscInt) 0;
367:       AiUp[n]=nzUpper;
368:       offset = nzUpper;
369:       for (i=n-1; i>=0; i--) {
370:         v  = aa + adiag[i+1] + 1;
371:         vi = aj + adiag[i+1] + 1;

373:         /* number of elements NOT on the diagonal */
374:         nz = adiag[i] - adiag[i+1]-1;

376:         /* decrement the offset */
377:         offset -= (nz+1);

379:         /* first, set the diagonal elements */
380:         AjUp[offset] = (PetscInt) i;
381:         AAUp[offset] = (MatScalar)1./v[nz];
382:         AiUp[i]      = AiUp[i+1] - (nz+1);

384:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
385:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
386:       }

388:       /* allocate space for the triangular factor information */
389:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

391:       /* Create the matrix description */
392:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

398:       /* Create the solve analysis information */
399:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

401:       /* set the operation */
402:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

404:       /* set the matrix */
405:       upTriFactor->csrMat = new CsrMatrix;
406:       upTriFactor->csrMat->num_rows = n;
407:       upTriFactor->csrMat->num_cols = n;
408:       upTriFactor->csrMat->num_entries = nzUpper;

410:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

413:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

416:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

419:       /* perform the solve analysis */
420:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

425:       /* assign the pointer. Is this really necessary? */
426:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

428:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
432:     } catch(char *ex) {
433:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434:     }
435:   }
436:   return(0);
437: }

439: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440: {
441:   PetscErrorCode               ierr;
442:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444:   IS                           isrow = a->row,iscol = a->icol;
445:   PetscBool                    row_identity,col_identity;
446:   const PetscInt               *r,*c;
447:   PetscInt                     n = A->rmap->n;

450:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
451:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

453:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454:   cusparseTriFactors->nnz=a->nz;

456:   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457:   /*lower triangular indices */
458:   ISGetIndices(isrow,&r);
459:   ISIdentity(isrow,&row_identity);
460:   if (!row_identity) {
461:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462:     cusparseTriFactors->rpermIndices->assign(r, r+n);
463:   }
464:   ISRestoreIndices(isrow,&r);

466:   /*upper triangular indices */
467:   ISGetIndices(iscol,&c);
468:   ISIdentity(iscol,&col_identity);
469:   if (!col_identity) {
470:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471:     cusparseTriFactors->cpermIndices->assign(c, c+n);
472:   }

474:   if(!row_identity && !col_identity) {
475:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
476:   } else if(!row_identity) {
477:     PetscLogCpuToGpu(n*sizeof(PetscInt));
478:   } else if(!col_identity) {
479:     PetscLogCpuToGpu(n*sizeof(PetscInt));
480:   }

482:   ISRestoreIndices(iscol,&c);
483:   return(0);
484: }

486: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487: {
488:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492:   cusparseStatus_t                  stat;
493:   PetscErrorCode                    ierr;
494:   PetscInt                          *AiUp, *AjUp;
495:   PetscScalar                       *AAUp;
496:   PetscScalar                       *AALo;
497:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500:   const MatScalar                   *aa = b->a,*v;

503:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504:     try {
505:       /* Allocate Space for the upper triangular matrix */
506:       cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507:       cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508:       cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509:       cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

511:       /* Fill the upper triangular matrix */
512:       AiUp[0]=(PetscInt) 0;
513:       AiUp[n]=nzUpper;
514:       offset = 0;
515:       for (i=0; i<n; i++) {
516:         /* set the pointers */
517:         v  = aa + ai[i];
518:         vj = aj + ai[i];
519:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

521:         /* first, set the diagonal elements */
522:         AjUp[offset] = (PetscInt) i;
523:         AAUp[offset] = (MatScalar)1.0/v[nz];
524:         AiUp[i]      = offset;
525:         AALo[offset] = (MatScalar)1.0/v[nz];

527:         offset+=1;
528:         if (nz>0) {
529:           PetscArraycpy(&(AjUp[offset]), vj, nz);
530:           PetscArraycpy(&(AAUp[offset]), v, nz);
531:           for (j=offset; j<offset+nz; j++) {
532:             AAUp[j] = -AAUp[j];
533:             AALo[j] = AAUp[j]/v[nz];
534:           }
535:           offset+=nz;
536:         }
537:       }

539:       /* allocate space for the triangular factor information */
540:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

542:       /* Create the matrix description */
543:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

549:       /* Create the solve analysis information */
550:       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

552:       /* set the operation */
553:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

555:       /* set the matrix */
556:       upTriFactor->csrMat = new CsrMatrix;
557:       upTriFactor->csrMat->num_rows = A->rmap->n;
558:       upTriFactor->csrMat->num_cols = A->cmap->n;
559:       upTriFactor->csrMat->num_entries = a->nz;

561:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

564:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

567:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

570:       /* perform the solve analysis */
571:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

576:       /* assign the pointer. Is this really necessary? */
577:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

579:       /* allocate space for the triangular factor information */
580:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

582:       /* Create the matrix description */
583:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

589:       /* Create the solve analysis information */
590:       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

592:       /* set the operation */
593:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

595:       /* set the matrix */
596:       loTriFactor->csrMat = new CsrMatrix;
597:       loTriFactor->csrMat->num_rows = A->rmap->n;
598:       loTriFactor->csrMat->num_cols = A->cmap->n;
599:       loTriFactor->csrMat->num_entries = a->nz;

601:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

604:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

607:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
609:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

611:       /* perform the solve analysis */
612:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

617:       /* assign the pointer. Is this really necessary? */
618:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

620:       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621:       cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622:       cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623:       cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624:       cudaFreeHost(AALo);CHKERRCUDA(ierr);
625:     } catch(char *ex) {
626:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627:     }
628:   }
629:   return(0);
630: }

632: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
633: {
634:   PetscErrorCode               ierr;
635:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637:   IS                           ip = a->row;
638:   const PetscInt               *rip;
639:   PetscBool                    perm_identity;
640:   PetscInt                     n = A->rmap->n;

643:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
644:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

647:   /*lower triangular indices */
648:   ISGetIndices(ip,&rip);
649:   ISIdentity(ip,&perm_identity);
650:   if (!perm_identity) {
651:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
652:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
653:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
654:     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
655:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
656:   }
657:   ISRestoreIndices(ip,&rip);
658:   return(0);
659: }

661: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
662: {
663:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
664:   IS             isrow = b->row,iscol = b->col;
665:   PetscBool      row_identity,col_identity;

669:   MatLUFactorNumeric_SeqAIJ(B,A,info);
670:   /* determine which version of MatSolve needs to be used. */
671:   ISIdentity(isrow,&row_identity);
672:   ISIdentity(iscol,&col_identity);
673:   if (row_identity && col_identity) {
674:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
675:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
676:   } else {
677:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
678:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
679:   }

681:   /* get the triangular factors */
682:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
683:   return(0);
684: }

686: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
687: {
688:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
689:   IS             ip = b->row;
690:   PetscBool      perm_identity;

694:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);

696:   /* determine which version of MatSolve needs to be used. */
697:   ISIdentity(ip,&perm_identity);
698:   if (perm_identity) {
699:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
700:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
701:   } else {
702:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
703:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
704:   }

706:   /* get the triangular factors */
707:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
708:   return(0);
709: }

711: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
712: {
713:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
714:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
715:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
716:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
717:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
718:   cusparseStatus_t                  stat;
719:   cusparseIndexBase_t               indexBase;
720:   cusparseMatrixType_t              matrixType;
721:   cusparseFillMode_t                fillMode;
722:   cusparseDiagType_t                diagType;


726:   /*********************************************/
727:   /* Now the Transpose of the Lower Tri Factor */
728:   /*********************************************/

730:   /* allocate space for the transpose of the lower triangular factor */
731:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

733:   /* set the matrix descriptors of the lower triangular factor */
734:   matrixType = cusparseGetMatType(loTriFactor->descr);
735:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
736:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
738:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

740:   /* Create the matrix description */
741:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
742:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
743:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
744:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
745:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

747:   /* Create the solve analysis information */
748:   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

750:   /* set the operation */
751:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

753:   /* allocate GPU space for the CSC of the lower triangular factor*/
754:   loTriFactorT->csrMat = new CsrMatrix;
755:   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
756:   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
757:   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
758:   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
759:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
760:   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

762:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
763:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
764:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
765:                           loTriFactor->csrMat->values->data().get(),
766:                           loTriFactor->csrMat->row_offsets->data().get(),
767:                           loTriFactor->csrMat->column_indices->data().get(),
768:                           loTriFactorT->csrMat->values->data().get(),
769:                           loTriFactorT->csrMat->column_indices->data().get(),
770:                           loTriFactorT->csrMat->row_offsets->data().get(),
771:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

773:   /* perform the solve analysis on the transposed matrix */
774:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
775:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
776:                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
777:                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
778:                            loTriFactorT->solveInfo);CHKERRCUDA(stat);

780:   /* assign the pointer. Is this really necessary? */
781:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

783:   /*********************************************/
784:   /* Now the Transpose of the Upper Tri Factor */
785:   /*********************************************/

787:   /* allocate space for the transpose of the upper triangular factor */
788:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

790:   /* set the matrix descriptors of the upper triangular factor */
791:   matrixType = cusparseGetMatType(upTriFactor->descr);
792:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
793:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
794:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
795:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

797:   /* Create the matrix description */
798:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
799:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
800:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
801:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
802:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

804:   /* Create the solve analysis information */
805:   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

807:   /* set the operation */
808:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

810:   /* allocate GPU space for the CSC of the upper triangular factor*/
811:   upTriFactorT->csrMat = new CsrMatrix;
812:   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
813:   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
814:   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
815:   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
816:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
817:   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

819:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
820:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
821:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
822:                           upTriFactor->csrMat->values->data().get(),
823:                           upTriFactor->csrMat->row_offsets->data().get(),
824:                           upTriFactor->csrMat->column_indices->data().get(),
825:                           upTriFactorT->csrMat->values->data().get(),
826:                           upTriFactorT->csrMat->column_indices->data().get(),
827:                           upTriFactorT->csrMat->row_offsets->data().get(),
828:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

830:   /* perform the solve analysis on the transposed matrix */
831:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
832:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
833:                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
834:                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
835:                            upTriFactorT->solveInfo);CHKERRCUDA(stat);

837:   /* assign the pointer. Is this really necessary? */
838:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
839:   return(0);
840: }

842: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
843: {
844:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
845:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
846:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
847:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
848:   cusparseStatus_t             stat;
849:   cusparseIndexBase_t          indexBase;
850:   cudaError_t                  err;
851:   PetscErrorCode               ierr;


855:   /* allocate space for the triangular factor information */
856:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
857:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
858:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
859:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
860:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

862:   /* set alpha and beta */
863:   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
864:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
865:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
866:   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
867:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
868:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
869:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

871:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
872:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
873:     CsrMatrix *matrixT= new CsrMatrix;
874:     matrixT->num_rows = A->cmap->n;
875:     matrixT->num_cols = A->rmap->n;
876:     matrixT->num_entries = a->nz;
877:     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
878:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
879:     matrixT->values = new THRUSTARRAY(a->nz);

881:     /* compute the transpose of the upper triangular factor, i.e. the CSC */
882:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
883:     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
884:                             matrix->num_cols, matrix->num_entries,
885:                             matrix->values->data().get(),
886:                             matrix->row_offsets->data().get(),
887:                             matrix->column_indices->data().get(),
888:                             matrixT->values->data().get(),
889:                             matrixT->column_indices->data().get(),
890:                             matrixT->row_offsets->data().get(),
891:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

893:     /* assign the pointer */
894:     matstructT->mat = matrixT;
895:     PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));
896:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
897: #if CUDA_VERSION>=5000
898:     /* First convert HYB to CSR */
899:     CsrMatrix *temp= new CsrMatrix;
900:     temp->num_rows = A->rmap->n;
901:     temp->num_cols = A->cmap->n;
902:     temp->num_entries = a->nz;
903:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
905:     temp->values = new THRUSTARRAY(a->nz);


908:     stat = cusparse_hyb2csr(cusparsestruct->handle,
909:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
910:                             temp->values->data().get(),
911:                             temp->row_offsets->data().get(),
912:                             temp->column_indices->data().get());CHKERRCUDA(stat);

914:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
915:     CsrMatrix *tempT= new CsrMatrix;
916:     tempT->num_rows = A->rmap->n;
917:     tempT->num_cols = A->cmap->n;
918:     tempT->num_entries = a->nz;
919:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
921:     tempT->values = new THRUSTARRAY(a->nz);

923:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
924:                             temp->num_cols, temp->num_entries,
925:                             temp->values->data().get(),
926:                             temp->row_offsets->data().get(),
927:                             temp->column_indices->data().get(),
928:                             tempT->values->data().get(),
929:                             tempT->column_indices->data().get(),
930:                             tempT->row_offsets->data().get(),
931:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

933:     /* Last, convert CSC to HYB */
934:     cusparseHybMat_t hybMat;
935:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
936:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
937:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
938:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
939:                             matstructT->descr, tempT->values->data().get(),
940:                             tempT->row_offsets->data().get(),
941:                             tempT->column_indices->data().get(),
942:                             hybMat, 0, partition);CHKERRCUDA(stat);

944:     /* assign the pointer */
945:     matstructT->mat = hybMat;
946:     PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));

948:     /* delete temporaries */
949:     if (tempT) {
950:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
951:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
952:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
953:       delete (CsrMatrix*) tempT;
954:     }
955:     if (temp) {
956:       if (temp->values) delete (THRUSTARRAY*) temp->values;
957:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
958:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
959:       delete (CsrMatrix*) temp;
960:     }
961: #else
962:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
963: #endif
964:   }
965:   /* assign the compressed row indices */
966:   matstructT->cprowIndices = new THRUSTINTARRAY;
967:   matstructT->cprowIndices->resize(A->cmap->n);
968:   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
969:   /* assign the pointer */
970:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
971:   return(0);
972: }

974: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
975: {
976:   PetscInt                              n = xx->map->n;
977:   const PetscScalar                     *barray;
978:   PetscScalar                           *xarray;
979:   thrust::device_ptr<const PetscScalar> bGPU;
980:   thrust::device_ptr<PetscScalar>       xGPU;
981:   cusparseStatus_t                      stat;
982:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
983:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
984:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
985:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
986:   PetscErrorCode                        ierr;

989:   /* Analyze the matrix and create the transpose ... on the fly */
990:   if (!loTriFactorT && !upTriFactorT) {
991:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
992:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
993:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
994:   }

996:   /* Get the GPU pointers */
997:   VecCUDAGetArrayWrite(xx,&xarray);
998:   VecCUDAGetArrayRead(bb,&barray);
999:   xGPU = thrust::device_pointer_cast(xarray);
1000:   bGPU = thrust::device_pointer_cast(barray);

1002:   PetscLogGpuTimeBegin();
1003:   /* First, reorder with the row permutation */
1004:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1005:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1006:                xGPU);

1008:   /* First, solve U */
1009:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1010:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1011:                         upTriFactorT->csrMat->values->data().get(),
1012:                         upTriFactorT->csrMat->row_offsets->data().get(),
1013:                         upTriFactorT->csrMat->column_indices->data().get(),
1014:                         upTriFactorT->solveInfo,
1015:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1017:   /* Then, solve L */
1018:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1019:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1020:                         loTriFactorT->csrMat->values->data().get(),
1021:                         loTriFactorT->csrMat->row_offsets->data().get(),
1022:                         loTriFactorT->csrMat->column_indices->data().get(),
1023:                         loTriFactorT->solveInfo,
1024:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);

1026:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1027:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1028:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1029:                tempGPU->begin());

1031:   /* Copy the temporary to the full solution. */
1032:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1033:   PetscLogGpuTimeEnd();

1035:   /* restore */
1036:   VecCUDARestoreArrayRead(bb,&barray);
1037:   VecCUDARestoreArrayWrite(xx,&xarray);
1038:   WaitForGPU();CHKERRCUDA(ierr);
1039:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1040:   return(0);
1041: }

1043: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1044: {
1045:   const PetscScalar                 *barray;
1046:   PetscScalar                       *xarray;
1047:   cusparseStatus_t                  stat;
1048:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1049:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1050:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1051:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1052:   PetscErrorCode                    ierr;

1055:   /* Analyze the matrix and create the transpose ... on the fly */
1056:   if (!loTriFactorT && !upTriFactorT) {
1057:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1058:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1059:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1060:   }

1062:   /* Get the GPU pointers */
1063:   VecCUDAGetArrayWrite(xx,&xarray);
1064:   VecCUDAGetArrayRead(bb,&barray);

1066:   PetscLogGpuTimeBegin();
1067:   /* First, solve U */
1068:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069:                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1070:                         upTriFactorT->csrMat->values->data().get(),
1071:                         upTriFactorT->csrMat->row_offsets->data().get(),
1072:                         upTriFactorT->csrMat->column_indices->data().get(),
1073:                         upTriFactorT->solveInfo,
1074:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1076:   /* Then, solve L */
1077:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1078:                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1079:                         loTriFactorT->csrMat->values->data().get(),
1080:                         loTriFactorT->csrMat->row_offsets->data().get(),
1081:                         loTriFactorT->csrMat->column_indices->data().get(),
1082:                         loTriFactorT->solveInfo,
1083:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1084:   PetscLogGpuTimeEnd();

1086:   /* restore */
1087:   VecCUDARestoreArrayRead(bb,&barray);
1088:   VecCUDARestoreArrayWrite(xx,&xarray);
1089:   WaitForGPU();CHKERRCUDA(ierr);
1090:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1091:   return(0);
1092: }

1094: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1095: {
1096:   const PetscScalar                     *barray;
1097:   PetscScalar                           *xarray;
1098:   thrust::device_ptr<const PetscScalar> bGPU;
1099:   thrust::device_ptr<PetscScalar>       xGPU;
1100:   cusparseStatus_t                      stat;
1101:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1102:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1103:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1104:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1105:   PetscErrorCode                        ierr;


1109:   /* Get the GPU pointers */
1110:   VecCUDAGetArrayWrite(xx,&xarray);
1111:   VecCUDAGetArrayRead(bb,&barray);
1112:   xGPU = thrust::device_pointer_cast(xarray);
1113:   bGPU = thrust::device_pointer_cast(barray);

1115:   PetscLogGpuTimeBegin();
1116:   /* First, reorder with the row permutation */
1117:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1118:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1119:                xGPU);

1121:   /* Next, solve L */
1122:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1123:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1124:                         loTriFactor->csrMat->values->data().get(),
1125:                         loTriFactor->csrMat->row_offsets->data().get(),
1126:                         loTriFactor->csrMat->column_indices->data().get(),
1127:                         loTriFactor->solveInfo,
1128:                         xarray, tempGPU->data().get());CHKERRCUDA(stat);

1130:   /* Then, solve U */
1131:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1132:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1133:                         upTriFactor->csrMat->values->data().get(),
1134:                         upTriFactor->csrMat->row_offsets->data().get(),
1135:                         upTriFactor->csrMat->column_indices->data().get(),
1136:                         upTriFactor->solveInfo,
1137:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1138:   PetscLogGpuTimeEnd();

1140:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1141:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1142:                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1143:                tempGPU->begin());

1145:   /* Copy the temporary to the full solution. */
1146:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1148:   VecCUDARestoreArrayRead(bb,&barray);
1149:   VecCUDARestoreArrayWrite(xx,&xarray);
1150:   WaitForGPU();CHKERRCUDA(ierr);
1151:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1152:   return(0);
1153: }

1155: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1156: {
1157:   const PetscScalar                 *barray;
1158:   PetscScalar                       *xarray;
1159:   cusparseStatus_t                  stat;
1160:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1161:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1162:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1163:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1164:   PetscErrorCode                    ierr;

1167:   /* Get the GPU pointers */
1168:   VecCUDAGetArrayWrite(xx,&xarray);
1169:   VecCUDAGetArrayRead(bb,&barray);

1171:   PetscLogGpuTimeBegin();
1172:   /* First, solve L */
1173:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1174:                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1175:                         loTriFactor->csrMat->values->data().get(),
1176:                         loTriFactor->csrMat->row_offsets->data().get(),
1177:                         loTriFactor->csrMat->column_indices->data().get(),
1178:                         loTriFactor->solveInfo,
1179:                         barray, tempGPU->data().get());CHKERRCUDA(stat);

1181:   /* Next, solve U */
1182:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1183:                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1184:                         upTriFactor->csrMat->values->data().get(),
1185:                         upTriFactor->csrMat->row_offsets->data().get(),
1186:                         upTriFactor->csrMat->column_indices->data().get(),
1187:                         upTriFactor->solveInfo,
1188:                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1189:   PetscLogGpuTimeEnd();

1191:   VecCUDARestoreArrayRead(bb,&barray);
1192:   VecCUDARestoreArrayWrite(xx,&xarray);
1193:   WaitForGPU();CHKERRCUDA(ierr);
1194:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1195:   return(0);
1196: }

1198: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1199: {

1201:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1202:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1203:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1204:   PetscInt                     m = A->rmap->n,*ii,*ridx;
1205:   PetscErrorCode               ierr;
1206:   cusparseStatus_t             stat;
1207:   cudaError_t                  err;

1210:   if (A->pinnedtocpu) return(0);
1211:   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1212:     PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1213:     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1214:       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1215:       /* copy values only */
1216:       matrix->values->assign(a->a, a->a+a->nz);
1217:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1218:     } else {
1219:       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1220:       try {
1221:         cusparsestruct->nonzerorow=0;
1222:         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

1224:         if (a->compressedrow.use) {
1225:           m    = a->compressedrow.nrows;
1226:           ii   = a->compressedrow.i;
1227:           ridx = a->compressedrow.rindex;
1228:         } else {
1229:           /* Forcing compressed row on the GPU */
1230:           int k=0;
1231:           PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);
1232:           PetscMalloc1(cusparsestruct->nonzerorow, &ridx);
1233:           ii[0]=0;
1234:           for (int j = 0; j<m; j++) {
1235:             if ((a->i[j+1]-a->i[j])>0) {
1236:               ii[k]  = a->i[j];
1237:               ridx[k]= j;
1238:               k++;
1239:             }
1240:           }
1241:           ii[cusparsestruct->nonzerorow] = a->nz;
1242:           m = cusparsestruct->nonzerorow;
1243:         }

1245:         /* allocate space for the triangular factor information */
1246:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1247:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1248:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1249:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

1251:         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1252:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1253:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1254:         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1255:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1256:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1257:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

1259:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1260:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1261:           /* set the matrix */
1262:           CsrMatrix *matrix= new CsrMatrix;
1263:           matrix->num_rows = m;
1264:           matrix->num_cols = A->cmap->n;
1265:           matrix->num_entries = a->nz;
1266:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1267:           matrix->row_offsets->assign(ii, ii + m+1);

1269:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1270:           matrix->column_indices->assign(a->j, a->j+a->nz);

1272:           matrix->values = new THRUSTARRAY(a->nz);
1273:           matrix->values->assign(a->a, a->a+a->nz);

1275:           /* assign the pointer */
1276:           matstruct->mat = matrix;

1278:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1279: #if CUDA_VERSION>=4020
1280:           CsrMatrix *matrix= new CsrMatrix;
1281:           matrix->num_rows = m;
1282:           matrix->num_cols = A->cmap->n;
1283:           matrix->num_entries = a->nz;
1284:           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1285:           matrix->row_offsets->assign(ii, ii + m+1);

1287:           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1288:           matrix->column_indices->assign(a->j, a->j+a->nz);

1290:           matrix->values = new THRUSTARRAY(a->nz);
1291:           matrix->values->assign(a->a, a->a+a->nz);

1293:           cusparseHybMat_t hybMat;
1294:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1295:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1296:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1297:           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1298:               matstruct->descr, matrix->values->data().get(),
1299:               matrix->row_offsets->data().get(),
1300:               matrix->column_indices->data().get(),
1301:               hybMat, 0, partition);CHKERRCUDA(stat);
1302:           /* assign the pointer */
1303:           matstruct->mat = hybMat;

1305:           if (matrix) {
1306:             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1307:             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1308:             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1309:             delete (CsrMatrix*)matrix;
1310:           }
1311: #endif
1312:         }

1314:         /* assign the compressed row indices */
1315:         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1316:         matstruct->cprowIndices->assign(ridx,ridx+m);
1317:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1319:         /* assign the pointer */
1320:         cusparsestruct->mat = matstruct;

1322:         if (!a->compressedrow.use) {
1323:           PetscFree(ii);
1324:           PetscFree(ridx);
1325:         }
1326:         cusparsestruct->workVector = new THRUSTARRAY(m);
1327:       } catch(char *ex) {
1328:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1329:       }
1330:       cusparsestruct->nonzerostate = A->nonzerostate;
1331:     }
1332:     WaitForGPU();CHKERRCUDA(ierr);
1333:     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1334:     PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1335:   }
1336:   return(0);
1337: }

1339: struct VecCUDAPlusEquals
1340: {
1341:   template <typename Tuple>
1342:   __host__ __device__
1343:   void operator()(Tuple t)
1344:   {
1345:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1346:   }
1347: };

1349: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1350: {

1354:   MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);
1355:   return(0);
1356: }

1358: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1359: {
1360:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1361:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1362:   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1363:   const PetscScalar            *xarray;
1364:   PetscScalar                  *yarray;
1365:   PetscErrorCode               ierr;
1366:   cusparseStatus_t             stat;

1369:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1370:   MatSeqAIJCUSPARSECopyToGPU(A);
1371:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1372:   if (!matstructT) {
1373:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1374:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1375:   }
1376:   VecCUDAGetArrayRead(xx,&xarray);
1377:   VecSet(yy,0);
1378:   VecCUDAGetArrayWrite(yy,&yarray);

1380:   PetscLogGpuTimeBegin();
1381:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1382:     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1383:     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1384:                              mat->num_rows, mat->num_cols,
1385:                              mat->num_entries, matstructT->alpha, matstructT->descr,
1386:                              mat->values->data().get(), mat->row_offsets->data().get(),
1387:                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1388:                              yarray);CHKERRCUDA(stat);
1389:   } else {
1390: #if CUDA_VERSION>=4020
1391:     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1392:     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1393:                              matstructT->alpha, matstructT->descr, hybMat,
1394:                              xarray, matstructT->beta_zero,
1395:                              yarray);CHKERRCUDA(stat);
1396: #endif
1397:   }
1398:   PetscLogGpuTimeEnd();
1399:   VecCUDARestoreArrayRead(xx,&xarray);
1400:   VecCUDARestoreArrayWrite(yy,&yarray);
1401:   if (!cusparsestruct->stream) {
1402:     WaitForGPU();CHKERRCUDA(ierr);
1403:   }
1404:   PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);
1405:   return(0);
1406: }


1409: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1410: {
1411:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1412:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1413:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1414:   const PetscScalar            *xarray;
1415:   PetscScalar                  *zarray,*dptr,*beta;
1416:   PetscErrorCode               ierr;
1417:   cusparseStatus_t             stat;

1420:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1421:   MatSeqAIJCUSPARSECopyToGPU(A);
1422:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1423:   try {
1424:     VecCUDAGetArrayRead(xx,&xarray);
1425:     VecCUDAGetArray(zz,&zarray);
1426:     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1427:     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

1429:     PetscLogGpuTimeBegin();
1430:     /* csr_spmv is multiply add */
1431:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1432:       /* here we need to be careful to set the number of rows in the multiply to the
1433:          number of compressed rows in the matrix ... which is equivalent to the
1434:          size of the workVector */
1435:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1436:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1437:                                mat->num_rows, mat->num_cols,
1438:                                mat->num_entries, matstruct->alpha, matstruct->descr,
1439:                                mat->values->data().get(), mat->row_offsets->data().get(),
1440:                                mat->column_indices->data().get(), xarray, beta,
1441:                                dptr);CHKERRCUDA(stat);
1442:     } else {
1443: #if CUDA_VERSION>=4020
1444:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1445:       if (cusparsestruct->workVector->size()) {
1446:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1447:                                  matstruct->alpha, matstruct->descr, hybMat,
1448:                                  xarray, beta,
1449:                                  dptr);CHKERRCUDA(stat);
1450:       }
1451: #endif
1452:     }
1453:     PetscLogGpuTimeEnd();

1455:     if (yy) {
1456:       if (dptr != zarray) {
1457:         VecCopy_SeqCUDA(yy,zz);
1458:       } else if (zz != yy) {
1459:         VecAXPY_SeqCUDA(zz,1.0,yy);
1460:       }
1461:     } else if (dptr != zarray) {
1462:       VecSet(zz,0);
1463:     }
1464:     /* scatter the data from the temporary into the full vector with a += operation */
1465:     PetscLogGpuTimeBegin();
1466:     if (dptr != zarray) {
1467:       thrust::device_ptr<PetscScalar> zptr;

1469:       zptr = thrust::device_pointer_cast(zarray);
1470:       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1471:                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1472:                        VecCUDAPlusEquals());
1473:     }
1474:     PetscLogGpuTimeEnd();
1475:     VecCUDARestoreArrayRead(xx,&xarray);
1476:     VecCUDARestoreArray(zz,&zarray);
1477:   } catch(char *ex) {
1478:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1479:   }
1480:   if (!yy) { /* MatMult */
1481:     if (!cusparsestruct->stream) {
1482:       WaitForGPU();CHKERRCUDA(ierr);
1483:     }
1484:   }
1485:   PetscLogGpuFlops(2.0*a->nz);
1486:   return(0);
1487: }

1489: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1490: {
1491:   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1492:   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1493:   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1494:   thrust::device_ptr<PetscScalar> zptr;
1495:   const PetscScalar               *xarray;
1496:   PetscScalar                     *zarray;
1497:   PetscErrorCode                  ierr;
1498:   cusparseStatus_t                stat;

1501:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1502:   MatSeqAIJCUSPARSECopyToGPU(A);
1503:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1504:   if (!matstructT) {
1505:     MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1506:     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1507:   }

1509:   try {
1510:     VecCopy_SeqCUDA(yy,zz);
1511:     VecCUDAGetArrayRead(xx,&xarray);
1512:     VecCUDAGetArray(zz,&zarray);
1513:     zptr = thrust::device_pointer_cast(zarray);

1515:     PetscLogGpuTimeBegin();
1516:     /* multiply add with matrix transpose */
1517:     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1518:       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1519:       /* here we need to be careful to set the number of rows in the multiply to the
1520:          number of compressed rows in the matrix ... which is equivalent to the
1521:          size of the workVector */
1522:       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1523:                                mat->num_rows, mat->num_cols,
1524:                                mat->num_entries, matstructT->alpha, matstructT->descr,
1525:                                mat->values->data().get(), mat->row_offsets->data().get(),
1526:                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1527:                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1528:     } else {
1529: #if CUDA_VERSION>=4020
1530:       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1531:       if (cusparsestruct->workVector->size()) {
1532:         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1533:             matstructT->alpha, matstructT->descr, hybMat,
1534:             xarray, matstructT->beta_zero,
1535:             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1536:       }
1537: #endif
1538:     }

1540:     /* scatter the data from the temporary into the full vector with a += operation */
1541:     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1542:         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1543:         VecCUDAPlusEquals());
1544:     PetscLogGpuTimeEnd();
1545:     VecCUDARestoreArrayRead(xx,&xarray);
1546:     VecCUDARestoreArray(zz,&zarray);

1548:   } catch(char *ex) {
1549:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1550:   }
1551:   WaitForGPU();CHKERRCUDA(ierr);
1552:   PetscLogGpuFlops(2.0*a->nz);
1553:   return(0);
1554: }

1556: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1557: {

1561:   MatAssemblyEnd_SeqAIJ(A,mode);
1562:   if (A->factortype==MAT_FACTOR_NONE) {
1563:     MatSeqAIJCUSPARSECopyToGPU(A);
1564:   }
1565:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1566:   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1567:   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1568:   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1569:   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1570:   return(0);
1571: }

1573: /* --------------------------------------------------------------------------------*/
1574: /*@
1575:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1576:    (the default parallel PETSc format). This matrix will ultimately pushed down
1577:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1578:    assembly performance the user should preallocate the matrix storage by setting
1579:    the parameter nz (or the array nnz).  By setting these parameters accurately,
1580:    performance during matrix assembly can be increased by more than a factor of 50.

1582:    Collective

1584:    Input Parameters:
1585: +  comm - MPI communicator, set to PETSC_COMM_SELF
1586: .  m - number of rows
1587: .  n - number of columns
1588: .  nz - number of nonzeros per row (same for all rows)
1589: -  nnz - array containing the number of nonzeros in the various rows
1590:          (possibly different for each row) or NULL

1592:    Output Parameter:
1593: .  A - the matrix

1595:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1596:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1597:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1599:    Notes:
1600:    If nnz is given then nz is ignored

1602:    The AIJ format (also called the Yale sparse matrix format or
1603:    compressed row storage), is fully compatible with standard Fortran 77
1604:    storage.  That is, the stored row and column indices can begin at
1605:    either one (as in Fortran) or zero.  See the users' manual for details.

1607:    Specify the preallocated storage with either nz or nnz (not both).
1608:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1609:    allocation.  For large problems you MUST preallocate memory or you
1610:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1612:    By default, this format uses inodes (identical nodes) when possible, to
1613:    improve numerical efficiency of matrix-vector products and solves. We
1614:    search for consecutive rows with the same nonzero structure, thereby
1615:    reusing matrix information to achieve increased efficiency.

1617:    Level: intermediate

1619: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1620: @*/
1621: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1622: {

1626:   MatCreate(comm,A);
1627:   MatSetSizes(*A,m,n,m,n);
1628:   MatSetType(*A,MATSEQAIJCUSPARSE);
1629:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1630:   return(0);
1631: }

1633: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1634: {
1635:   PetscErrorCode   ierr;

1638:   if (A->factortype==MAT_FACTOR_NONE) {
1639:     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1640:       MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1641:     }
1642:   } else {
1643:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1644:   }
1645:   MatDestroy_SeqAIJ(A);
1646:   return(0);
1647: }

1649: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1650: {
1652:   Mat C;
1653:   cusparseStatus_t stat;
1654:   cusparseHandle_t handle=0;

1657:   MatDuplicate_SeqAIJ(A,cpvalues,B);
1658:   C    = *B;
1659:   PetscFree(C->defaultvectype);
1660:   PetscStrallocpy(VECCUDA,&C->defaultvectype);

1662:   /* inject CUSPARSE-specific stuff */
1663:   if (C->factortype==MAT_FACTOR_NONE) {
1664:     /* you cannot check the inode.use flag here since the matrix was just created.
1665:        now build a GPU matrix data structure */
1666:     C->spptr = new Mat_SeqAIJCUSPARSE;
1667:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1668:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1669:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1670:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1671:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1672:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1673:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1674:     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1675:   } else {
1676:     /* NEXT, set the pointers to the triangular factors */
1677:     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1678:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1679:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1680:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1681:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1682:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1683:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1684:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1685:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1686:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1687:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1688:     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1689:   }

1691:   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1692:   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1693:   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1694:   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1695:   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1696:   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1697:   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1698:   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1700:   PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);

1702:   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1704:   PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1705:   return(0);
1706: }

1708: PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1709: {
1711:   cusparseStatus_t stat;
1712:   cusparseHandle_t handle=0;

1715:   PetscFree(B->defaultvectype);
1716:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

1718:   if (B->factortype==MAT_FACTOR_NONE) {
1719:     /* you cannot check the inode.use flag here since the matrix was just created.
1720:        now build a GPU matrix data structure */
1721:     B->spptr = new Mat_SeqAIJCUSPARSE;
1722:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1723:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1724:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1725:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1726:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1727:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1728:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1729:     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1730:   } else {
1731:     /* NEXT, set the pointers to the triangular factors */
1732:     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1733:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1734:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1735:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1736:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1737:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1738:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1739:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1740:     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1741:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1742:     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1743:   }

1745:   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1746:   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1747:   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1748:   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1749:   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1750:   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1751:   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1752:   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

1754:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);

1756:   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

1758:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
1759:   return(0);
1760: }

1762: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1763: {

1767:   MatCreate_SeqAIJ(B);
1768:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1769:   return(0);
1770: }

1772: /*MC
1773:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

1775:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1776:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1777:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

1779:    Options Database Keys:
1780: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1781: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

1784:   Level: beginner

1786: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1787: M*/

1789: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


1792: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1793: {

1797:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
1798:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
1799:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
1800:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
1801:   return(0);
1802: }


1805: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1806: {
1807:   cusparseStatus_t stat;
1808:   cusparseHandle_t handle;

1811:   if (*cusparsestruct) {
1812:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1813:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1814:     delete (*cusparsestruct)->workVector;
1815:     if (handle = (*cusparsestruct)->handle) {
1816:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1817:     }
1818:     delete *cusparsestruct;
1819:     *cusparsestruct = 0;
1820:   }
1821:   return(0);
1822: }

1824: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1825: {
1827:   if (*mat) {
1828:     delete (*mat)->values;
1829:     delete (*mat)->column_indices;
1830:     delete (*mat)->row_offsets;
1831:     delete *mat;
1832:     *mat = 0;
1833:   }
1834:   return(0);
1835: }

1837: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1838: {
1839:   cusparseStatus_t stat;
1840:   PetscErrorCode   ierr;

1843:   if (*trifactor) {
1844:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1845:     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1846:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
1847:     delete *trifactor;
1848:     *trifactor = 0;
1849:   }
1850:   return(0);
1851: }

1853: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1854: {
1855:   CsrMatrix        *mat;
1856:   cusparseStatus_t stat;
1857:   cudaError_t      err;

1860:   if (*matstruct) {
1861:     if ((*matstruct)->mat) {
1862:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1863:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1864:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1865:       } else {
1866:         mat = (CsrMatrix*)(*matstruct)->mat;
1867:         CsrMatrix_Destroy(&mat);
1868:       }
1869:     }
1870:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1871:     delete (*matstruct)->cprowIndices;
1872:     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1873:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1874:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1875:     delete *matstruct;
1876:     *matstruct = 0;
1877:   }
1878:   return(0);
1879: }

1881: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1882: {
1883:   cusparseHandle_t handle;
1884:   cusparseStatus_t stat;

1887:   if (*trifactors) {
1888:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1889:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1890:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1891:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1892:     delete (*trifactors)->rpermIndices;
1893:     delete (*trifactors)->cpermIndices;
1894:     delete (*trifactors)->workVector;
1895:     if (handle = (*trifactors)->handle) {
1896:       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1897:     }
1898:     delete *trifactors;
1899:     *trifactors = 0;
1900:   }
1901:   return(0);
1902: }