Actual source code: mpiaijcusparse.cu

petsc-master 2020-10-23
Report Typos and Errors
  1: #define PETSC_SKIP_SPINLOCK
  2: #define PETSC_SKIP_CXX_COMPLEX_FIX
  3: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  5: #include <petscconf.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  8: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

 10: PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
 11: {
 12:   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
 13:   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
 14:   PetscErrorCode     ierr;
 15:   PetscInt           i;

 18:   PetscLayoutSetUp(B->rmap);
 19:   PetscLayoutSetUp(B->cmap);
 20:   if (d_nnz) {
 21:     for (i=0; i<B->rmap->n; i++) {
 22:       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
 23:     }
 24:   }
 25:   if (o_nnz) {
 26:     for (i=0; i<B->rmap->n; i++) {
 27:       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
 28:     }
 29:   }
 30:   if (!B->preallocated) {
 31:     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
 32:     MatCreate(PETSC_COMM_SELF,&b->A);
 33:     MatBindToCPU(b->A,B->boundtocpu);
 34:     MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
 35:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
 36:     MatCreate(PETSC_COMM_SELF,&b->B);
 37:     MatBindToCPU(b->B,B->boundtocpu);
 38:     MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
 39:     PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
 40:   }
 41:   MatSetType(b->A,MATSEQAIJCUSPARSE);
 42:   MatSetType(b->B,MATSEQAIJCUSPARSE);
 43:   if (b->lvec) {
 44:     VecSetType(b->lvec,VECSEQCUDA);
 45:   }
 46:   MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
 47:   MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
 48:   MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
 49:   MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
 50:   MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
 51:   MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
 52:   MatCUSPARSESetStream(b->A,cusparseStruct->stream);
 53:   MatCUSPARSESetStream(b->B,cusparseStruct->stream);

 55:   B->preallocated = PETSC_TRUE;
 56:   return(0);
 57: }

 59: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
 60: {
 61:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
 63:   PetscInt       nt;

 66:   VecGetLocalSize(xx,&nt);
 67:   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
 68:   VecScatterInitializeForGPU(a->Mvctx,xx);
 69:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
 70:   (*a->A->ops->mult)(a->A,xx,yy);
 71:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
 72:   (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
 73:   VecScatterFinalizeForGPU(a->Mvctx);
 74:   return(0);
 75: }

 77: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
 78: {
 79:   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;

 83:   if (A->factortype == MAT_FACTOR_NONE) {
 84:     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
 85:     PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
 86:     if (d_mat) {
 87:       Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
 88:       Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
 89:       PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
 90:       cudaError_t  err;
 91:       PetscScalar  *vals;
 92:       PetscInfo(A,"Zero device matrix diag and offfdiag\n");
 93:       err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 94:       err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
 95:       err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 96:       err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
 97:     }
 98:   }
 99:   MatZeroEntries(l->A);
100:   MatZeroEntries(l->B);

102:   return(0);
103: }
104: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
105: {
106:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
108:   PetscInt       nt;

111:   VecGetLocalSize(xx,&nt);
112:   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
113:   VecScatterInitializeForGPU(a->Mvctx,xx);
114:   VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
115:   (*a->A->ops->multadd)(a->A,xx,yy,zz);
116:   VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
117:   (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
118:   VecScatterFinalizeForGPU(a->Mvctx);
119:   return(0);
120: }

122: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
123: {
124:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
126:   PetscInt       nt;

129:   VecGetLocalSize(xx,&nt);
130:   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
131:   VecScatterInitializeForGPU(a->Mvctx,a->lvec);
132:   (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
133:   (*a->A->ops->multtranspose)(a->A,xx,yy);
134:   VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
135:   VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
136:   VecScatterFinalizeForGPU(a->Mvctx);
137:   return(0);
138: }

140: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
141: {
142:   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
143:   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

146:   switch (op) {
147:   case MAT_CUSPARSE_MULT_DIAG:
148:     cusparseStruct->diagGPUMatFormat = format;
149:     break;
150:   case MAT_CUSPARSE_MULT_OFFDIAG:
151:     cusparseStruct->offdiagGPUMatFormat = format;
152:     break;
153:   case MAT_CUSPARSE_ALL:
154:     cusparseStruct->diagGPUMatFormat    = format;
155:     cusparseStruct->offdiagGPUMatFormat = format;
156:     break;
157:   default:
158:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
159:   }
160:   return(0);
161: }

163: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
164: {
165:   MatCUSPARSEStorageFormat format;
166:   PetscErrorCode           ierr;
167:   PetscBool                flg;
168:   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
169:   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

172:   PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
173:   if (A->factortype==MAT_FACTOR_NONE) {
174:     PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
175:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
176:     if (flg) {
177:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
178:     }
179:     PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
180:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
181:     if (flg) {
182:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
183:     }
184:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
185:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
186:     if (flg) {
187:       MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
188:     }
189:   }
190:   PetscOptionsTail();
191:   return(0);
192: }

194: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
195: {
196:   PetscErrorCode             ierr;
197:   Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
198:   Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
199:   PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
200:   PetscInt                    nnz_state = A->nonzerostate;
202:   if (d_mat) {
203:     cudaError_t                err;
204:     err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
205:   }
206:   MatAssemblyEnd_MPIAIJ(A,mode);
207:   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
208:     VecSetType(mpiaij->lvec,VECSEQCUDA);
209:   }
210:   if (nnz_state > A->nonzerostate) {
211:     A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
212:   }

214:   return(0);
215: }

217: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
218: {
219:   PetscErrorCode     ierr;
220:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
221:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
222:   cudaError_t        err;
223:   cusparseStatus_t   stat;

226:   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
227:   if (cusparseStruct->deviceMat) {
228:     Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
229:     Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
230:     PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
231:     PetscInfo(A,"Have device matrix\n");
232:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
233:     if (jaca->compressedrow.use) {
234:       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
235:     }
236:     if (jacb->compressedrow.use) {
237:       err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
238:     }
239:     err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
240:     err = cudaFree(d_mat);CHKERRCUDA(err);
241:   }
242:   try {
243:     if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
244:     if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
245:     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
246:     if (cusparseStruct->stream) {
247:       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
248:     }
249:     delete cusparseStruct;
250:   } catch(char *ex) {
251:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
252:   }
253:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
254:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
255:   MatDestroy_MPIAIJ(A);
256:   return(0);
257: }

259: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
260: {
261:   PetscErrorCode     ierr;
262:   Mat_MPIAIJ         *a;
263:   Mat_MPIAIJCUSPARSE *cusparseStruct;
264:   cusparseStatus_t   stat;
265:   Mat                A;

268:   if (reuse == MAT_INITIAL_MATRIX) {
269:     MatDuplicate(B,MAT_COPY_VALUES,newmat);
270:   } else if (reuse == MAT_REUSE_MATRIX) {
271:     MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
272:   }
273:   A = *newmat;

275:   PetscFree(A->defaultvectype);
276:   PetscStrallocpy(VECCUDA,&A->defaultvectype);

278:   a = (Mat_MPIAIJ*)A->data;
279:   if (a->A) { MatSetType(a->A,MATSEQAIJCUSPARSE); }
280:   if (a->B) { MatSetType(a->B,MATSEQAIJCUSPARSE); }
281:   if (a->lvec) {
282:     VecSetType(a->lvec,VECSEQCUDA);
283:   }

285:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
286:     a->spptr = new Mat_MPIAIJCUSPARSE;

288:     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
289:     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
290:     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
291:     cusparseStruct->stream              = 0;
292:     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
293:     cusparseStruct->deviceMat = NULL;
294:   }

296:   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
297:   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
298:   A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
299:   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
300:   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
301:   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
302:   A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

304:   PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
305:   PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
306:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
307:   return(0);
308: }

310: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
311: {

315:   PetscCUDAInitializeCheck();
316:   MatCreate_MPIAIJ(A);
317:   MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
318:   return(0);
319: }

321: /*@
322:    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
323:    (the default parallel PETSc format).  This matrix will ultimately pushed down
324:    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
325:    assembly performance the user should preallocate the matrix storage by setting
326:    the parameter nz (or the array nnz).  By setting these parameters accurately,
327:    performance during matrix assembly can be increased by more than a factor of 50.

329:    Collective

331:    Input Parameters:
332: +  comm - MPI communicator, set to PETSC_COMM_SELF
333: .  m - number of rows
334: .  n - number of columns
335: .  nz - number of nonzeros per row (same for all rows)
336: -  nnz - array containing the number of nonzeros in the various rows
337:          (possibly different for each row) or NULL

339:    Output Parameter:
340: .  A - the matrix

342:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
343:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
344:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

346:    Notes:
347:    If nnz is given then nz is ignored

349:    The AIJ format (also called the Yale sparse matrix format or
350:    compressed row storage), is fully compatible with standard Fortran 77
351:    storage.  That is, the stored row and column indices can begin at
352:    either one (as in Fortran) or zero.  See the users' manual for details.

354:    Specify the preallocated storage with either nz or nnz (not both).
355:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
356:    allocation.  For large problems you MUST preallocate memory or you
357:    will get TERRIBLE performance, see the users' manual chapter on matrices.

359:    By default, this format uses inodes (identical nodes) when possible, to
360:    improve numerical efficiency of matrix-vector products and solves. We
361:    search for consecutive rows with the same nonzero structure, thereby
362:    reusing matrix information to achieve increased efficiency.

364:    Level: intermediate

366: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
367: @*/
368: PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
369: {
371:   PetscMPIInt    size;

374:   MatCreate(comm,A);
375:   MatSetSizes(*A,m,n,M,N);
376:   MPI_Comm_size(comm,&size);
377:   if (size > 1) {
378:     MatSetType(*A,MATMPIAIJCUSPARSE);
379:     MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
380:   } else {
381:     MatSetType(*A,MATSEQAIJCUSPARSE);
382:     MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
383:   }
384:   return(0);
385: }

387: /*MC
388:    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

390:    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
391:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
392:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

394:    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
395:    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
396:    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
397:    for communicators controlling multiple processes.  It is recommended that you call both of
398:    the above preallocation routines for simplicity.

400:    Options Database Keys:
401: +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
402: .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
403: .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
404: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

406:   Level: beginner

408:  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
409: M
410: M*/

412: // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
413: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
414: {
415: #if defined(PETSC_USE_CTABLE)
416:   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
417: #else
418:   PetscSplitCSRDataStructure **p_d_mat;
419:   PetscMPIInt                size,rank;
420:   MPI_Comm                   comm;
421:   PetscErrorCode             ierr;
422:   int                        *ai,*bi,*aj,*bj;
423:   PetscScalar                *aa,*ba;

426:   PetscObjectGetComm((PetscObject)A,&comm);
427:   MPI_Comm_size(comm,&size);
428:   MPI_Comm_rank(comm,&rank);
429:   if (A->factortype == MAT_FACTOR_NONE) {
430:     CsrMatrix *matrixA,*matrixB=NULL;
431:     if (size == 1) {
432:       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
433:       p_d_mat = &cusparsestruct->deviceMat;
434:       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
435:       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
436:         matrixA = (CsrMatrix*)matstruct->mat;
437:         bi = bj = NULL; ba = NULL;
438:       } else {
439:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
440:       }
441:     } else {
442:       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
443:       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
444:       p_d_mat = &spptr->deviceMat;
445:       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
446:       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
447:       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
448:       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
449:       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
450:         if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
451:         matrixA = (CsrMatrix*)matstructA->mat;
452:         matrixB = (CsrMatrix*)matstructB->mat;
453:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
454:         bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
455:         ba = thrust::raw_pointer_cast(matrixB->values->data());
456:         if (rank==-1) {
457:           for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
458:             std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
459:           for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
460:             std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
461:           for(unsigned int i = 0; i < matrixB->values->size(); i++)
462:             std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
463:         }
464:       } else {
465:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
466:       }
467:     }
468:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
469:     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
470:     aa = thrust::raw_pointer_cast(matrixA->values->data());
471:   } else {
472:     *B = NULL;
473:     return(0);
474:   }
475:   // act like MatSetValues because not called on host
476:   if (A->assembled) {
477:     if (A->was_assembled) {
478:       PetscInfo(A,"Assemble more than once already\n");
479:     }
480:     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
481:   } else {
482:     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
483:   }
484:   if (!*p_d_mat) {
485:     cudaError_t                 err;
486:     PetscSplitCSRDataStructure  *d_mat, h_mat;
487:     Mat_SeqAIJ                  *jaca;
488:     PetscInt                    n = A->rmap->n, nnz;
489:     // create and copy
490:     PetscInfo(A,"Create device matrix\n");
491:     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
492:     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
493:     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
494:     if (size == 1) {
495:       jaca = (Mat_SeqAIJ*)A->data;
496:       h_mat.nonzerostate = A->nonzerostate;
497:       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
498:       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
499:       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
500:       h_mat.offdiag.a = NULL;
501:       h_mat.seq = PETSC_TRUE;
502:     } else {
503:       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
504:       Mat_SeqAIJ  *jacb;
505:       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
506:       jaca = (Mat_SeqAIJ*)aij->A->data;
507:       jacb = (Mat_SeqAIJ*)aij->B->data;
508:       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
509:       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
510:       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
511:       // create colmap - this is ussually done (lazy) in MatSetValues
512:       aij->donotstash = PETSC_TRUE;
513:       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
514:       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
515:       PetscCalloc1(A->cmap->N+1,&aij->colmap);
516:       aij->colmap[A->cmap->N] = -9;
517:       PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));
518:       {
519:         PetscInt ii;
520:         for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
521:       }
522:       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
523:       // allocate B copy data
524:       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
525:       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
526:       nnz = jacb->i[n];

528:       if (jacb->compressedrow.use) {
529:         err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
530:         err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
531:       } else {
532:         h_mat.offdiag.i = bi;
533:       }
534:       h_mat.offdiag.j = bj;
535:       h_mat.offdiag.a = ba;

537:       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
538:       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
539:       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
540:       h_mat.offdiag.n = n;
541:     }
542:     // allocate A copy data
543:     nnz = jaca->i[n];
544:     h_mat.diag.n = n;
545:     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
546:     MPI_Comm_rank(comm,&h_mat.rank);
547:     if (jaca->compressedrow.use) {
548:       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
549:       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
550:     } else {
551:       h_mat.diag.i = ai;
552:     }
553:     h_mat.diag.j = aj;
554:     h_mat.diag.a = aa;
555:     // copy pointers and metdata to device
556:     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
557:     PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);
558:   } else {
559:     *B = *p_d_mat;
560:   }
561:   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
562:   return(0);
563: #endif
564: }