Actual source code: densecuda.cu

petsc-master 2020-10-23
Report Typos and Errors
  1: /*
  2:      Defines the matrix operations for sequential dense with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsccublas.h>

  9: /* cublas definitions are here */
 10: #include <petsc/private/cudavecimpl.h>

 12: #if defined(PETSC_USE_COMPLEX)
 13: #if defined(PETSC_USE_REAL_SINGLE)
 14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
 17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
 20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
 21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
 24: #else /* complex double */
 25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
 28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
 31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
 32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
 35: #endif
 36: #else /* real single */
 37: #if defined(PETSC_USE_REAL_SINGLE)
 38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
 42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
 43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
 45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 48: #else /* real double */
 49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
 53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
 54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
 56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 59: #endif
 60: #endif

 62: typedef struct {
 63:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 64:   PetscBool   user_alloc;
 65:   PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
 66:   PetscBool   unplaced_user_alloc;
 67:   /* factorization support */
 68:   int         *d_fact_ipiv; /* device pivots */
 69:   PetscScalar *d_fact_work; /* device workspace */
 70:   int         fact_lwork;
 71:   int         *d_fact_info; /* device info */
 72:   /* workspace */
 73:   Vec         workvec;
 74: } Mat_SeqDenseCUDA;

 76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
 77: {
 78:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
 79:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
 80:   PetscErrorCode   ierr;
 81:   PetscBool        iscuda;
 82:   cudaError_t      cerr;

 85:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
 86:   if (!iscuda) return(0);
 87:   /* it may happen CPU preallocation has not been performed */
 88:   PetscLayoutSetUp(A->rmap);
 89:   PetscLayoutSetUp(A->cmap);
 90:   if (cA->lda <= 0) cA->lda = A->rmap->n;
 91:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
 92:   if (!d_data) { /* petsc-allocated storage */
 93:     PetscIntMultError(cA->lda,A->cmap->n,NULL);
 94:     cerr = cudaMalloc((void**)&dA->d_v,cA->lda*A->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
 95:     dA->user_alloc = PETSC_FALSE;
 96:   } else { /* user-allocated storage */
 97:     dA->d_v        = d_data;
 98:     dA->user_alloc = PETSC_TRUE;
 99:     A->offloadmask = PETSC_OFFLOAD_GPU;
100:   }
101:   A->preallocated = PETSC_TRUE;
102:   A->assembled = PETSC_TRUE;
103:   return(0);
104: }

106: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
107: {
108:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
109:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
110:   PetscErrorCode   ierr;
111:   cudaError_t      cerr;

115:   PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
116:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
117:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
118:       MatSeqDenseSetPreallocation(A,NULL);
119:     }
120:     PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
121:     if (cA->lda > A->rmap->n) {
122:       PetscInt j,m = A->rmap->n;

124:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
125:         cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
126:       }
127:     } else {
128:       cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129:     }
130:     PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
131:     PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

133:     A->offloadmask = PETSC_OFFLOAD_BOTH;
134:   }
135:   return(0);
136: }

138: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
139: {
140:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
141:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
142:   PetscBool        copy;
143:   PetscErrorCode   ierr;
144:   cudaError_t      cerr;

148:   if (A->boundtocpu) return(0);
149:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
150:   PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
151:   if (copy) {
152:     if (!dA->d_v) { /* Allocate GPU memory if not present */
153:       MatSeqDenseCUDASetPreallocation(A,NULL);
154:     }
155:     PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
156:     if (cA->lda > A->rmap->n) {
157:       PetscInt j,m = A->rmap->n;

159:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
160:         cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
161:       }
162:     } else {
163:       cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164:     }
165:     PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
166:     PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

168:     A->offloadmask = PETSC_OFFLOAD_BOTH;
169:   }
170:   return(0);
171: }

173: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
174: {
175:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
176:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
177:   PetscErrorCode   ierr;

180:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
181:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
182:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
183:   if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
184:   dA->unplacedarray = dA->d_v;
185:   dA->unplaced_user_alloc = dA->user_alloc;
186:   dA->d_v = (PetscScalar*)a;
187:   dA->user_alloc = PETSC_TRUE;
188:   return(0);
189: }

191: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
192: {
193:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
194:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
195:   PetscErrorCode   ierr;

198:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
199:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
200:   if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
201:   dA->d_v = dA->unplacedarray;
202:   dA->user_alloc = dA->unplaced_user_alloc;
203:   dA->unplacedarray = NULL;
204:   return(0);
205: }

207: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
208: {
209:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
210:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
211:   cudaError_t      cerr;

214:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
215:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
216:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
217:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
218:   dA->d_v = (PetscScalar*)a;
219:   dA->user_alloc = PETSC_FALSE;
220:   return(0);
221: }

223: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
224: {
225:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
226:   PetscErrorCode   ierr;

229:   if (!dA->d_v) {
230:     MatSeqDenseCUDASetPreallocation(A,NULL);
231:   }
232:   *a = dA->d_v;
233:   return(0);
234: }

236: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
237: {
239:   *a = NULL;
240:   return(0);
241: }

243: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
244: {
245:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
246:   PetscErrorCode   ierr;

249:   MatSeqDenseCUDACopyToGPU(A);
250:   *a   = dA->d_v;
251:   return(0);
252: }

254: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
255: {
257:   *a = NULL;
258:   return(0);
259: }

261: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
262: {
263:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
264:   PetscErrorCode   ierr;

267:   MatSeqDenseCUDACopyToGPU(A);
268:   *a   = dA->d_v;
269:   return(0);
270: }

272: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
273: {
275:   *a = NULL;
276:   return(0);
277: }

279: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
280: {
281: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
282:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
283:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
284:   PetscScalar        *da;
285:   PetscErrorCode     ierr;
286:   cudaError_t        ccer;
287:   cusolverStatus_t   cerr;
288:   cusolverDnHandle_t handle;
289:   int                n,lda;
290: #if defined(PETSC_USE_DEBUG)
291:   int                info;
292: #endif

295:   if (!A->rmap->n || !A->cmap->n) return(0);
296:   PetscCUSOLVERDnGetHandle(&handle);
297:   PetscMPIIntCast(A->cmap->n,&n);
298:   PetscMPIIntCast(a->lda,&lda);
299:   if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
300:   else if (A->factortype == MAT_FACTOR_CHOLESKY) {
301:     if (!dA->d_fact_ipiv) { /* spd */
302:       int il;

304:       MatDenseCUDAGetArray(A,&da);
305:       cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
306:       if (il > dA->fact_lwork) {
307:         dA->fact_lwork = il;

309:         ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
310:         ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
311:       }
312:       PetscLogGpuTimeBegin();
313:       cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
314:       ccer = WaitForCUDA();CHKERRCUDA(ccer);
315:       PetscLogGpuTimeEnd();
316:       MatDenseCUDARestoreArray(A,&da);
317:       /* TODO (write cuda kernel) */
318:       MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
319:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
320:   }
321: #if defined(PETSC_USE_DEBUG)
322:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
323:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
324:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
325: #endif
326:   PetscLogGpuFlops(1.0*n*n*n/3.0);
327:   A->ops->solve          = NULL;
328:   A->ops->solvetranspose = NULL;
329:   A->ops->matsolve       = NULL;
330:   A->factortype          = MAT_FACTOR_NONE;

332:   PetscFree(A->solvertype);
333:   return(0);
334: #else
335:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
336: #endif
337: }

339: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
340: {
341:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
342:   Mat_SeqDense       *x = (Mat_SeqDense*)X->data;
343:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
344:   const PetscScalar  *da;
345:   PetscScalar        *dx;
346:   cusolverDnHandle_t handle;
347:   PetscBool          iscuda;
348:   int                nrhs,n,lda,ldx;
349: #if defined(PETSC_USE_DEBUG)
350:   int                info;
351: #endif
352:   cudaError_t        ccer;
353:   cusolverStatus_t   cerr;
354:   PetscErrorCode     ierr;

357:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
358:   if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
359:   PetscObjectTypeCompareAny((PetscObject)X,&iscuda,VECSEQCUDA,VECMPICUDA,"");
360:   if (X != B) {
361:     MatCopy(B,X,SAME_NONZERO_PATTERN);
362:   }
363:   MatDenseCUDAGetArrayRead(A,&da);
364:   /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
365:   PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
366:   if (!iscuda) {
367:     MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
368:   }
369:   MatDenseCUDAGetArray(X,&dx);
370:   PetscMPIIntCast(A->rmap->n,&n);
371:   PetscMPIIntCast(X->cmap->n,&nrhs);
372:   PetscMPIIntCast(a->lda,&lda);
373:   PetscMPIIntCast(x->lda,&ldx);
374:   PetscCUSOLVERDnGetHandle(&handle);
375:   PetscLogGpuTimeBegin();
376:   if (A->factortype == MAT_FACTOR_LU) {
377:     PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
378:     cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
379:   } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
380:     PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
381:     if (!dA->d_fact_ipiv) { /* spd */
382:       /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
383:       cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
384:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
385:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
386:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
387:   PetscLogGpuTimeEnd();
388:   MatDenseCUDARestoreArrayRead(A,&da);
389:   MatDenseCUDARestoreArray(X,&dx);
390:   if (!iscuda) {
391:     MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
392:   }
393: #if defined(PETSC_USE_DEBUG)
394:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
395:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
396:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
397: #endif
398:   PetscLogGpuFlops(nrhs*(2.0*n*n - n));
399:   return(0);
400: }

402: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
403: {
404:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
405:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
406:   const PetscScalar  *da;
407:   PetscScalar        *y;
408:   cusolverDnHandle_t handle;
409:   int                one = 1,n,lda;
410: #if defined(PETSC_USE_DEBUG)
411:   int                info;
412: #endif
413:   cudaError_t        ccer;
414:   cusolverStatus_t   cerr;
415:   PetscBool          iscuda;
416:   PetscErrorCode     ierr;

419:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
420:   if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
421:   PetscMPIIntCast(A->rmap->n,&n);
422:   /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
423:   PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
424:   if (iscuda) {
425:     VecCopy(xx,yy);
426:     VecCUDAGetArray(yy,&y);
427:   } else {
428:     if (!dA->workvec) {
429:       MatCreateVecs(A,&dA->workvec,NULL);
430:     }
431:     VecCopy(xx,dA->workvec);
432:     VecCUDAGetArray(dA->workvec,&y);
433:   }
434:   MatDenseCUDAGetArrayRead(A,&da);
435:   PetscMPIIntCast(a->lda,&lda);
436:   PetscCUSOLVERDnGetHandle(&handle);
437:   PetscLogGpuTimeBegin();
438:   if (A->factortype == MAT_FACTOR_LU) {
439:     PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
440:     cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
441:   } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
442:     PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
443:     if (!dA->d_fact_ipiv) { /* spd */
444:       /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
445:       cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
446:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
447:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
448:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
449:   PetscLogGpuTimeEnd();
450:   if (iscuda) {
451:     VecCUDARestoreArray(yy,&y);
452:   } else {
453:     VecCUDARestoreArray(dA->workvec,&y);
454:     VecCopy(dA->workvec,yy);
455:   }
456:   MatDenseCUDARestoreArrayRead(A,&da);
457: #if defined(PETSC_USE_DEBUG)
458:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
459:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
460:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
461: #endif
462:   PetscLogGpuFlops(2.0*n*n - n);
463:   return(0);
464: }

466: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
467: {
468:   PetscErrorCode     ierr;

471:   MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
472:   return(0);
473: }

475: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
476: {
477:   PetscErrorCode     ierr;

480:   MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
481:   return(0);
482: }

484: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
485: {
486:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
487:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
488:   PetscScalar        *da;
489:   int                m,n,lda;
490: #if defined(PETSC_USE_DEBUG)
491:   int                info;
492: #endif
493:   cusolverStatus_t   cerr;
494:   cusolverDnHandle_t handle;
495:   cudaError_t        ccer;
496:   PetscErrorCode     ierr;

499:   if (!A->rmap->n || !A->cmap->n) return(0);
500:   PetscCUSOLVERDnGetHandle(&handle);
501:   MatDenseCUDAGetArray(A,&da);
502:   PetscMPIIntCast(A->cmap->n,&n);
503:   PetscMPIIntCast(A->rmap->n,&m);
504:   PetscMPIIntCast(a->lda,&lda);
505:   PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
506:   if (!dA->d_fact_ipiv) {
507:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
508:   }
509:   if (!dA->fact_lwork) {
510:     cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
511:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
512:   }
513:   if (!dA->d_fact_info) {
514:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
515:   }
516:   PetscLogGpuTimeBegin();
517:   cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
518:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
519:   PetscLogGpuTimeEnd();
520:   MatDenseCUDARestoreArray(A,&da);
521: #if defined(PETSC_USE_DEBUG)
522:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
523:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
524:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
525: #endif
526:   A->factortype = MAT_FACTOR_LU;
527:   PetscLogGpuFlops(2.0*n*n*m/3.0);

529:   A->ops->solve          = MatSolve_SeqDenseCUDA;
530:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
531:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

533:   PetscFree(A->solvertype);
534:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
535:   return(0);
536: }

538: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
539: {
540:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
541:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
542:   PetscScalar        *da;
543:   int                n,lda;
544: #if defined(PETSC_USE_DEBUG)
545:   int                info;
546: #endif
547:   cusolverStatus_t   cerr;
548:   cusolverDnHandle_t handle;
549:   cudaError_t        ccer;
550:   PetscErrorCode     ierr;

553:   if (!A->rmap->n || !A->cmap->n) return(0);
554:   PetscCUSOLVERDnGetHandle(&handle);
555:   PetscMPIIntCast(A->rmap->n,&n);
556:   PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
557:   if (A->spd) {
558:     MatDenseCUDAGetArray(A,&da);
559:     PetscMPIIntCast(a->lda,&lda);
560:     if (!dA->fact_lwork) {
561:       cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
562:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
563:     }
564:     if (!dA->d_fact_info) {
565:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
566:     }
567:     PetscLogGpuTimeBegin();
568:     cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
569:     ccer = WaitForCUDA();CHKERRCUDA(ccer);
570:     PetscLogGpuTimeEnd();

572:     MatDenseCUDARestoreArray(A,&da);
573: #if defined(PETSC_USE_DEBUG)
574:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
575:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
576:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
577: #endif
578:     A->factortype = MAT_FACTOR_CHOLESKY;
579:     PetscLogGpuFlops(1.0*n*n*n/3.0);
580:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
581: #if 0
582:     /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
583:        The code below should work, and it can be activated when *sytrs routines will be available */
584:     if (!dA->d_fact_ipiv) {
585:       ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
586:     }
587:     if (!dA->fact_lwork) {
588:       cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
589:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
590:     }
591:     if (!dA->d_fact_info) {
592:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
593:     }
594:     PetscLogGpuTimeBegin();
595:     cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
596:     PetscLogGpuTimeEnd();
597: #endif

599:   A->ops->solve          = MatSolve_SeqDenseCUDA;
600:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
601:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

603:   PetscFree(A->solvertype);
604:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
605:   return(0);
606: }

608: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
609: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
610: {
611:   const PetscScalar *da,*db;
612:   PetscScalar       *dc;
613:   PetscScalar       one=1.0,zero=0.0;
614:   int               m,n,k;
615:   PetscInt          alda,blda,clda;
616:   PetscErrorCode    ierr;
617:   cublasHandle_t    cublasv2handle;
618:   PetscBool         Aiscuda,Biscuda;
619:   cublasStatus_t    berr;
620:   cudaError_t       cerr;

623:   /* we may end up with SEQDENSE as one of the arguments */
624:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
625:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
626:   if (!Aiscuda) {
627:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
628:   }
629:   if (!Biscuda) {
630:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
631:   }
632:   PetscMPIIntCast(C->rmap->n,&m);
633:   PetscMPIIntCast(C->cmap->n,&n);
634:   if (tA) {
635:     PetscMPIIntCast(A->rmap->n,&k);
636:   } else {
637:     PetscMPIIntCast(A->cmap->n,&k);
638:   }
639:   if (!m || !n || !k) return(0);
640:   PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
641:   MatDenseCUDAGetArrayRead(A,&da);
642:   MatDenseCUDAGetArrayRead(B,&db);
643:   MatDenseCUDAGetArrayWrite(C,&dc);
644:   MatDenseGetLDA(A,&alda);
645:   MatDenseGetLDA(B,&blda);
646:   MatDenseGetLDA(C,&clda);
647:   PetscCUBLASGetHandle(&cublasv2handle);
648:   PetscLogGpuTimeBegin();
649:   berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
650:                      m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
651:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
652:   PetscLogGpuTimeEnd();
653:   PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
654:   MatDenseCUDARestoreArrayRead(A,&da);
655:   MatDenseCUDARestoreArrayRead(B,&db);
656:   MatDenseCUDARestoreArrayWrite(C,&dc);
657:   if (!Aiscuda) {
658:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
659:   }
660:   if (!Biscuda) {
661:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
662:   }
663:   return(0);
664: }

666: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
667: {

671:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
672:   return(0);
673: }

675: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
676: {

680:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
681:   return(0);
682: }

684: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
685: {

689:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
690:   return(0);
691: }

693: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
694: {

698:   MatProductSetFromOptions_SeqDense(C);
699:   return(0);
700: }

702: /* zz = op(A)*xx + yy
703:    if yy == NULL, only MatMult */
704: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
705: {
706:   Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
707:   const PetscScalar *xarray,*da;
708:   PetscScalar       *zarray;
709:   PetscScalar       one=1.0,zero=0.0;
710:   int               m, n, lda; /* Use PetscMPIInt as it is typedef'ed to int */
711:   cublasHandle_t    cublasv2handle;
712:   cublasStatus_t    berr;
713:   PetscErrorCode    ierr;

716:   if (yy && yy != zz) { /* mult add */
717:     VecCopy_SeqCUDA(yy,zz);
718:   }
719:   if (!A->rmap->n || !A->cmap->n) {
720:     if (!yy) { /* mult only */
721:       VecSet_SeqCUDA(zz,0.0);
722:     }
723:     return(0);
724:   }
725:   PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
726:   PetscMPIIntCast(A->rmap->n,&m);
727:   PetscMPIIntCast(A->cmap->n,&n);
728:   PetscMPIIntCast(mat->lda,&lda);
729:   PetscCUBLASGetHandle(&cublasv2handle);
730:   MatDenseCUDAGetArrayRead(A,&da);
731:   VecCUDAGetArrayRead(xx,&xarray);
732:   VecCUDAGetArray(zz,&zarray);
733:   PetscLogGpuTimeBegin();
734:   berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
735:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
736:   PetscLogGpuTimeEnd();
737:   PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
738:   VecCUDARestoreArrayRead(xx,&xarray);
739:   VecCUDARestoreArray(zz,&zarray);
740:   MatDenseCUDARestoreArrayRead(A,&da);
741:   return(0);
742: }

744: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
745: {

749:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
750:   return(0);
751: }

753: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
754: {

758:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
759:   return(0);
760: }

762: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
763: {

767:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
768:   return(0);
769: }

771: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
772: {

776:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
777:   return(0);
778: }

780: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
781: {
782:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

786:   MatSeqDenseCUDACopyFromGPU(A);
787:   *array = mat->v;
788:   return(0);
789: }

791: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
792: {
793:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

797:   if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
798:     MatSeqDenseSetPreallocation(A,NULL);
799:   }
800:   *array = mat->v;
801:   A->offloadmask = PETSC_OFFLOAD_CPU;
802:   return(0);
803: }

805: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
806: {
807:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

811:   MatSeqDenseCUDACopyFromGPU(A);
812:   *array = mat->v;
813:   A->offloadmask = PETSC_OFFLOAD_CPU;
814:   return(0);
815: }

817: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
818: {
819:   Mat_SeqDense   *y = (Mat_SeqDense*)Y->data;
820:   PetscScalar    *dy;
821:   int            j,N,m,lday,one = 1;
822:   cublasHandle_t cublasv2handle;
823:   cublasStatus_t berr;
825:   cudaError_t    cerr;

828:   PetscCUBLASGetHandle(&cublasv2handle);
829:   MatDenseCUDAGetArray(Y,&dy);
830:   PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
831:   PetscMPIIntCast(Y->rmap->n,&m);
832:   PetscMPIIntCast(y->lda,&lday);
833:   PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
834:   PetscLogGpuTimeBegin();
835:   if (lday>m) {
836:     for (j=0; j<Y->cmap->n; j++) {
837:       berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
838:     }
839:   } else {
840:     berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
841:   }
842:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
843:   PetscLogGpuTimeEnd();
844:   PetscLogGpuFlops(N);
845:   MatDenseCUDARestoreArray(Y,&dy);
846:   return(0);
847: }

849: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
850: {
851:   Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
852:   Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
853:   const PetscScalar *dx;
854:   PetscScalar       *dy;
855:   int               j,N,m,ldax,lday,one = 1;
856:   cublasHandle_t    cublasv2handle;
857:   cublasStatus_t    berr;
858:   PetscErrorCode    ierr;
859:   cudaError_t       cerr;

862:   if (!X->rmap->n || !X->cmap->n) return(0);
863:   PetscCUBLASGetHandle(&cublasv2handle);
864:   MatDenseCUDAGetArrayRead(X,&dx);
865:   if (alpha != 0.0) {
866:     MatDenseCUDAGetArray(Y,&dy);
867:   } else {
868:     MatDenseCUDAGetArrayWrite(Y,&dy);
869:   }
870:   PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
871:   PetscMPIIntCast(X->rmap->n,&m);
872:   PetscMPIIntCast(x->lda,&ldax);
873:   PetscMPIIntCast(y->lda,&lday);
874:   PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
875:   PetscLogGpuTimeBegin();
876:   if (ldax>m || lday>m) {
877:     for (j=0; j<X->cmap->n; j++) {
878:       berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
879:     }
880:   } else {
881:     berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
882:   }
883:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
884:   PetscLogGpuTimeEnd();
885:   PetscLogGpuFlops(PetscMax(2.*N-1,0));
886:   MatDenseCUDARestoreArrayRead(X,&dx);
887:   if (alpha != 0.0) {
888:     MatDenseCUDARestoreArray(Y,&dy);
889:   } else {
890:     MatDenseCUDARestoreArrayWrite(Y,&dy);
891:   }
892:   return(0);
893: }

895: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
896: {
897:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
898:   cudaError_t      cerr;
899:   PetscErrorCode   ierr;

902:   if (dA) {
903:     if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
904:     if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
905:     cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
906:     cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
907:     cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
908:     VecDestroy(&dA->workvec);
909:   }
910:   PetscFree(A->spptr);
911:   return(0);
912: }

914: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
915: {
916:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

920:   /* prevent to copy back data if we own the data pointer */
921:   if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
922:   MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
923:   MatDestroy_SeqDense(A);
924:   return(0);
925: }

927: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
928: {

932:   MatCreate(PetscObjectComm((PetscObject)A),B);
933:   MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
934:   MatSetType(*B,((PetscObject)A)->type_name);
935:   MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
936:   if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
937:     Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
938:     const PetscScalar *da;
939:     PetscScalar       *db;
940:     cudaError_t       cerr;
941:     PetscInt          ldb;

943:     MatDenseCUDAGetArrayRead(A,&da);
944:     MatDenseCUDAGetArrayWrite(*B,&db);
945:     MatDenseGetLDA(*B,&ldb);
946:     if (a->lda > A->rmap->n || ldb > A->rmap->n) {
947:       PetscInt j,m = A->rmap->n;

949:       for (j=0; j<A->cmap->n; j++) { /* it can be done better */
950:         cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
951:       }
952:     } else {
953:       cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
954:     }
955:     MatDenseCUDARestoreArrayRead(A,&da);
956:     MatDenseCUDARestoreArrayWrite(*B,&db);
957:     (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
958:   }
959:   return(0);
960: }

962: #include <petsc/private/vecimpl.h>

964: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
965: {
966:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
967:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
968:   PetscErrorCode   ierr;
969:   PetscScalar      *x;
970:   PetscBool        viscuda;
971:   cudaError_t      cerr;

974:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
975:   if (viscuda && !v->boundtocpu) { /* update device data */
976:     VecCUDAGetArrayWrite(v,&x);
977:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
978:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToHost);CHKERRCUDA(cerr);
979:     } else {
980:       cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
981:     }
982:     VecCUDARestoreArrayWrite(v,&x);
983:   } else { /* update host data */
984:     VecGetArrayWrite(v,&x);
985:     if (A->offloadmask & PETSC_OFFLOAD_CPU) {
986:       PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
987:     } else {
988:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
989:     }
990:     VecRestoreArrayWrite(v,&x);
991:   }
992:   return(0);
993: }

995: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
996: {

1000:   MatCreate(PetscObjectComm((PetscObject)A),fact);
1001:   MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1002:   MatSetType(*fact,MATSEQDENSECUDA);
1003:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU ) {
1004:     (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1005:   } else {
1006:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1007:   }
1008:   (*fact)->factortype = ftype;

1010:   PetscFree((*fact)->solvertype);
1011:   PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1012:   return(0);
1013: }

1015: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1016: {
1017:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1021:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1022:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1023:   if (!a->cvec) {
1024:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1025:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1026:   }
1027:   a->vecinuse = col + 1;
1028:   MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1029:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1030:   *v   = a->cvec;
1031:   return(0);
1032: }

1034: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1035: {
1036:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1040:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1041:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1042:   a->vecinuse = 0;
1043:   MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1044:   VecCUDAResetArray(a->cvec);
1045:   *v   = NULL;
1046:   return(0);
1047: }

1049: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1050: {
1051:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1055:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1056:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1057:   if (!a->cvec) {
1058:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1059:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1060:   }
1061:   a->vecinuse = col + 1;
1062:   MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1063:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1064:   VecLockReadPush(a->cvec);
1065:   *v   = a->cvec;
1066:   return(0);
1067: }

1069: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1070: {
1071:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1075:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1076:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1077:   a->vecinuse = 0;
1078:   MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1079:   VecLockReadPop(a->cvec);
1080:   VecCUDAResetArray(a->cvec);
1081:   *v   = NULL;
1082:   return(0);
1083: }

1085: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1086: {
1087:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1091:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1092:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1093:   if (!a->cvec) {
1094:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,NULL,&a->cvec);
1095:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1096:   }
1097:   a->vecinuse = col + 1;
1098:   MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1099:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1100:   *v   = a->cvec;
1101:   return(0);
1102: }

1104: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1105: {
1106:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1110:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1111:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1112:   a->vecinuse = 0;
1113:   MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1114:   VecCUDAResetArray(a->cvec);
1115:   *v   = NULL;
1116:   return(0);
1117: }

1119: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1120: {
1121:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1122:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1123:   PetscErrorCode   ierr;

1126:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1127:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1128:   if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1129:     MatDestroy(&a->cmat);
1130:   }
1131:   MatSeqDenseCUDACopyToGPU(A);
1132:   if (!a->cmat) {
1133:     MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1134:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1135:   } else {
1136:     MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1137:   }
1138:   MatDenseSetLDA(a->cmat,a->lda);
1139:   if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1140:   a->cmat->offloadmask = A->offloadmask;
1141:   a->matinuse = cbegin + 1;
1142:   *v = a->cmat;
1143:   return(0);
1144: }

1146: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1147: {
1148:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1152:   if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1153:   if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1154:   if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1155:   a->matinuse = 0;
1156:   A->offloadmask = PETSC_OFFLOAD_GPU;
1157:   MatDenseCUDAResetArray(a->cmat);
1158:   MatDenseResetArray(a->cmat);
1159:   *v = NULL;
1160:   return(0);
1161: }

1163: static PetscErrorCode  MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1164: {
1165:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
1166:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1167:   PetscBool        data;

1170:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1171:   if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1172:   if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1173:   cA->lda = lda;
1174:   return(0);
1175: }

1177: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1178: {
1179:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1183:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1184:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1185:   A->boundtocpu = flg;
1186:   if (!flg) {
1187:     PetscBool iscuda;

1189:     PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1190:     if (!iscuda) {
1191:       VecDestroy(&a->cvec);
1192:     }
1193:     PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1194:     if (!iscuda) {
1195:       MatDestroy(&a->cmat);
1196:     }
1197:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1198:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1199:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1200:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1201:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1202:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1203:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1204:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1205:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1206:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1207:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1208:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);

1210:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1211:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1212:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1213:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1214:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1215:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1216:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1217:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1218:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1219:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1220:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1221:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1222:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1223:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1224:   } else {
1225:     /* make sure we have an up-to-date copy on the CPU */
1226:     MatSeqDenseCUDACopyFromGPU(A);
1227:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1228:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1229:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1230:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1231:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1232:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1233:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1234:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1235:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1236:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1237:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1238:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);

1240:     A->ops->duplicate               = MatDuplicate_SeqDense;
1241:     A->ops->mult                    = MatMult_SeqDense;
1242:     A->ops->multadd                 = MatMultAdd_SeqDense;
1243:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1244:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1245:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1246:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1247:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1248:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1249:     A->ops->axpy                    = MatAXPY_SeqDense;
1250:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1251:     A->ops->lufactor                = MatLUFactor_SeqDense;
1252:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1253:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1254:     A->ops->scale                   = MatScale_SeqDense;
1255:   }
1256:   if (a->cmat) {
1257:     MatBindToCPU(a->cmat,flg);
1258:   }
1259:   return(0);
1260: }

1262: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1263: {
1264:   Mat              B;
1265:   PetscErrorCode   ierr;

1268:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1269:     /* TODO these cases should be optimized */
1270:     MatConvert_Basic(M,type,reuse,newmat);
1271:     return(0);
1272:   }

1274:   B    = *newmat;
1275:   MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1276:   MatReset_SeqDenseCUDA(B);
1277:   PetscFree(B->defaultvectype);
1278:   PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1279:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1280:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1281:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1282:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1283:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1284:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1285:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1286:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1287:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1288:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1289:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1290:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);

1292:   B->ops->bindtocpu = NULL;
1293:   B->ops->destroy = MatDestroy_SeqDense;
1294:   B->offloadmask = PETSC_OFFLOAD_CPU;
1295:   return(0);
1296: }

1298: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1299: {
1300:   Mat_SeqDenseCUDA *dB;
1301:   Mat              B;
1302:   PetscErrorCode   ierr;

1305:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1306:     /* TODO these cases should be optimized */
1307:     MatConvert_Basic(M,type,reuse,newmat);
1308:     return(0);
1309:   }

1311:   B    = *newmat;
1312:   PetscFree(B->defaultvectype);
1313:   PetscStrallocpy(VECCUDA,&B->defaultvectype);
1314:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1315:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",            MatConvert_SeqDenseCUDA_SeqDense);
1316:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",                        MatDenseCUDAGetArray_SeqDenseCUDA);
1317:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",                    MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1318:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",                   MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1319:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",                    MatDenseCUDARestoreArray_SeqDenseCUDA);
1320:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",                MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1321:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",               MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1322:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",                      MatDenseCUDAPlaceArray_SeqDenseCUDA);
1323:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",                      MatDenseCUDAResetArray_SeqDenseCUDA);
1324:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",                    MatDenseCUDAReplaceArray_SeqDenseCUDA);
1325:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);

1327:   PetscNewLog(B,&dB);
1328:   B->spptr = dB;

1330:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1332:   MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1333:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1334:   B->ops->destroy  = MatDestroy_SeqDenseCUDA;
1335:   return(0);
1336: }

1338: /*@C
1339:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1341:    Collective

1343:    Input Parameters:
1344: +  comm - MPI communicator
1345: .  m - number of rows
1346: .  n - number of columns
1347: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1348:    to control matrix memory allocation.

1350:    Output Parameter:
1351: .  A - the matrix

1353:    Notes:

1355:    Level: intermediate

1357: .seealso: MatCreate(), MatCreateSeqDense()
1358: @*/
1359: PetscErrorCode  MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1360: {
1362:   PetscMPIInt    size;

1365:   MPI_Comm_size(comm,&size);
1366:   if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1367:   MatCreate(comm,A);
1368:   MatSetSizes(*A,m,n,m,n);
1369:   MatSetType(*A,MATSEQDENSECUDA);
1370:   MatSeqDenseCUDASetPreallocation(*A,data);
1371:   return(0);
1372: }

1374: /*MC
1375:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1377:    Options Database Keys:
1378: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

1380:   Level: beginner
1381: M*/
1382: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1383: {

1387:   PetscCUDAInitializeCheck();
1388:   MatCreate_SeqDense(B);
1389:   MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1390:   return(0);
1391: }