Actual source code: densecuda.cu

petsc-main 2021-04-20
Report Typos and Errors
  1: /*
  2:      Defines the matrix operations for sequential dense with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsccublas.h>

  9: /* cublas definitions are here */
 10: #include <petsc/private/cudavecimpl.h>

/* Map precision-agnostic cusolverDnX* names onto the concrete cuSOLVER routine
   (C/Z/S/D prefix) selected by PETSc's scalar configuration.  In the complex
   branches the casts reinterpret PetscScalar* as the matching cuComplex /
   cuDoubleComplex pointer type expected by cuSOLVER. */
 12: #if defined(PETSC_USE_COMPLEX)
 13: #if defined(PETSC_USE_REAL_SINGLE)
 14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
 17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
 20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
 21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
 24: #else /* complex double */
 25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
 28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
 31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
 32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
 35: #endif
 36: #else /* real: single below, double in the inner #else */
 37: #if defined(PETSC_USE_REAL_SINGLE)
 38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
 42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
 43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
 45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 48: #else /* real double */
 49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
 53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
 54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
 56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 59: #endif
 60: #endif

/* GPU-side companion data hung off Mat->spptr for MATSEQDENSECUDA matrices. */
 62: typedef struct {
 63:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 64:   PetscBool   user_alloc; /* PETSC_TRUE when d_v is user-provided and must not be cudaFree'd by us */
 65:   PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
 66:   PetscBool   unplaced_user_alloc; /* ownership flag stashed together with unplacedarray */
 67:   /* factorization support */
 68:   int         *d_fact_ipiv; /* device pivots */
 69:   PetscScalar *d_fact_work; /* device workspace */
 70:   int         fact_lwork;   /* size (in scalars) of d_fact_work */
 71:   int         *d_fact_info; /* device info */
 72:   /* workspace */
 73:   Vec         workvec; /* scratch vector used by MatSolve when given non-CUDA vectors */
 74: } Mat_SeqDenseCUDA;

/* Allocate (or adopt) device storage for a MATSEQDENSECUDA matrix.
   If d_data is NULL a zeroed lda*n buffer is cudaMalloc'ed and owned by PETSc;
   otherwise d_data is adopted and flagged user-owned so it is never freed here.
   On success the matrix is marked preallocated/assembled with data on the GPU. */
 76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
 77: {
 78:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
 79:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
 80:   PetscErrorCode   ierr;
 81:   PetscBool        iscuda;
 82:   cudaError_t      cerr;

      /* no-op for any other matrix type */
 85:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
 86:   if (!iscuda) return(0);
 87:   /* it may happen CPU preallocation has not been performed */
 88:   PetscLayoutSetUp(A->rmap);
 89:   PetscLayoutSetUp(A->cmap);
 90:   if (cA->lda <= 0) cA->lda = A->rmap->n;
      /* release any buffer we previously owned before installing a new one */
 91:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
 92:   if (!d_data) { /* petsc-allocated storage */
 93:     size_t sz;
       /* guard against PetscInt overflow in lda*n before computing the byte size */
 94:     PetscIntMultError(cA->lda,A->cmap->n,NULL);
 95:     sz   = cA->lda*A->cmap->n*sizeof(PetscScalar);
 96:     cerr = cudaMalloc((void**)&dA->d_v,sz);CHKERRCUDA(cerr);
 97:     cerr = cudaMemset(dA->d_v,0,sz);CHKERRCUDA(cerr);
 98:     dA->user_alloc = PETSC_FALSE;
 99:   } else { /* user-allocated storage */
100:     dA->d_v        = d_data;
101:     dA->user_alloc = PETSC_TRUE;
102:   }
103:   A->offloadmask  = PETSC_OFFLOAD_GPU;
104:   A->preallocated = PETSC_TRUE;
105:   A->assembled    = PETSC_TRUE;
106:   return(0);
107: }

/* Mirror the device values back to the host buffer when the GPU copy is
   the only valid one; afterwards both copies are valid (PETSC_OFFLOAD_BOTH).
   A no-op when the host already holds valid data. */
109: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
110: {
111:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
112:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
113:   PetscErrorCode   ierr;
114:   cudaError_t      cerr;

118:   PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
119:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
120:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
121:       MatSeqDenseSetPreallocation(A,NULL);
122:     }
123:     PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
       /* lda > m means padded columns: copy each column's m valid rows separately */
124:     if (cA->lda > A->rmap->n) {
125:       PetscInt j,m = A->rmap->n;

127:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
128:         cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129:       }
130:     } else {
       /* tight storage: one contiguous transfer of the whole array */
131:       cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
132:     }
133:     PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
134:     PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

136:     A->offloadmask = PETSC_OFFLOAD_BOTH;
137:   }
138:   return(0);
139: }

/* Mirror host values to the device buffer when the host copy is the (only)
   valid one, allocating device storage on first use; afterwards both copies
   are valid.  A no-op for matrices bound to the CPU or already up to date. */
141: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
142: {
143:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
144:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
145:   PetscBool        copy;
146:   PetscErrorCode   ierr;
147:   cudaError_t      cerr;

151:   if (A->boundtocpu) return(0);
152:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
153:   PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
154:   if (copy) {
155:     if (!dA->d_v) { /* Allocate GPU memory if not present */
156:       MatSeqDenseCUDASetPreallocation(A,NULL);
157:     }
158:     PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
       /* lda > m means padded columns: copy each column's m valid rows separately */
159:     if (cA->lda > A->rmap->n) {
160:       PetscInt j,m = A->rmap->n;

162:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
163:         cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164:       }
165:     } else {
166:       cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
167:     }
168:     PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
169:     PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

171:     A->offloadmask = PETSC_OFFLOAD_BOTH;
172:   }
173:   return(0);
174: }

/* Device-to-device copy of the values of A into B (sizes must match).
   Falls back to MatCopy_Basic when the two matrices do not share this copy
   implementation.  When either matrix has a padded leading dimension the copy
   is done column by column; otherwise one contiguous memcpy suffices.

   Fix: the B array is obtained with MatDenseCUDAGetArrayWrite() and must be
   released with the matching MatDenseCUDARestoreArrayWrite() (the original
   called the unsuffixed MatDenseCUDARestoreArray()). */
static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
{
  Mat_SeqDense      *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
  PetscErrorCode    ierr;
  const PetscScalar *va;
  PetscScalar       *vb;
  PetscInt          lda1=a->lda,lda2=b->lda, m=A->rmap->n,n=A->cmap->n, j;
  cudaError_t       cerr;

  /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
  if (A->ops->copy != B->ops->copy) {
    MatCopy_Basic(A,B,str);
    return(0);
  }
  if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
  MatDenseCUDAGetArrayRead(A,&va);
  MatDenseCUDAGetArrayWrite(B,&vb);
  PetscLogGpuTimeBegin();
  if (lda1>m || lda2>m) {
    /* padded columns: copy only the m valid rows of each column */
    for (j=0; j<n; j++) {
      cerr = cudaMemcpy(vb+j*lda2,va+j*lda1,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  /* pair with MatDenseCUDAGetArrayWrite() above */
  MatDenseCUDARestoreArrayWrite(B,&vb);
  MatDenseCUDARestoreArrayRead(A,&va);
  return(0);
}

/* Temporarily replace the device array with a user-supplied pointer, stashing
   the current array (and its ownership flag) so MatDenseCUDAResetArray() can
   restore it.  Only one placement may be active at a time. */
209: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
210: {
211:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
212:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
213:   PetscErrorCode   ierr;

216:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
217:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
218:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
      /* make sure the device copy is current before swapping it out */
219:   if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
220:   dA->unplacedarray = dA->d_v;
221:   dA->unplaced_user_alloc = dA->user_alloc;
222:   dA->d_v = (PetscScalar*)a;
      /* the placed array is caller-owned; never cudaFree it */
223:   dA->user_alloc = PETSC_TRUE;
224:   return(0);
225: }

/* Undo MatDenseCUDAPlaceArray(): restore the stashed device array and its
   ownership flag, dropping the reference to the placed user array. */
227: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
228: {
229:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
230:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
231:   PetscErrorCode   ierr;

234:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
235:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
      /* flush host values into the (still placed) device array before swapping back */
236:   if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
237:   dA->d_v = dA->unplacedarray;
238:   dA->user_alloc = dA->unplaced_user_alloc;
239:   dA->unplacedarray = NULL;
240:   return(0);
241: }

/* Permanently replace the device array with a user-supplied pointer, freeing
   the previous PETSc-owned buffer.  Note user_alloc is set to PETSC_FALSE:
   responsibility for freeing the new array passes to the matrix. */
243: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
244: {
245:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
246:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
247:   cudaError_t      cerr;

250:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
251:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
252:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
253:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
254:   dA->d_v = (PetscScalar*)a;
255:   dA->user_alloc = PETSC_FALSE;
256:   return(0);
257: }

/* Hand out the raw device array for write-only access, lazily allocating
   device storage the first time it is requested.  No host->device copy is
   performed since the caller promises to overwrite the contents. */
static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  if (!dA->d_v) { MatSeqDenseCUDASetPreallocation(A,NULL); }
  *a = dA->d_v;
  return(0);
}

/* Invalidate the caller's write pointer obtained from MatDenseCUDAGetArrayWrite(). */
272: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
273: {
275:   *a = NULL;
276:   return(0);
277: }

/* Hand out the raw device array for read-only access, first making sure the
   device copy is up to date with any host-side modifications. */
static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a = dA->d_v;
  return(0);
}

/* Invalidate the caller's read pointer obtained from MatDenseCUDAGetArrayRead(). */
290: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
291: {
293:   *a = NULL;
294:   return(0);
295: }

/* Hand out the raw device array for read-write access; the device copy is
   refreshed from the host first so reads observe current values. */
static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a = dA->d_v;
  return(0);
}

/* Invalidate the caller's read-write pointer obtained from MatDenseCUDAGetArray(). */
308: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
309: {
311:   *a = NULL;
312:   return(0);
313: }

/* Overwrite A in place with its inverse computed from an existing Cholesky
   factorization via cusolverDn*potri (requires CUDA >= 10.1).  LU-based and
   indefinite (sytri) inversion are not implemented.  On success the matrix
   reverts to MAT_FACTOR_NONE and the factored-solve callbacks are cleared. */
315: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
316: {
317: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
318:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
319:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
320:   PetscScalar        *da;
321:   PetscErrorCode     ierr;
322:   cudaError_t        ccer;
323:   cusolverStatus_t   cerr;
324:   cusolverDnHandle_t handle;
325:   int                n,lda;
326: #if defined(PETSC_USE_DEBUG)
327:   int                info;
328: #endif

331:   if (!A->rmap->n || !A->cmap->n) return(0);
332:   PetscCUSOLVERDnGetHandle(&handle);
333:   PetscMPIIntCast(A->cmap->n,&n);
334:   PetscMPIIntCast(a->lda,&lda);
335:   if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
336:   else if (A->factortype == MAT_FACTOR_CHOLESKY) {
337:     if (!dA->d_fact_ipiv) { /* spd */
338:       int il;

        /* grow the cached device workspace if potri needs more than we have */
340:       MatDenseCUDAGetArray(A,&da);
341:       cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
342:       if (il > dA->fact_lwork) {
343:         dA->fact_lwork = il;

345:         ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
346:         ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
347:       }
348:       PetscLogGpuTimeBegin();
349:       cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
350:       ccer = WaitForCUDA();CHKERRCUDA(ccer);
351:       PetscLogGpuTimeEnd();
352:       MatDenseCUDARestoreArray(A,&da);
353:       /* TODO (write cuda kernel) */
        /* potri fills only the lower triangle; mirror it to the upper half on the host side */
354:       MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
355:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
356:   }
357: #if defined(PETSC_USE_DEBUG)
     /* NOTE(review): this reads d_fact_info even when neither branch above ran
        (factortype not LU/Cholesky) — presumably callers guarantee a factored
        matrix; confirm against call sites */
358:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
359:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
360:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
361: #endif
362:   PetscLogGpuFlops(1.0*n*n*n/3.0);
363:   A->ops->solve          = NULL;
364:   A->ops->solvetranspose = NULL;
365:   A->ops->matsolve       = NULL;
366:   A->factortype          = MAT_FACTOR_NONE;

368:   PetscFree(A->solvertype);
369:   return(0);
370: #else
371:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
372: #endif
373: }

/* Solve A * X = B for many right-hand sides using the device-resident LU or
   Cholesky factors of A (getrs/potrs).  X may arrive as a plain MATSEQDENSE;
   it is converted in place to MATSEQDENSECUDA for the solve and converted back
   afterwards.

   Fix: removed a stray PetscObjectTypeCompareAny() that compared the *matrix*
   X against the vector types VECSEQCUDA/VECMPICUDA (copy-paste from the
   vector solve); its result was dead, being overwritten by the
   MATSEQDENSECUDA type check below before any use. */
static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
{
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDense       *x = (Mat_SeqDense*)X->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscScalar        *dx;
  cusolverDnHandle_t handle;
  PetscBool          iscuda;
  int                nrhs,n,lda,ldx;
#if defined(PETSC_USE_DEBUG)
  int                info;
#endif
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  PetscErrorCode     ierr;

  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  if (X != B) {
    MatCopy(B,X,SAME_NONZERO_PATTERN);
  }
  MatDenseCUDAGetArrayRead(A,&da);
  /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
  PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
  if (!iscuda) {
    MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
  }
  MatDenseCUDAGetArray(X,&dx);
  PetscMPIIntCast(A->rmap->n,&n);
  PetscMPIIntCast(X->cmap->n,&nrhs);
  PetscMPIIntCast(a->lda,&lda);
  /* NOTE(review): x was captured from X->data before the in-place conversion
     above; looks safe because the conversion keeps the Mat_SeqDense data, but
     confirm x->lda is still valid here */
  PetscMPIIntCast(x->lda,&ldx);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscLogGpuTimeBegin();
  if (A->factortype == MAT_FACTOR_LU) {
    PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
    cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
    if (!dA->d_fact_ipiv) { /* spd */
      /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
      cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
  ccer = WaitForCUDA();CHKERRCUDA(ccer);
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayRead(A,&da);
  MatDenseCUDARestoreArray(X,&dx);
  /* convert X back so the caller sees the type it passed in */
  if (!iscuda) {
    MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
  }
#if defined(PETSC_USE_DEBUG)
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  PetscLogGpuFlops(nrhs*(2.0*n*n - n));
  return(0);
}

/* Common kernel of MatSolve/MatSolveTranspose: solve A y = x (or A^T y = x
   when trans is PETSC_TRUE) with one right-hand side, using the device
   factors.  Non-CUDA vectors are routed through the cached workvec so the
   cuSOLVER call always operates on device memory. */
438: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
439: {
440:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
441:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
442:   const PetscScalar  *da;
443:   PetscScalar        *y;
444:   cusolverDnHandle_t handle;
445:   int                one = 1,n,lda;
446: #if defined(PETSC_USE_DEBUG)
447:   int                info;
448: #endif
449:   cudaError_t        ccer;
450:   cusolverStatus_t   cerr;
451:   PetscBool          iscuda;
452:   PetscErrorCode     ierr;

455:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
456:   if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
457:   PetscMPIIntCast(A->rmap->n,&n);
458:   /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
459:   PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
460:   if (iscuda) {
       /* solve in place in yy: copy rhs in, then hand its device array to cuSOLVER */
461:     VecCopy(xx,yy);
462:     VecCUDAGetArray(yy,&y);
463:   } else {
       /* non-CUDA output vector: solve into the lazily created device workvec */
464:     if (!dA->workvec) {
465:       MatCreateVecs(A,&dA->workvec,NULL);
466:     }
467:     VecCopy(xx,dA->workvec);
468:     VecCUDAGetArray(dA->workvec,&y);
469:   }
470:   MatDenseCUDAGetArrayRead(A,&da);
471:   PetscMPIIntCast(a->lda,&lda);
472:   PetscCUSOLVERDnGetHandle(&handle);
473:   PetscLogGpuTimeBegin();
474:   if (A->factortype == MAT_FACTOR_LU) {
475:     PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
476:     cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
477:   } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
478:     PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
479:     if (!dA->d_fact_ipiv) { /* spd */
480:       /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
481:       cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
482:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
483:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
484:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
485:   PetscLogGpuTimeEnd();
486:   if (iscuda) {
487:     VecCUDARestoreArray(yy,&y);
488:   } else {
       /* copy the device result back into the caller's non-CUDA vector */
489:     VecCUDARestoreArray(dA->workvec,&y);
490:     VecCopy(dA->workvec,yy);
491:   }
492:   MatDenseCUDARestoreArrayRead(A,&da);
493: #if defined(PETSC_USE_DEBUG)
494:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
495:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
496:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
497: #endif
498:   PetscLogGpuFlops(2.0*n*n - n);
499:   return(0);
500: }

/* MatSolve callback: forward to the shared helper with trans = PETSC_FALSE. */
static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
  return(0);
}

/* MatSolveTranspose callback: forward to the shared helper with trans = PETSC_TRUE. */
static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
  return(0);
}

/* In-place LU factorization on the device via cusolverDn*getrf; pivots,
   workspace and the info flag are cached in Mat_SeqDenseCUDA for reuse by the
   solve callbacks installed at the end.  rperm/cperm are ignored (getrf does
   its own partial pivoting). */
520: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
521: {
522:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
523:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
524:   PetscScalar        *da;
525:   int                m,n,lda;
526: #if defined(PETSC_USE_DEBUG)
527:   int                info;
528: #endif
529:   cusolverStatus_t   cerr;
530:   cusolverDnHandle_t handle;
531:   cudaError_t        ccer;
532:   PetscErrorCode     ierr;

535:   if (!A->rmap->n || !A->cmap->n) return(0);
536:   PetscCUSOLVERDnGetHandle(&handle);
537:   MatDenseCUDAGetArray(A,&da);
538:   PetscMPIIntCast(A->cmap->n,&n);
539:   PetscMPIIntCast(A->rmap->n,&m);
540:   PetscMPIIntCast(a->lda,&lda);
541:   PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
      /* allocate pivot/workspace/info buffers once; subsequent factorizations reuse them */
542:   if (!dA->d_fact_ipiv) {
543:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
544:   }
545:   if (!dA->fact_lwork) {
546:     cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
547:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
548:   }
549:   if (!dA->d_fact_info) {
550:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
551:   }
552:   PetscLogGpuTimeBegin();
553:   cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
554:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
555:   PetscLogGpuTimeEnd();
556:   MatDenseCUDARestoreArray(A,&da);
557: #if defined(PETSC_USE_DEBUG)
558:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
559:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
560:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
561: #endif
562:   A->factortype = MAT_FACTOR_LU;
563:   PetscLogGpuFlops(2.0*n*n*m/3.0);

565:   A->ops->solve          = MatSolve_SeqDenseCUDA;
566:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
567:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

569:   PetscFree(A->solvertype);
570:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
571:   return(0);
572: }

/* In-place Cholesky factorization (lower triangle) on the device via
   cusolverDn*potrf.  Only SPD matrices are supported: the indefinite sytrf
   path is kept below under "#if 0" until cuSOLVER gains the matching *sytrs
   solve.  perm is ignored. */
574: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
575: {
576:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
577:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
578:   PetscScalar        *da;
579:   int                n,lda;
580: #if defined(PETSC_USE_DEBUG)
581:   int                info;
582: #endif
583:   cusolverStatus_t   cerr;
584:   cusolverDnHandle_t handle;
585:   cudaError_t        ccer;
586:   PetscErrorCode     ierr;

589:   if (!A->rmap->n || !A->cmap->n) return(0);
590:   PetscCUSOLVERDnGetHandle(&handle);
591:   PetscMPIIntCast(A->rmap->n,&n);
592:   PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
593:   if (A->spd) {
594:     MatDenseCUDAGetArray(A,&da);
595:     PetscMPIIntCast(a->lda,&lda);
       /* workspace and info buffers are allocated once and cached for refactorizations */
596:     if (!dA->fact_lwork) {
597:       cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
598:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
599:     }
600:     if (!dA->d_fact_info) {
601:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
602:     }
603:     PetscLogGpuTimeBegin();
604:     cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
605:     ccer = WaitForCUDA();CHKERRCUDA(ccer);
606:     PetscLogGpuTimeEnd();

608:     MatDenseCUDARestoreArray(A,&da);
609: #if defined(PETSC_USE_DEBUG)
610:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
611:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
612:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
613: #endif
614:     A->factortype = MAT_FACTOR_CHOLESKY;
615:     PetscLogGpuFlops(1.0*n*n*n/3.0);
616:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
617: #if 0
618:     /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
619:        The code below should work, and it can be activated when *sytrs routines will be available */
620:     if (!dA->d_fact_ipiv) {
621:       ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
622:     }
623:     if (!dA->fact_lwork) {
624:       cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
625:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
626:     }
627:     if (!dA->d_fact_info) {
628:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
629:     }
630:     PetscLogGpuTimeBegin();
631:     cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
632:     PetscLogGpuTimeEnd();
633: #endif

635:   A->ops->solve          = MatSolve_SeqDenseCUDA;
636:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
637:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

639:   PetscFree(A->solvertype);
640:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
641:   return(0);
642: }

/* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition.
   Non-CUDA dense operands are converted in place to MATSEQDENSECUDA for the
   cublasXgemm call and converted back afterwards so callers keep their type.

   Fix: the empty-product early return (m, n or k zero) used to happen *after*
   the in-place conversions, leaving a caller's MATSEQDENSE operand silently
   converted; the dimensions depend only on the layouts, so the check is now
   performed before any conversion. */
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
{
  const PetscScalar *da,*db;
  PetscScalar       *dc;
  PetscScalar       one=1.0,zero=0.0;
  int               m,n,k;
  PetscInt          alda,blda,clda;
  PetscErrorCode    ierr;
  cublasHandle_t    cublasv2handle;
  PetscBool         Aiscuda,Biscuda;
  cublasStatus_t    berr;
  cudaError_t       cerr;

  PetscMPIIntCast(C->rmap->n,&m);
  PetscMPIIntCast(C->cmap->n,&n);
  if (tA) {
    PetscMPIIntCast(A->rmap->n,&k);
  } else {
    PetscMPIIntCast(A->cmap->n,&k);
  }
  /* quit before any in-place conversion so empty products leave A and B untouched */
  if (!m || !n || !k) return(0);
  /* we may end up with SEQDENSE as one of the arguments */
  PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
  PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
  if (!Aiscuda) {
    MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
  }
  if (!Biscuda) {
    MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
  }
  PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
  MatDenseCUDAGetArrayRead(A,&da);
  MatDenseCUDAGetArrayRead(B,&db);
  MatDenseCUDAGetArrayWrite(C,&dc);
  MatDenseGetLDA(A,&alda);
  MatDenseGetLDA(B,&blda);
  MatDenseGetLDA(C,&clda);
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
                     m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
  MatDenseCUDARestoreArrayRead(A,&da);
  MatDenseCUDARestoreArrayRead(B,&db);
  MatDenseCUDARestoreArrayWrite(C,&dc);
  /* restore the callers' original matrix types */
  if (!Aiscuda) {
    MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
  }
  if (!Biscuda) {
    MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
  }
  return(0);
}

/* C = A^T * B: dispatch to the shared GEMM helper with tA set. */
PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{
  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
  return(0);
}

/* C = A * B on the GPU; thin wrapper over the shared cublasXgemm helper (no transposes) */
PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{

  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
  return(0);
}

/* C = A * B^T on the GPU; thin wrapper over the shared cublasXgemm helper (tB = PETSC_TRUE) */
PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
{

  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
  return(0);
}

/* MatProduct configuration for SEQDENSECUDA simply reuses the SEQDENSE logic;
   the CUDA numeric kernels are installed via the type's ops table */
PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
{

  MatProductSetFromOptions_SeqDense(C);
  return(0);
}

738: /* zz = op(A)*xx + yy
739:    if yy == NULL, only MatMult */
740: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
741: {
742:   Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
743:   const PetscScalar *xarray,*da;
744:   PetscScalar       *zarray;
745:   PetscScalar       one=1.0,zero=0.0;
746:   int               m, n, lda; /* Use PetscMPIInt as it is typedef'ed to int */
747:   cublasHandle_t    cublasv2handle;
748:   cublasStatus_t    berr;
749:   PetscErrorCode    ierr;

752:   if (yy && yy != zz) { /* mult add */
753:     VecCopy_SeqCUDA(yy,zz);
754:   }
755:   if (!A->rmap->n || !A->cmap->n) {
756:     if (!yy) { /* mult only */
757:       VecSet_SeqCUDA(zz,0.0);
758:     }
759:     return(0);
760:   }
761:   PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
762:   PetscMPIIntCast(A->rmap->n,&m);
763:   PetscMPIIntCast(A->cmap->n,&n);
764:   PetscMPIIntCast(mat->lda,&lda);
765:   PetscCUBLASGetHandle(&cublasv2handle);
766:   MatDenseCUDAGetArrayRead(A,&da);
767:   VecCUDAGetArrayRead(xx,&xarray);
768:   VecCUDAGetArray(zz,&zarray);
769:   PetscLogGpuTimeBegin();
770:   berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
771:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
772:   PetscLogGpuTimeEnd();
773:   PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
774:   VecCUDARestoreArrayRead(xx,&xarray);
775:   VecCUDARestoreArray(zz,&zarray);
776:   MatDenseCUDARestoreArrayRead(A,&da);
777:   return(0);
778: }

/* zz = A*xx + yy */
PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
  return(0);
}

/* zz = A^T*xx + yy */
PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
  return(0);
}

/* yy = A*xx (NULL yy selects the mult-only path in the helper) */
PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
  return(0);
}

/* yy = A^T*xx (NULL yy selects the mult-only path in the helper) */
PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
{

  MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
  return(0);
}

/* Read-only access to the host array: flush any GPU data to the CPU first,
   but do not change the offload mask (the caller promises not to modify) */
static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

  MatSeqDenseCUDACopyFromGPU(A);
  *array = mat->v;
  return(0);
}

/* Write-only access to the host array: no GPU->CPU copy is needed since the
   previous contents will be overwritten; mark the CPU copy as the valid one */
static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

  if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
    MatSeqDenseSetPreallocation(A,NULL);
  }
  *array = mat->v;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

/* Read-write access to the host array: flush GPU data to the CPU, then mark
   the CPU copy as the valid one since the caller may modify it */
static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
{
  Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

  MatSeqDenseCUDACopyFromGPU(A);
  *array = mat->v;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

/* Y = alpha*Y on the GPU using cublasXscal */
PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
{
  Mat_SeqDense   *y = (Mat_SeqDense*)Y->data;
  PetscScalar    *dy;
  int            j,N,m,lday,one = 1;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  cudaError_t    cerr;

  PetscCUBLASGetHandle(&cublasv2handle);
  MatDenseCUDAGetArray(Y,&dy);
  PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
  PetscMPIIntCast(Y->rmap->n,&m);
  PetscMPIIntCast(y->lda,&lday);
  PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (lday>m) { /* rows are padded (lda > m): scale column by column to skip the padding */
    for (j=0; j<Y->cmap->n; j++) {
      berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
    }
  } else { /* contiguous storage: one scal over all m*n entries */
    berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(N);
  MatDenseCUDARestoreArray(Y,&dy);
  return(0);
}

/* Y = alpha*X + Y on the GPU using cublasXaxpy; str (MatStructure) is ignored for dense */
PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
{
  Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
  Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
  const PetscScalar *dx;
  PetscScalar       *dy;
  int               j,N,m,ldax,lday,one = 1;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    berr;
  PetscErrorCode    ierr;
  cudaError_t       cerr;

  if (!X->rmap->n || !X->cmap->n) return(0);
  PetscCUBLASGetHandle(&cublasv2handle);
  MatDenseCUDAGetArrayRead(X,&dx);
  if (alpha != 0.0) { /* Y is read-modify-write */
    MatDenseCUDAGetArray(Y,&dy);
  } else { /* alpha == 0: Y's previous values are irrelevant, write-only access avoids a copy */
    MatDenseCUDAGetArrayWrite(Y,&dy);
  }
  PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
  PetscMPIIntCast(X->rmap->n,&m);
  PetscMPIIntCast(x->lda,&ldax);
  PetscMPIIntCast(y->lda,&lday);
  PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
  PetscLogGpuTimeBegin();
  if (ldax>m || lday>m) { /* padded rows in either operand: axpy column by column */
    for (j=0; j<X->cmap->n; j++) {
      berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
    }
  } else { /* both contiguous: one axpy over all m*n entries */
    berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(PetscMax(2.*N-1,0));
  MatDenseCUDARestoreArrayRead(X,&dx);
  if (alpha != 0.0) {
    MatDenseCUDARestoreArray(Y,&dy);
  } else {
    MatDenseCUDARestoreArrayWrite(Y,&dy);
  }
  return(0);
}

/* Free all GPU-side resources hanging off A->spptr (device values, factorization
   workspace, work vector) and the Mat_SeqDenseCUDA struct itself */
static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  if (dA) {
    if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
    /* only free d_v if PETSc allocated it; user-provided arrays are owned by the caller */
    if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
    cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
    cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
    VecDestroy(&dA->workvec);
  }
  PetscFree(A->spptr);
  return(0);
}

/* Destroy a SEQDENSECUDA matrix: demote it in place to SEQDENSE (which releases
   the GPU resources) and then run the plain SEQDENSE destructor */
PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  /* avoid copying the data back to the CPU if we own the data pointer */
  if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
  MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
  MatDestroy_SeqDense(A);
  return(0);
}

/* Duplicate a SEQDENSECUDA matrix; when values are copied and a valid GPU copy
   exists, copy device-to-device to avoid a round trip through the host */
PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
{

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
  MatSetType(*B,((PetscObject)A)->type_name);
  MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
  if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
    Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
    const PetscScalar *da;
    PetscScalar       *db;
    cudaError_t       cerr;
    PetscInt          ldb;

    MatDenseCUDAGetArrayRead(A,&da);
    MatDenseCUDAGetArrayWrite(*B,&db);
    MatDenseGetLDA(*B,&ldb);
    if (a->lda > A->rmap->n || ldb > A->rmap->n) { /* padded rows: copy column by column */
      PetscInt j,m = A->rmap->n;

      for (j=0; j<A->cmap->n; j++) { /* it can be done better */
        cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    } else { /* contiguous on both sides: single bulk copy */
      cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
    MatDenseCUDARestoreArrayRead(A,&da);
    MatDenseCUDARestoreArrayWrite(*B,&db);
    (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

998: #include <petsc/private/vecimpl.h>

1000: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1001: {
1002:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1003:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1004:   PetscErrorCode   ierr;
1005:   PetscScalar      *x;
1006:   PetscBool        viscuda;
1007:   cudaError_t      cerr;

1010:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1011:   if (viscuda && !v->boundtocpu) { /* update device data */
1012:     VecCUDAGetArrayWrite(v,&x);
1013:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1014:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToHost);CHKERRCUDA(cerr);
1015:     } else {
1016:       cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1017:     }
1018:     VecCUDARestoreArrayWrite(v,&x);
1019:   } else { /* update host data */
1020:     VecGetArrayWrite(v,&x);
1021:     if (A->offloadmask & PETSC_OFFLOAD_CPU) {
1022:       PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1023:     } else {
1024:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1025:     }
1026:     VecRestoreArrayWrite(v,&x);
1027:   }
1028:   return(0);
1029: }

/* Create an (unfactored) SEQDENSECUDA matrix configured as a factor of A for the
   MATSOLVERCUDA backend; installs the dense symbolic factorization routines */
PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
{

  MatCreate(PetscObjectComm((PetscObject)A),fact);
  MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
  MatSetType(*fact,MATSEQDENSECUDA);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
    (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
  } else {
    (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
  }
  (*fact)->factortype = ftype;
  PetscFree((*fact)->solvertype);
  PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
  /* dense factorizations use no fill-reducing ordering */
  PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_LU]);
  PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
  PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
  PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
  return(0);
}

/* Expose column 'col' of A as a read-write CUDA Vec (the cached a->cvec),
   pointing directly into A's device storage; must be paired with RestoreColumnVec */
static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
    VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
    PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* +1 so that column 0 still reads as "in use" */
  VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v   = a->cvec;
  return(0);
}

/* Release the column vector obtained with MatDenseGetColumnVec_SeqDenseCUDA */
static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
  if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
  a->vecinuse = 0;
  VecCUDAResetArray(a->cvec);
  MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
  *v   = NULL;
  return(0);
}

/* Expose column 'col' of A as a read-only CUDA Vec; the vector is read-locked
   until MatDenseRestoreColumnVecRead is called */
static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
    VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
    PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* +1 so that column 0 still reads as "in use" */
  VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
  VecLockReadPush(a->cvec);
  *v   = a->cvec;
  return(0);
}

/* Release the read-only column vector obtained with MatDenseGetColumnVecRead_SeqDenseCUDA */
static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
  if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
  a->vecinuse = 0;
  VecLockReadPop(a->cvec);
  VecCUDAResetArray(a->cvec);
  MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
  *v   = NULL;
  return(0);
}

/* Expose column 'col' of A as a write-only CUDA Vec (no valid initial contents
   are guaranteed); must be paired with MatDenseRestoreColumnVecWrite */
static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
  if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
    VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
    PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
  }
  a->vecinuse = col + 1; /* +1 so that column 0 still reads as "in use" */
  VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
  *v   = a->cvec;
  return(0);
}

/* Release the write-only column vector obtained with MatDenseGetColumnVecWrite_SeqDenseCUDA */
static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
  if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
  a->vecinuse = 0;
  VecCUDAResetArray(a->cvec);
  MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
  *v   = NULL;
  return(0);
}

/* Expose columns [cbegin,cend) of A as a SEQDENSECUDA matrix (the cached a->cmat)
   that aliases A's storage; must be paired with MatDenseRestoreSubMatrix */
static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
{
  Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  if (a->cmat && cend-cbegin != a->cmat->cmap->N) { /* cached submatrix has the wrong width: rebuild */
    MatDestroy(&a->cmat);
  }
  MatSeqDenseCUDACopyToGPU(A);
  if (!a->cmat) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
    PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
  } else {
    MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
  }
  MatDenseSetLDA(a->cmat,a->lda); /* columns of the submatrix keep A's leading dimension */
  if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
  a->cmat->offloadmask = A->offloadmask;
  a->matinuse = cbegin + 1; /* +1 so that cbegin == 0 still reads as "in use" */
  *v = a->cmat;
  return(0);
}

/* Release the aliasing submatrix obtained with MatDenseGetSubMatrix_SeqDenseCUDA;
   any changes made through the submatrix live on the GPU, hence the offload mask update */
static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

  if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
  if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
  if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
  a->matinuse = 0;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  MatDenseCUDAResetArray(a->cmat);
  MatDenseResetArray(a->cmat);
  *v = NULL;
  return(0);
}

/* Set the leading dimension of A's storage; only legal before PETSc-owned
   device storage has been allocated (user-provided arrays may always relayout) */
static PetscErrorCode  MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscBool        data;

  /* device data exists only if the matrix is non-empty and d_v has been allocated */
  data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
  if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
  if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
  cA->lda = lda;
  return(0);
}

1216: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1217: {
1218:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1222:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1223:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1224:   A->boundtocpu = flg;
1225:   if (!flg) {
1226:     PetscBool iscuda;

1228:     PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1229:     if (!iscuda) {
1230:       VecDestroy(&a->cvec);
1231:     }
1232:     PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1233:     if (!iscuda) {
1234:       MatDestroy(&a->cmat);
1235:     }
1236:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1237:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1238:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1239:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1240:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1241:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1242:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1243:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1244:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1245:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1246:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1247:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);

1249:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1250:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1251:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1252:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1253:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1254:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1255:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1256:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1257:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1258:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1259:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1260:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1261:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1262:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1263:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1264:   } else {
1265:     /* make sure we have an up-to-date copy on the CPU */
1266:     MatSeqDenseCUDACopyFromGPU(A);
1267:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1268:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1269:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1270:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1271:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1272:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1273:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1274:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1275:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1276:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1277:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1278:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);

1280:     A->ops->duplicate               = MatDuplicate_SeqDense;
1281:     A->ops->mult                    = MatMult_SeqDense;
1282:     A->ops->multadd                 = MatMultAdd_SeqDense;
1283:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1284:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1285:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1286:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1287:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1288:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1289:     A->ops->axpy                    = MatAXPY_SeqDense;
1290:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1291:     A->ops->lufactor                = MatLUFactor_SeqDense;
1292:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1293:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1294:     A->ops->scale                   = MatScale_SeqDense;
1295:     A->ops->copy                    = MatCopy_SeqDense;
1296:   }
1297:   if (a->cmat) {
1298:     MatBindToCPU(a->cmat,flg);
1299:   }
1300:   return(0);
1301: }

/* Convert a SEQDENSECUDA matrix to plain SEQDENSE. Only MAT_INPLACE_MATRIX gets
   the optimized in-place demotion; the other reuse modes fall back to MatConvert_Basic */
PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat              B;
  PetscErrorCode   ierr;

  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M,type,reuse,newmat);
    return(0);
  }

  B    = *newmat;
  /* bind to CPU first: copies data off the GPU and installs the SeqDense ops */
  MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
  MatReset_SeqDenseCUDA(B);
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
  /* remove all CUDA-specific composed query functions */
  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);

  B->ops->bindtocpu = NULL;
  B->ops->destroy = MatDestroy_SeqDense;
  B->offloadmask = PETSC_OFFLOAD_CPU;
  return(0);
}

/* Convert a SEQDENSE matrix to SEQDENSECUDA. Only MAT_INPLACE_MATRIX gets the
   optimized in-place promotion; the other reuse modes fall back to MatConvert_Basic */
PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat_SeqDenseCUDA *dB;
  Mat              B;
  PetscErrorCode   ierr;

  PetscCUDAInitializeCheck();
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    /* TODO these cases should be optimized */
    MatConvert_Basic(M,type,reuse,newmat);
    return(0);
  }

  B    = *newmat;
  PetscFree(B->defaultvectype);
  PetscStrallocpy(VECCUDA,&B->defaultvectype);
  PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
  /* install the CUDA-specific composed query functions */
  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",            MatConvert_SeqDenseCUDA_SeqDense);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",                        MatDenseCUDAGetArray_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",                    MatDenseCUDAGetArrayRead_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",                   MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",                    MatDenseCUDARestoreArray_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",                MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",               MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",                      MatDenseCUDAPlaceArray_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",                      MatDenseCUDAResetArray_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",                    MatDenseCUDAReplaceArray_SeqDenseCUDA);
  PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);

  PetscNewLog(B,&dB);
  B->spptr = dB;

  /* device storage is allocated lazily (on first GPU access) */
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
  B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
  B->ops->destroy  = MatDestroy_SeqDenseCUDA;
  return(0);
}

/*@C
   MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of rows
.  n - number of columns
-  data - optional location of GPU matrix data.  Set data=NULL for PETSc
   to control matrix memory allocation.

   Output Parameter:
.  A - the matrix

   Notes:
   The communicator must have exactly one rank, since the matrix is sequential.
   When data is provided, it must be a device pointer and the caller retains
   ownership of it.

   Level: intermediate

.seealso: MatCreate(), MatCreateSeqDense()
@*/
PetscErrorCode  MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
{
  PetscMPIInt    size;

  MPI_Comm_size(comm,&size);
  if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
  MatCreate(comm,A);
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQDENSECUDA);
  MatSeqDenseCUDASetPreallocation(*A,data);
  return(0);
}

/*MC
   MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

   Options Database Keys:
. -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

  Level: beginner
M*/
/* Type constructor: build a plain SEQDENSE matrix and promote it in place to SEQDENSECUDA */
PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
{

  PetscCUDAInitializeCheck();
  MatCreate_SeqDense(B);
  MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
  return(0);
}