Actual source code: aijcusparseband.cu

petsc-main 2021-04-20
/*
  AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSPARSE/Thrust maps from AIJCUSPARSE
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#include <cooperative_groups.h>
#endif

#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
 } while (0)

/*
  LU BAND factorization with an optimization for block-diagonal matrices (Nf blocks) in natural order
  (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
     a structurally symmetric matrix: fix with transpose/column metadata
*/
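/*
  Storage sketch (illustration only, not part of PETSc): the factor is kept as a
  ragged band in CSR-like form, where row r stores min(r,bw) entries of L, the
  diagonal, and min(n-1-r,bw) entries of U; the triangular "ears" at the corners
  are chopped off. For n = 5, bw = 2 the per-row nonzero counts are

      row:  0  1  2  3  4
      nnz:  3  4  5  4  3   ->  bi_csr = {0, 3, 7, 12, 16, 19}

  so nnz(B) = n + (2n-1)*bw - bw^2 = 5 + 9*2 - 4 = 19, matching nzBcsr below.
*/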

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);

/*
  The GPU LU factor kernels
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // set i (row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x==0) {
      PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
    }
  }
}
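/*
  A minimal CPU cross-check (illustration only, not part of PETSc; the helper name is
  hypothetical): accumulates the same row offsets that the closed form above computes
  directly, by summing the per-row lengths min(r,bw) + 1 + min(n-1-r,bw).
*/
static void band_row_offsets_host(PetscInt n, PetscInt bw, int bi[])
{
  bi[0] = 0;
  for (PetscInt r = 0; r < n; r++) {
    const PetscInt nlow = (r > bw) ? bw : r;          /* L entries left of the diagonal  */
    const PetscInt nup  = (n-1-r > bw) ? bw : n-1-r;  /* U entries right of the diagonal */
    bi[r+1] = bi[r] + (int)(nlow + 1 + nup);          /* +1 for the diagonal itself      */
  }
}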
// copy AIJ to AIJ_BAND
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
        if (j<nzb) {
          batmp[j] = 0;
        }
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
        if (j<nza) {
          PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
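/*
  A small index-mapping sketch (illustration only, not part of PETSc; the helper name
  is hypothetical): position of permuted entry (row,col) within its band row, matching
  the bjStart offset used in the copy kernel above. Valid only when |row-col| <= bw.
*/
static __host__ __device__ inline PetscInt band_col_index(PetscInt row, PetscInt col, PetscInt bw)
{
  const PetscInt bjStart = (row > bw) ? row - bw : 0; /* first column stored for this row   */
  return col - bjStart;                               /* offset within ba_csr + bi_csr[row] */
}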
// print AIJ_BAND
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}
// Band LU kernel ---  ba_csr bi_csr
__global__
void __launch_bounds__(1024,1)
  mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt          tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter-field ears
    const PetscInt    nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data  U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt    offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    if (threadIdx.x==0) {
      for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd;
      }
    }
    __syncthreads(); // sync on threadIdx.x only
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
      const PetscInt    bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
      PetscScalar       *Aid = ba_csr + bi_csr[myi] + kIdx;
      PetscScalar       *Aij =  Aid + 1;
      const PetscScalar Lid  = *Aid;
      for (int jIdx=threadIdx.x ; jIdx<nzUd; jIdx += blockDim.x) {
        Aij[jIdx] -= Lid*baUd[jIdx];
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (use_group_sync) {
      g.sync();
    } else {
      __syncthreads();
    }
#else
    __syncthreads();
#endif
  } /* end of for (glbDD=start; glbDD<end; glbDD++) */
}
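/*
  A serial reference of the same factorization (illustration only, not part of PETSc;
  names are hypothetical): right-looking banded LU without pivoting for the Nf = 1
  case, on the same ragged-band layout, where entry (r,c) with |r-c| <= bw lives at
  ba[bi[r] + c - max(r-bw,0)]. The kernel above performs these same updates,
  parallelized over rows (threadIdx.y and blocks) and columns (threadIdx.x).
*/
static void band_lu_host(PetscInt n, PetscInt bw, const int bi[], PetscScalar ba[])
{
#define BAND_AT(r,c) ba[bi[r] + (c) - (((r) > bw) ? (r) - bw : 0)]
  for (PetscInt d = 0; d < n; d++) {                  /* eliminate pivot d */
    const PetscInt    iend = (d + bw < n-1) ? d + bw : n-1;
    const PetscScalar Bdd  = BAND_AT(d,d);
    for (PetscInt i = d+1; i <= iend; i++) {
      const PetscScalar Lid = (BAND_AT(i,d) /= Bdd);  /* column of L */
      for (PetscInt j = d+1; j <= iend; j++) BAND_AT(i,j) -= Lid*BAND_AT(d,j); /* A22 -= l*u' */
    }
  }
#undef BAND_AT
}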

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
  int                          *bi_t = cusparseTriFactors->i_band_d;
  PetscContainer               container;
  int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;

  if (A->rmap->n == 0) {
    return(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  // get data
  ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
  r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeBegin();
  {
    int bw = (int)(2.*(double)n-1. - (double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1; // no cooperative groups: one thread block per field
#else
    if (!cusparseTriFactors->init_dev_prop) {
      int gpuid;
      cusparseTriFactors->init_dev_prop = PETSC_TRUE;
      cudaGetDevice(&gpuid);
      cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
    }
    nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
    Ni = nsm/Nf/nconcurrent;
#endif
    team_size = bw/Ni + !!(bw%Ni);
    nVec = PetscMin(bw, 1024/team_size);
    PetscInfo7(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n",bw,Ni,nconcurrent,Nf,nsm,team_size,nVec);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (Ni > 1) {
        void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t, (void*)&nsm };
        cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
      } else {
        mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));
#endif
    }
  }
  PetscLogGpuTimeEnd();
  /* determine which version of MatSolve needs to be used; from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  return(0);
}
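/*
  Where the inline "quadratic formula" above comes from (a sketch; the helper name is
  hypothetical and not part of PETSc): the symbolic phase allocates
  nz = n + (2n-1)*bw - bw^2 entries, so bw solves bw^2 - (2n-1)*bw + (nz-n) = 0, and
  taking the smaller root gives bw = ((2n-1) - sqrt(4n^2 + 1 - 4nz))/2.
*/
static int band_width_from_nnz(PetscInt n, PetscInt nz)
{
  const double disc = 1. + 4.*((double)n*(double)n - (double)nz); /* (2n-1)^2 - 4(nz-n) */
  return (int)((2.*(double)n - 1. - PetscSqrtReal((PetscReal)disc))/2.);
}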

static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscInt *nf = (PetscInt *)ptr;
  PetscFree(nf);
  return(0);
}
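/*
  User-side sketch (hypothetical helper, not part of PETSc): attach the number of
  fields to A before factoring so the band solver can exploit the block-diagonal
  structure. Values >= 1000 also encode a concurrency factor, decoded above as
  (*pNf)%1000 fields and (*pNf)/1000 concurrent SM groups.
*/
static PetscErrorCode MatComposeBandNf(Mat A, PetscInt Nf)
{
  PetscContainer container;
  PetscInt       *pNf;

  PetscMalloc(sizeof(PetscInt), &pNf);
  *pNf = Nf;
  PetscContainerCreate(PETSC_COMM_SELF, &container);
  PetscContainerSetPointer(container, (void *)pNf);
  PetscContainerSetUserDestroy(container, MatrixNfDestroy);
  PetscObjectCompose((PetscObject)A, "Nf", (PetscObject)container);
  PetscContainerDestroy(&container);
  return(0);
}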

PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
  IS                 isicol;
  cudaError_t        cerr;
  const PetscInt     *ic,*ai=a->i,*aj=a->j;
  PetscScalar        *ba_t;
  int                *bi_t;
  PetscInt           i,n=A->rmap->n,Nf;
  PetscInt           nzBcsr,bwL,bwU;
  PetscBool          missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  MatMissingDiagonal(A,&missing,&i);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
  if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");

  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
    PetscContainerCreate(PETSC_COMM_SELF, &container);
    PetscMalloc(sizeof(PetscInt), &pNf);
    *pNf = Nf;
    PetscContainerSetPointer(container, (void *)pNf);
    PetscContainerSetUserDestroy(container, MatrixNfDestroy);
    PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
    PetscContainerDestroy(&container);
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
  ISGetIndices(isicol,&ic);

  MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
  PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths; MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ISRestoreIndices(isicol,&ic);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
  PetscInfo2(A,"Matrix Bandwidth = %D, nnz = %D\n",bwL,b->nz);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
  PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ISGetIndices(isrow,&r);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ISRestoreIndices(isrow,&r);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ISGetIndices(isicol,&c);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ISRestoreIndices(isicol,&c);
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  PetscObjectReference((PetscObject)isrow);
  PetscObjectReference((PetscObject)iscol);
  b->icol = isicol;
  PetscMalloc1(n+1,&b->solve_work);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscInfo1(A,"Band fill ratio %g\n",(double)af);
  } else {
    PetscInfo(A,"Empty matrix\n");
  }
#endif
  if (a->inode.size) {
    PetscInfo(A,"Warning: using inodes in band solver.\n");
  }
  MatSeqAIJCheckInode_FactorLU(B);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  return(0);
}

/* Use -pc_factor_mat_solver_type cusparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSEBAND;
  return(0);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,n,n,n,n);
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
    PetscStrallocpy(MATORDERINGRCM,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
  return(0);
}
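/*
  Typical usage (a sketch built from the options named in this file; exact option
  spellings for a given PETSc build may differ):

    ./app -mat_type seqaijcusparse -pc_type lu \
          -pc_factor_mat_solver_type cusparseband \
          -mat_no_inode -pc_factor_mat_ordering_type rcm

  or, in code, MatGetFactor(A, MATSOLVERCUSPARSEBAND, MAT_FACTOR_LU, &F);
*/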

#define WARP_SIZE 32
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  T b;
  #pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns result in thread 0
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ T buf[NWARP];
  int wid = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T b = wreduce<T>(a);
  if (laneid == 0)
    buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP)
      a = buf[threadIdx.x];
    else
      a = 0;
    for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
      a += __shfl_down_sync(0xffffffff, a, i);
    }
  }
  return a;
}
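/*
  Usage sketch (illustration only, not part of PETSc; the kernel name is hypothetical):
  one block sums v[0..n) by combining per-thread partial sums with breduce; as in the
  solve kernel below, only thread 0 holds the final value.
*/
template <int BLOCK_SIZE>
__global__ void block_sum_example(const PetscReal v[], PetscInt n, PetscReal *out)
{
  PetscReal t = 0;
  for (PetscInt i = threadIdx.x; i < n; i += BLOCK_SIZE) t += v[i]; // per-thread partial sum
  t = breduce<PetscReal, BLOCK_SIZE>(t);                            // all threads must participate
  if (threadIdx.x == 0) *out = t;
}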

// Band LU solve kernel ---  ba_csr bi_csr
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
  const PetscScalar *pLi;
  const int tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw);
    PetscScalar t = 0;
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // /1.0
    __syncthreads();
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row

  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
    PetscScalar t = 0;
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, Nf;
  PetscInt                              bw = (int)(2.*(double)n-1.-(double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)nz))+PETSC_MACHINE_EPSILON))/2; // quadratic formula for bandwidth
  cudaError_t                           cerr;
  PetscContainer                        container;

  if (A->rmap->n == 0) {
    return(0);
  }
  // factor: get Nf if available
  PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
  if (container) {
    PetscInt *pNf=NULL;
    PetscContainerGetPointer(container, (void **) &pNf);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n(%D) % Nf(%D) != 0",n,Nf);

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}