Actual source code: ex52_integrateElement.cu

petsc-3.4.5 2014-06-29
#include <petscsys.h>

#include "ex52_gpu.h"
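
/* f_1 term of the weak form for the Laplacian: for field component `comp`,
   f_1(u, grad u) is simply the gradient of that component */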

__device__ vecType f1_laplacian(realType u[], vecType gradU[], int comp)
{
  return gradU[comp];
}
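
/* f_1 term for linear elasticity: for field component `comp`, f_1 is row
   `comp` of the symmetric gradient 0.5*(grad u + (grad u)^T), specialized
   below for 2 and 3 spatial dimensions */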

#if (SPATIAL_DIM_0 == 2)

__device__ vecType f1_elasticity(realType u[], vecType gradU[], int comp)
{
  vecType f1;

  switch (comp) {
  case 0:
    f1.x = 0.5*(gradU[0].x + gradU[0].x);
    f1.y = 0.5*(gradU[0].y + gradU[1].x);
    break;
  case 1:
    f1.x = 0.5*(gradU[1].x + gradU[0].y);
    f1.y = 0.5*(gradU[1].y + gradU[1].y);
  }
  return f1;
}

#elif (SPATIAL_DIM_0 == 3)

__device__ vecType f1_elasticity(realType u[], vecType gradU[], int comp)
{
  vecType f1;

  switch (comp) {
  case 0:
    f1.x = 0.5*(gradU[0].x + gradU[0].x);
    f1.y = 0.5*(gradU[0].y + gradU[1].x);
    f1.z = 0.5*(gradU[0].z + gradU[2].x);
    break;
  case 1:
    f1.x = 0.5*(gradU[1].x + gradU[0].y);
    f1.y = 0.5*(gradU[1].y + gradU[1].y);
    f1.z = 0.5*(gradU[1].z + gradU[2].y);
    break;
  case 2:
    f1.x = 0.5*(gradU[2].x + gradU[0].z);
    f1.y = 0.5*(gradU[2].y + gradU[1].z);
    f1.z = 0.5*(gradU[2].z + gradU[2].z);
  }
  return f1;
}

#else

#error "Invalid spatial dimension"

#endif

// dim     Number of spatial dimensions:          2
// N_b     Number of basis functions:             generated
// N_{bt}  Number of total basis functions:       N_b * N_{comp}
// N_q     Number of quadrature points:           generated
// N_{bs}  Number of block cells:                 LCM(N_b, N_q)
// N_{bst} Number of block cell components:       LCM(N_{bt}, N_q)
// N_{bl}  Number of concurrent blocks:           generated
// N_t     Number of threads:                     N_{bl} * N_{bs}
// N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q
// N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b
// N_{sbc} Number of serial     basis      cells: N_{bs} / N_q
// N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b
// N_{cb}  Number of serial cell batches:         input
// N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp}
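//
// Purely illustrative example (hypothetical sizes, not generated values):
// with N_b = 3, N_q = 2, N_{comp} = 1, and N_{bl} = 1 we get N_{bt} = 3,
// N_{bs} = LCM(3, 2) = 6, and N_t = 6; each thread block then serially
// processes N_{sbc} = 6/2 = 3 cells in the basis phase and
// N_{sqc} = 6/3 = 2 cells in the quadrature phase.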

__global__ void integrateElementQuadrature(int N_cb, realType *coefficients, realType *jacobianInverses, realType *jacobianDeterminants, realType *elemVec)
{
  #include "ex52_gpu_inline.h"
  const int dim    = SPATIAL_DIM_0;
  const int N_b    = numBasisFunctions_0;           // The number of basis functions
  const int N_comp = numBasisComponents_0;          // The number of basis function components
  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions
  const int N_q    = numQuadraturePoints_0;         // The number of quadrature points
  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_bt, N_q); note that a block is not processed simultaneously
  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl
  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)
  const int N_c    = N_cb * N_bc;                   // The total number of cells
  const int N_sbc  = N_bst / (N_q * N_comp);        // The number of serial basis cells
  const int N_sqc  = N_bst / N_bt;                  // The number of serial quadrature cells

  /* Calculated indices */
  const int tidx    = threadIdx.x + blockDim.x*threadIdx.y;
  const int blidx   = tidx / N_bst;                  // Block number for this thread
  const int bidx    = tidx % N_bt;                   // Basis function mapped to this thread
  const int cidx    = tidx % N_comp;                 // Basis component mapped to this thread
  const int qidx    = tidx % N_q;                    // Quadrature point mapped to this thread
  const int blbidx  = tidx % N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase
  const int blqidx  = tidx % N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase
  const int gidx    = blockIdx.y*gridDim.x + blockIdx.x; // Linearized thread-block index
  const int Goffset = gidx*N_c;                      // Offset into the geometry arrays for this thread block
  const int Coffset = gidx*N_c*N_bt;                 // Offset into the coefficient array for this thread block
  const int Eoffset = gidx*N_c*N_bt;                 // Offset into the element vector for this thread block
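
  /* Illustrative only (hypothetical sizes): with N_b = 3, N_q = 2, N_comp = 1,
     and N_bl = 1 (so N_bst = N_t = 6), thread tidx = 4 gets blidx = 0,
     bidx = 1, cidx = 0, qidx = 0, blbidx = 0, and blqidx = 1. */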

  /* Quadrature data */
  realType             w;                   // $w_q$, Quadrature weight at $x_q$
//__shared__ realType  phi_i[N_bt*N_q];     // $\phi_i(x_q)$, Value of the basis function $i$ at $x_q$
  __shared__ vecType   phiDer_i[N_bt*N_q];  // $\frac{\partial\phi_i(x_q)}{\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$
  /* Geometric data */
  __shared__ realType  detJ[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$
  __shared__ realType  invJ[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$
  /* FEM data */
  __shared__ realType  u_i[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\mathcal{T}} = \sum_i u_i \phi_i$
  /* Intermediate calculations */
//__shared__ realType  f_0[N_t*N_sqc];      // $f_0(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
  __shared__ vecType   f_1[N_t*N_sqc];      // $f_1(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
  /* Output data */
  realType             e_i;                 // Coefficient $e_i$ of the residual

  /* These should be generated inline */
  /* Load quadrature weights */
  w = weights_0[qidx];
  /* Load basis tabulation \phi_i for this cell */
  if (tidx < N_bt*N_q) {
 // phi_i[tidx]    = Basis_0[tidx];
    phiDer_i[tidx] = BasisDerivatives_0[tidx];
  }

  for (int batch = 0; batch < N_cb; ++batch) {
    /* Load geometry */
    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];
    for (int n = 0; n < dim*dim; ++n) {
      const int offset = n*N_t;
      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];
    }
    /* Load coefficients u_i for this cell */
    for (int n = 0; n < N_bt; ++n) {
      const int offset = n*N_t;
      u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];
    }

    /* Map coefficients to values at quadrature points */
    for (int c = 0; c < N_sqc; ++c) {
      realType  u[N_comp];     // $u(x_q)$, Value of the field at $x_q$
      vecType   gradU[N_comp]; // $\nabla u(x_q)$, Value of the field gradient at $x_q$
   // vecType   x             = {0.0, 0.0};           // Quadrature point $x_q$
      const int cell          = c*N_bl*N_b + blqidx;
      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;

      for (int comp = 0; comp < N_comp; ++comp) {
        //u[comp] = 0.0;
#if SPATIAL_DIM_0 == 2
        gradU[comp].x = 0.0; gradU[comp].y = 0.0;
#elif  SPATIAL_DIM_0 == 3
        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;
#endif
      }
      /* Get field and derivatives at this quadrature point */
      for (int i = 0; i < N_b; ++i) {
        for (int comp = 0; comp < N_comp; ++comp) {
          const int b    = i*N_comp+comp;
          const int pidx = qidx*N_bt + b;
          const int uidx = cell*N_bt + b;
          vecType   realSpaceDer;

          // u[comp] += u_i[uidx]*phi_i[pidx];
#if SPATIAL_DIM_0 == 2
          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
          gradU[comp].x += u_i[uidx]*realSpaceDer.x;
          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
          gradU[comp].y += u_i[uidx]*realSpaceDer.y;
#elif  SPATIAL_DIM_0 == 3
          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
          gradU[comp].x += u_i[uidx]*realSpaceDer.x;
          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
          gradU[comp].y += u_i[uidx]*realSpaceDer.y;
          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
          gradU[comp].z += u_i[uidx]*realSpaceDer.z;
#endif
        }
      }
      /* Process values at quadrature points */
      f_1[fidx] = f1_func(u, gradU, cidx);
#if SPATIAL_DIM_0 == 2
      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;
#elif  SPATIAL_DIM_0 == 3
      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;
#endif
    }

    /* ==== TRANSPOSE THREADS ==== */
    __syncthreads();
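    /* f_1 was written above by threads in the quadrature-point layout and is
       read below in the basis-function layout, so all writes must complete
       before the second phase begins. */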

    /* Map values at quadrature points to coefficients */
    for (int c = 0; c < N_sbc; ++c) {
      const int cell = c*N_bl*N_q + blbidx;

      e_i = 0.0;
      for (int q = 0; q < N_q; ++q) {
        const int pidx = q*N_bt + bidx;
        const int fidx = (cell*N_q + q)*N_comp + cidx;
        vecType   realSpaceDer;

        // e_i += phi_i[pidx]*f_0[fidx];
#if SPATIAL_DIM_0 == 2
        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
        e_i           += realSpaceDer.x*f_1[fidx].x;
        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
        e_i           += realSpaceDer.y*f_1[fidx].y;
#elif  SPATIAL_DIM_0 == 3
        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.x*f_1[fidx].x;
        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.y*f_1[fidx].y;
        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.z*f_1[fidx].z;
#endif
      }
#if 0
      // Check f_1
      {
        const int q = 0;
        const int i = bidx/N_comp;
        // Prints f1[0].x, f1[1].x, f1[0].y, f1[1].y
        switch (i) {
        case 0:
          e_i = f_1[(cell*N_q+q)*N_comp+cidx].x; break;
        case 1:
          e_i = f_1[(cell*N_q+q)*N_comp+cidx].y; break;
        //case 2:
        //  e_i = f_1[(cell*N_q+q)*N_comp+cidx].z; break;
        default:
          e_i = 0.0;
        }
      }
      // Check that u_i is being used correctly
      //e_i = u_i[cell*N_bt+bidx];
      e_i = detJ[cell];
      //e_i = coefficients[Coffset+(batch*N_sbc+c)*N_t+tidx];
      //e_i = Coffset+(batch*N_sbc+c)*N_t+tidx;
      //e_i = cell*N_bt+bidx;
#endif
      /* Write element vector for N_{cbc} cells at a time */
      elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;
    }
    /* ==== Could do one write per batch ==== */
  }
  return;
}

__global__ void integrateLaplacianJacobianQuadrature()
{
  /* Placeholder kernel: Jacobian integration is not yet implemented */
  /* Map coefficients to values at quadrature points */
  /* Process values at quadrature points */
  /* Map values at quadrature points to coefficients */
  return;
}

// Calculate a conforming thread grid for N kernels
PetscErrorCode calculateGrid(const int N, const int blockSize, unsigned int& x, unsigned int& y, unsigned int& z)
{
  z = 1;
  if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
  const int Nblocks = N/blockSize;
  for (x = (int) (sqrt(Nblocks) + 0.5); x > 0; --x) {
    y = Nblocks/x;
    if (x*y == Nblocks) break;
  }
  if (x*y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
  return(0);
}
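
// Illustrative only (hypothetical sizes): calculateGrid(32, 4, x, y, z) gives
// Nblocks = 8; the search starts at x = (int)(sqrt(8) + 0.5) = 3, rejects it
// since 3*(8/3) = 6 != 8, then accepts x = 2, y = 4, yielding a (2, 4, 1) grid.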

/*
  IntegrateElementBatchGPU - Produces element vectors from input element solution and geometric information via quadrature

  Input Parameters:
+ Ne - The total number of cells, Nchunk * Ncb * Nbc
. Ncb - The number of serial cell batches
. Nbc - The number of cells per batch
. Nbl - The number of concurrent cell blocks per thread block
. coefficients - An array of the solution vector for each cell
. jacobianInverses - An array of the inverse Jacobian for each cell
. jacobianDeterminants - An array of the Jacobian determinant for each cell
. event - A PetscLogEvent, used to log flops
- debug - A flag for debugging information

  Output Parameter:
. elemVec - An array of the element vectors for each cell
*/
PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt Nbl, const PetscScalar coefficients[],
                                        const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
                                        PetscLogEvent event, PetscInt debug)
{
  #include "ex52_gpu_inline.h"
  const int dim    = SPATIAL_DIM_0;
  const int N_b    = numBasisFunctions_0;   // The number of basis functions
  const int N_comp = numBasisComponents_0;  // The number of basis function components
  const int N_bt   = N_b*N_comp;            // The total number of scalar basis functions
  const int N_q    = numQuadraturePoints_0; // The number of quadrature points
  const int N_bst  = N_bt*N_q;              // The block size, LCM(N_bt, N_q); note that a block is not processed simultaneously
  const int N_t    = N_bst*N_bl;            // The number of threads, N_bst * N_bl

  realType       *d_coefficients;
  realType       *d_jacobianInverses;
  realType       *d_jacobianDeterminants;
  realType       *d_elemVec;

  if (Nbl != N_bl) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsistent block size %d should be %d", Nbl, N_bl);
  if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
  if (!Ne) {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    PetscLogGetStageLog(&stageLog);
    PetscStageLogGetCurrent(stageLog, &stage);
    PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += 0.0;
    eventLog->eventInfo[event].flops += 0;
    return(0);
  }
  // Marshalling
  cudaMalloc((void**) &d_coefficients,         Ne*N_bt * sizeof(realType));
  cudaMalloc((void**) &d_jacobianInverses,     Ne*dim*dim * sizeof(realType));
  cudaMalloc((void**) &d_jacobianDeterminants, Ne * sizeof(realType));
  cudaMalloc((void**) &d_elemVec,              Ne*N_bt * sizeof(realType));
  if (sizeof(PetscReal) == sizeof(realType)) {
    cudaMemcpy(d_coefficients,         coefficients,         Ne*N_bt    * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianInverses,     jacobianInverses,     Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianDeterminants, jacobianDeterminants, Ne         * sizeof(realType), cudaMemcpyHostToDevice);
  } else {
    // PetscReal and realType differ (e.g., double vs. float), so convert element-wise
    realType *c, *jI, *jD;
    PetscInt i;

    PetscMalloc3(Ne*N_bt,realType,&c,Ne*dim*dim,realType,&jI,Ne,realType,&jD);
    for (i = 0; i < Ne*N_bt;    ++i) c[i]  = coefficients[i];
    for (i = 0; i < Ne*dim*dim; ++i) jI[i] = jacobianInverses[i];
    for (i = 0; i < Ne;         ++i) jD[i] = jacobianDeterminants[i];
    cudaMemcpy(d_coefficients,         c,  Ne*N_bt    * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianInverses,     jI, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianDeterminants, jD, Ne         * sizeof(realType), cudaMemcpyHostToDevice);
    PetscFree3(c,jI,jD);
  }
  // Kernel launch
  unsigned int x, y, z;
  calculateGrid(Ne, Ncb*Nbc, x, y, z);
  dim3 grid(x, y, z);
  dim3 block(Nbc*N_comp, 1, 1);
  cudaEvent_t start, stop;
  float       msElapsedTime;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  if (debug) {
    PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
                grid.x, grid.y, grid.z, block.x, block.y, block.z, Ncb);
    PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
  }
  cudaEventRecord(start, 0);
  integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&msElapsedTime, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  // Marshalling
  if (sizeof(PetscReal) == sizeof(realType)) {
    cudaMemcpy(elemVec, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
  } else {
    realType *eV;
    PetscInt i;

    PetscMalloc(Ne*N_bt * sizeof(realType), &eV);
    cudaMemcpy(eV, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
    for (i = 0; i < Ne*N_bt; ++i) elemVec[i] = eV[i];
    PetscFree(eV);
  }
  cudaFree(d_coefficients);
  cudaFree(d_jacobianInverses);
  cudaFree(d_jacobianDeterminants);
  cudaFree(d_elemVec);
  {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    PetscLogGetStageLog(&stageLog);
    PetscStageLogGetCurrent(stageLog, &stage);
    PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += msElapsedTime*1.0e-3;
    eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
  }
  return(0);
}
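
/*
  A minimal, hypothetical calling sketch (not part of the original example).
  It assumes the caller has filled `coeffs`, `invJ`, `detJ`, and `elemVec`
  with Ne cells of data in the layout described above; the event name and
  variable names are illustrative only.
*/
#if 0
  PetscLogEvent  integrationEvent;
  PetscErrorCode ierr;

  ierr = PetscLogEventRegister("IntegBatchGPU", PETSC_OBJECT_CLASSID, &integrationEvent);CHKERRQ(ierr);
  ierr = IntegrateElementBatchGPU(Ne, Ncb, Nbc, N_bl, coeffs, invJ, detJ, elemVec,
                                  integrationEvent, 1 /* debug */);CHKERRQ(ierr);
#endif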