Actual source code: ex52_integrateElement_coef.cu
petsc-3.4.5 2014-06-29
1: #include <petscsys.h>
3: #include "ex52_gpu.h"
5: #if (SPATIAL_DIM_0 == 2)
7: __device__ vecType f1_laplacian_coef(realType u[], vecType gradU[], realType kappa, int comp)
8: {
9: vecType l = {kappa * gradU[comp].x, kappa * gradU[comp].y};
10: return l;
11: }
13: __device__ vecType f1_elasticity_coef(realType u[], vecType gradU[], realType kappa, int comp)
14: {
15: vecType f1;
17: switch (comp) {
18: case 0:
19: f1.x = 0.5*(gradU[0].x + gradU[0].x);
20: f1.y = 0.5*(gradU[0].y + gradU[1].x);
21: break;
22: case 1:
23: f1.x = 0.5*(gradU[1].x + gradU[0].y);
24: f1.y = 0.5*(gradU[1].y + gradU[1].y);
25: }
26: return f1;
27: }
29: #elif (SPATIAL_DIM_0 == 3)
31: __device__ vecType f1_laplacian_coef(realType u[], vecType gradU[], realType kappa, int comp)
32: {
33: vecType l = {kappa*gradU[comp].x, kappa*gradU[comp].y, kappa*gradU[comp].z};
34: return l;
35: }
37: __device__ vecType f1_elasticity_coef(realType u[], vecType gradU[], int comp)
38: {
39: vecType f1;
41: switch (comp) {
42: case 0:
43: f1.x = 0.5*(gradU[0].x + gradU[0].x);
44: f1.y = 0.5*(gradU[0].y + gradU[1].x);
45: f1.z = 0.5*(gradU[0].z + gradU[2].x);
46: break;
47: case 1:
48: f1.x = 0.5*(gradU[1].x + gradU[0].y);
49: f1.y = 0.5*(gradU[1].y + gradU[1].y);
50: f1.z = 0.5*(gradU[1].z + gradU[2].y);
51: break;
52: case 2:
53: f1.x = 0.5*(gradU[2].x + gradU[0].z);
54: f1.y = 0.5*(gradU[2].y + gradU[1].z);
55: f1.z = 0.5*(gradU[2].z + gradU[2].z);
56: }
57: return f1;
58: }
60: #else
62: #error "Invalid spatial dimension"
64: #endif
66: // dim Number of spatial dimensions: 2
67: // N_b Number of basis functions: generated
68: // N_{bt} Number of total basis functions: N_b * N_{comp}
69: // N_q Number of quadrature points: generated
70: // N_{bs} Number of block cells LCM(N_b, N_q)
71: // N_{bst} Number of block cell components LCM(N_{bt}, N_q)
72: // N_{bl} Number of concurrent blocks generated
73: // N_t Number of threads: N_{bl} * N_{bs}
74: // N_{cbc} Number of concurrent basis cells: N_{bl} * N_q
75: // N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b
76: // N_{sbc} Number of serial basis cells: N_{bs} / N_q
77: // N_{sqc} Number of serial quadrature cells: N_{bs} / N_b
78: // N_{cb} Number of serial cell batches: input
79: // N_c Number of total cells: N_{cb}*N_{t}/N_{comp}
81: __global__ void integrateElementCoefQuadrature(int N_cb, realType *coefficients, realType *physCoefficients, realType *jacobianInverses, realType *jacobianDeterminants, realType *elemVec)
82: {
83: #include "ex52_gpu_inline.h"
84: const int dim = SPATIAL_DIM_0;
85: const int N_b = numBasisFunctions_0; // The number of basis functions
86: const int N_comp = numBasisComponents_0; // The number of basis function components
87: const int N_bt = N_b*N_comp; // The total number of scalar basis functions
88: const int N_q = numQuadraturePoints_0; // The number of quadrature points
89: const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously
90: const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl
91: const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)
92: const int N_c = N_cb * N_bc;
93: const int N_sbc = N_bst / (N_q * N_comp);
94: const int N_sqc = N_bst / N_bt;
95: /* Calculated indices */
96: const int tidx = threadIdx.x + blockDim.x*threadIdx.y;
97: const int blidx = tidx / N_bst; // Block number for this thread
98: const int bidx = tidx % N_bt; // Basis function mapped to this thread
99: const int cidx = tidx % N_comp; // Basis component mapped to this thread
100: const int qidx = tidx % N_q; // Quadrature point mapped to this thread
101: const int blbidx = tidx % N_q + blidx*N_q; // Cell mapped to this thread in the basis phase
102: const int blqidx = tidx % N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase
103: const int gidx = blockIdx.y*gridDim.x + blockIdx.x;
104: const int Goffset = gidx*N_c;
105: const int Coffset = gidx*N_c*N_bt;
106: const int Poffset = gidx*N_c*N_q;
107: const int Eoffset = gidx*N_c*N_bt;
108: /* Quadrature data */
109: realType w; // $w_q$, Quadrature weight at $x_q$
110: //__shared__ realType phi_i[N_bt*N_q]; // $\phi_i(x_q)$, Value of the basis function $i$ at $x_q$
111: __shared__ vecType phiDer_i[N_bt*N_q]; // $\frac{\partial\phi_i(x_q)}{\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$
112: /* Geometric data */
113: __shared__ realType detJ[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$
114: __shared__ realType invJ[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$
115: /* FEM data */
116: __shared__ realType u_i[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\mathcal{T}} = \sum_i u_i \phi_i$
117: /* Physical coefficient data */
118: realType kappa; // Physical coefficient $\kappa$ in the equation
119: /* Intermediate calculations */
120: //__shared__ realType f_0[N_t*N_sqc]; // $f_0(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
121: __shared__ vecType f_1[N_t*N_sqc]; // $f_1(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
122: /* Output data */
123: realType e_i; // Coefficient $e_i$ of the residual
125: /* These should be generated inline */
126: /* Load quadrature weights */
127: w = weights_0[qidx];
128: /* Load basis tabulation \phi_i for this cell */
129: if (tidx < N_bt*N_q) {
130: // phi_i[tidx] = Basis_0[tidx];
131: phiDer_i[tidx] = BasisDerivatives_0[tidx];
132: }
134: for (int batch = 0; batch < N_cb; ++batch) {
135: /* Load geometry */
136: detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];
137: for (int n = 0; n < dim*dim; ++n) {
138: const int offset = n*N_t;
139: invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];
140: }
141: /* Load coefficients u_i for this cell */
142: for (int n = 0; n < N_bt; ++n) {
143: const int offset = n*N_t;
144: u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];
145: }
146: /* Load physical coefficient for this cell */
147: kappa = physCoefficients[Poffset+batch*N_t*N_q+tidx];
149: /* Map coefficients to values at quadrature points */
150: for (int c = 0; c < N_sqc; ++c) {
151: realType u[N_comp]; // $u(x_q)$, Value of the field at $x_q$
152: vecType gradU[N_comp]; // $\nabla u(x_q)$, Value of the field gradient at $x_q$
153: // vecType x = {0.0, 0.0}; // Quadrature point $x_q$
154: const int cell = c*N_bl*N_b + blqidx;
155: const int fidx = (cell*N_q + qidx)*N_comp + cidx;
157: for (int comp = 0; comp < N_comp; ++comp) {
158: //u[comp] = 0.0;
159: #if SPATIAL_DIM_0 == 2
160: gradU[comp].x = 0.0; gradU[comp].y = 0.0;
161: #elif SPATIAL_DIM_0 == 3
162: gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;
163: #endif
164: }
165: /* Get field and derivatives at this quadrature point */
166: for (int i = 0; i < N_b; ++i) {
167: for (int comp = 0; comp < N_comp; ++comp) {
168: const int b = i*N_comp+comp;
169: const int pidx = qidx*N_bt + b;
170: const int uidx = cell*N_bt + b;
171: vecType realSpaceDer;
173: // u[comp] += u_i[uidx]*phi_i[qidx*N_bt+bbidx];
174: #if SPATIAL_DIM_0 == 2
175: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
176: gradU[comp].x += u_i[uidx]*realSpaceDer.x;
177: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
178: gradU[comp].y += u_i[uidx]*realSpaceDer.y;
179: #elif SPATIAL_DIM_0 == 3
180: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
181: gradU[comp].x += u_i[uidx]*realSpaceDer.x;
182: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
183: gradU[comp].y += u_i[uidx]*realSpaceDer.y;
184: realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
185: gradU[comp].z += u_i[uidx]*realSpaceDer.z;
186: #endif
187: }
188: }
189: /* Process values at quadrature points */
190: f_1[fidx] = f1_coef_func(u, gradU, kappa, cidx);
191: #if SPATIAL_DIM_0 == 2
192: f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;
193: #elif SPATIAL_DIM_0 == 3
194: f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;
195: #endif
196: }
198: /* ==== TRANSPOSE THREADS ==== */
199: __syncthreads();
201: /* Map values at quadrature points to coefficients */
202: for (int c = 0; c < N_sbc; ++c) {
203: const int cell = c*N_bl*N_q + blbidx;
205: e_i = 0.0;
206: for (int q = 0; q < N_q; ++q) {
207: const int pidx = q*N_bt + bidx;
208: const int fidx = (cell*N_q + q)*N_comp + cidx;
209: vecType realSpaceDer;
211: // e_i += phi_i[pidx]*f_0[fidx];
212: #if SPATIAL_DIM_0 == 2
213: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
214: e_i += realSpaceDer.x*f_1[fidx].x;
215: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
216: e_i += realSpaceDer.y*f_1[fidx].y;
217: #elif SPATIAL_DIM_0 == 3
218: realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
219: e_i += realSpaceDer.x*f_1[fidx].x;
220: realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
221: e_i += realSpaceDer.y*f_1[fidx].y;
222: realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
223: e_i += realSpaceDer.z*f_1[fidx].z;
224: #endif
225: }
226: #if 0
227: // Check f_1
228: {
229: const int q = 0;
230: const int i = bidx/N_comp;
231: // Prints f1[0].x, f1[1].x, f1[0].y, f1[1].y
232: switch (i) {
233: case 0:
234: e_i = f_1[(cell*N_q+q)*N_comp+cidx].x;break;
235: case 1:
236: e_i = f_1[(cell*N_q+q)*N_comp+cidx].y;break;
237: //case 2:
238: //e_i = f_1[(cell*N_q+q)*N_comp+cidx].z;break;
239: default:
240: e_i = 0.0;
241: }
242: }
243: // Check that u_i is being used correctly
244: //e_i = u_i[cell*N_bt+bidx];
245: e_i = detJ[cell];
246: //e_i = coefficients[Coffset+(batch*N_sbc+c)*N_t+tidx];
247: //e_i = Coffset+(batch*N_sbc+c)*N_t+tidx;
248: //e_i = cell*N_bt+bidx;
249: #endif
250: /* Write element vector for N_{cbc} cells at a time */
251: elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;
252: }
253: /* ==== Could do one write per batch ==== */
254: }
255: return;
256: }
258: // Calculate a conforming thread grid for N kernels
261: PetscErrorCode calculateGridCoef(const int N, const int blockSize, unsigned int& x, unsigned int& y, unsigned int& z)
262: {
264: z = 1;
265: if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
266: const int Nblocks = N/blockSize;
267: for (x = (int) (sqrt(Nblocks) + 0.5); x > 0; --x) {
268: y = Nblocks/x;
269: if (x*y == Nblocks) break;
270: }
271: if (x*y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
272: return(0);
273: }
277: /*
278: IntegrateElementCoefBatchGPU - Produces element vectors from input element solution and geometric information via quadrature
280: Input Parameters:
281: + Ne - The total number of cells, Nchunk * Ncb * Nbc
282: . Ncb - The number of serial cell batches
283: . Nbc - The number of cells per batch
284: . Nbl - The number of concurrent cells blocks per thread block
285: . coefficients - An array of the solution vector for each cell
286: . physCoefficients - An array of the physical coefficient values at quadrature points for each cell
287: . jacobianInverses - An array of the inverse Jacobian for each cell
288: . jacobianDeterminants - An array of the Jacobian determinant for each cell
289: . event - A PetscEvent, used to log flops
290: - debug - A flag for debugging information
292: Output Parameter:
293: . elemVec - An array of the element vectors for each cell
294: */
295: PETSC_EXTERN PetscErrorCode IntegrateElementCoefBatchGPU(PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt Nbl, const PetscScalar coefficients[], const PetscScalar physCoefficients[],
296: const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
297: PetscLogEvent event, PetscInt debug)
298: {
299: #include "ex52_gpu_inline.h"
300: const int dim = SPATIAL_DIM_0;
301: const int N_b = numBasisFunctions_0; // The number of basis functions
302: const int N_comp = numBasisComponents_0; // The number of basis function components
303: const int N_bt = N_b*N_comp; // The total number of scalar basis functions
304: const int N_q = numQuadraturePoints_0; // The number of quadrature points
305: const int N_bst = N_bt*N_q; // The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously
306: const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl
308: realType *d_coefficients;
309: realType *d_physCoefficients;
310: realType *d_jacobianInverses;
311: realType *d_jacobianDeterminants;
312: realType *d_elemVec;
316: if (Nbl != N_bl) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsisten block size %d should be %d", Nbl, N_bl);
317: if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
318: if (!Ne) {
319: PetscStageLog stageLog;
320: PetscEventPerfLog eventLog = NULL;
321: PetscInt stage;
323: PetscLogGetStageLog(&stageLog);
324: PetscStageLogGetCurrent(stageLog, &stage);
325: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
326: /* Log performance info */
327: eventLog->eventInfo[event].count++;
328: eventLog->eventInfo[event].time += 0.0;
329: eventLog->eventInfo[event].flops += 0;
330: return(0);
331: }
332: // Marshalling
333: cudaMalloc((void**) &d_coefficients, Ne*N_bt * sizeof(realType));
334: cudaMalloc((void**) &d_physCoefficients, Ne*N_bt * sizeof(realType));
335: cudaMalloc((void**) &d_jacobianInverses, Ne*dim*dim * sizeof(realType));
336: cudaMalloc((void**) &d_jacobianDeterminants, Ne * sizeof(realType));
337: cudaMalloc((void**) &d_elemVec, Ne*N_bt * sizeof(realType));
338: if (sizeof(PetscReal) == sizeof(realType)) {
339: cudaMemcpy(d_coefficients, coefficients, Ne*N_bt * sizeof(realType), cudaMemcpyHostToDevice);
340: cudaMemcpy(d_physCoefficients, physCoefficients, Ne*N_q * sizeof(realType), cudaMemcpyHostToDevice);
341: cudaMemcpy(d_jacobianInverses, jacobianInverses, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
342: cudaMemcpy(d_jacobianDeterminants, jacobianDeterminants, Ne * sizeof(realType), cudaMemcpyHostToDevice);
343: } else {
344: realType *c, *pc, *jI, *jD;
345: PetscInt i;
347: PetscMalloc4(Ne*N_bt,realType,&c,Ne*N_q,realType,&pc,Ne*dim*dim,realType,&jI,Ne,realType,&jD);
348: for (i = 0; i < Ne*N_bt; ++i) c[i] = coefficients[i];
349: for (i = 0; i < Ne*N_q; ++i) pc[i] = physCoefficients[i];
350: for (i = 0; i < Ne*dim*dim; ++i) jI[i] = jacobianInverses[i];
351: for (i = 0; i < Ne; ++i) jD[i] = jacobianDeterminants[i];
352: cudaMemcpy(d_coefficients, c, Ne*N_bt * sizeof(realType), cudaMemcpyHostToDevice);
353: cudaMemcpy(d_physCoefficients, pc, Ne*N_q * sizeof(realType), cudaMemcpyHostToDevice);
354: cudaMemcpy(d_jacobianInverses, jI, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
355: cudaMemcpy(d_jacobianDeterminants, jD, Ne * sizeof(realType), cudaMemcpyHostToDevice);
356: PetscFree4(c,pc,jI,jD);
357: }
358: // Kernel launch
359: unsigned int x, y, z;
360: calculateGridCoef(Ne, Ncb*Nbc, x, y, z);
361: dim3 grid(x, y, z);
362: dim3 block(Nbc * N_comp, 1, 1);
363: cudaEvent_t start, stop;
364: float msElapsedTime;
366: cudaEventCreate(&start);
367: cudaEventCreate(&stop);
368: // if (debug) {
369: PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
370: grid.x, grid.y, grid.z, block.x, block.y, block.z, Ncb);
371: PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
372: // }
373: cudaEventRecord(start, 0);
374: integrateElementCoefQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_physCoefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec);
375: cudaEventRecord(stop, 0);
376: cudaEventSynchronize(stop);
377: cudaEventElapsedTime(&msElapsedTime, start, stop);
378: cudaEventDestroy(start);
379: cudaEventDestroy(stop);
380: // Marshalling
381: if (sizeof(PetscReal) == sizeof(realType)) {
382: cudaMemcpy(elemVec, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
383: } else {
384: realType *eV;
385: PetscInt i;
387: PetscMalloc(Ne*N_bt * sizeof(realType), &eV);
388: cudaMemcpy(eV, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
389: for (i = 0; i < Ne*N_bt; ++i) elemVec[i] = eV[i];
390: PetscFree(eV);
391: }
392: cudaFree(d_coefficients);
393: cudaFree(d_jacobianInverses);
394: cudaFree(d_jacobianDeterminants);
395: cudaFree(d_elemVec);
396: {
397: PetscStageLog stageLog;
398: PetscEventPerfLog eventLog = NULL;
399: PetscInt stage;
401: PetscLogGetStageLog(&stageLog);
402: PetscStageLogGetCurrent(stageLog, &stage);
403: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
404: /* Log performance info */
405: eventLog->eventInfo[event].count++;
406: eventLog->eventInfo[event].time += msElapsedTime*1.0e-3;
407: eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
408: }
409: return(0);
410: }