Actual source code: ex52_integrateElement_coef.cu

petsc-3.4.5 2014-06-29
#include <petscsys.h>

#include "ex52_gpu.h"

#if (SPATIAL_DIM_0 == 2)
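/* The f1_* functions below are the pointwise kernels of the weak form: the
   quadrature routine contracts them against test-function gradients, giving the
   residual coefficient $e_i = \sum_q \nabla\phi_i(x_q) \cdot f_1(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$.
   For the Laplacian, $f_1 = \kappa \nabla u$; for elasticity, `comp` selects a row of
   the symmetric gradient $\frac{1}{2}(\nabla u + \nabla u^T)$ (kappa is accepted but unused). */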

__device__ vecType f1_laplacian_coef(realType u[], vecType gradU[], realType kappa, int comp)
{
  vecType l = {kappa * gradU[comp].x, kappa * gradU[comp].y};
  return l;
}

__device__ vecType f1_elasticity_coef(realType u[], vecType gradU[], realType kappa, int comp)
{
  vecType f1;

  switch (comp) {
  case 0:
    f1.x = 0.5*(gradU[0].x + gradU[0].x);
    f1.y = 0.5*(gradU[0].y + gradU[1].x);
    break;
  case 1:
    f1.x = 0.5*(gradU[1].x + gradU[0].y);
    f1.y = 0.5*(gradU[1].y + gradU[1].y);
  }
  return f1;
}

#elif (SPATIAL_DIM_0 == 3)

__device__ vecType f1_laplacian_coef(realType u[], vecType gradU[], realType kappa, int comp)
{
  vecType l = {kappa*gradU[comp].x, kappa*gradU[comp].y, kappa*gradU[comp].z};
  return l;
}

__device__ vecType f1_elasticity_coef(realType u[], vecType gradU[], realType kappa, int comp)
{
  vecType f1;

  switch (comp) {
  case 0:
    f1.x = 0.5*(gradU[0].x + gradU[0].x);
    f1.y = 0.5*(gradU[0].y + gradU[1].x);
    f1.z = 0.5*(gradU[0].z + gradU[2].x);
    break;
  case 1:
    f1.x = 0.5*(gradU[1].x + gradU[0].y);
    f1.y = 0.5*(gradU[1].y + gradU[1].y);
    f1.z = 0.5*(gradU[1].z + gradU[2].y);
    break;
  case 2:
    f1.x = 0.5*(gradU[2].x + gradU[0].z);
    f1.y = 0.5*(gradU[2].y + gradU[1].z);
    f1.z = 0.5*(gradU[2].z + gradU[2].z);
  }
  return f1;
}

#else

#error "Invalid spatial dimension"

#endif

// dim     Number of spatial dimensions:          SPATIAL_DIM_0 (2 or 3)
// N_b     Number of basis functions:             generated
// N_{bt}  Number of total basis functions:       N_b * N_{comp}
// N_q     Number of quadrature points:           generated
// N_{bs}  Number of block cells:                 LCM(N_b, N_q)
// N_{bst} Number of block cell components:       LCM(N_{bt}, N_q)
// N_{bl}  Number of concurrent blocks:           generated
// N_t     Number of threads:                     N_{bl} * N_{bs}
// N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q
// N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b
// N_{sbc} Number of serial     basis      cells: N_{bs} / N_q
// N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b
// N_{cb}  Number of serial cell batches:         input
// N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp}
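//
// Worked example (illustrative values only): for a 2D P_1 scalar element with
// N_b = 3, N_comp = 1, N_q = 1, and N_bl = 1, the kernel below computes
// N_bt = 3, N_bst = 3, N_t = 3, N_bc = 3, N_sbc = 3, and N_sqc = 1, i.e. each
// thread block integrates 3 cells per batch with one thread per scalar basis function.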

__global__ void integrateElementCoefQuadrature(int N_cb, realType *coefficients, realType *physCoefficients, realType *jacobianInverses, realType *jacobianDeterminants, realType *elemVec)
{
  #include "ex52_gpu_inline.h"
  const int dim    = SPATIAL_DIM_0;
  const int N_b    = numBasisFunctions_0;           // The number of basis functions
  const int N_comp = numBasisComponents_0;          // The number of basis function components
  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions
  const int N_q    = numQuadraturePoints_0;         // The number of quadrature points
  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q); note that a block is not processed simultaneously
  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl
  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)
  const int N_c    = N_cb * N_bc;                   // The total number of cells handled by this thread block
  const int N_sbc  = N_bst / (N_q * N_comp);        // The number of serial basis cells
  const int N_sqc  = N_bst / N_bt;                  // The number of serial quadrature cells
  /* Calculated indices */
  const int tidx    = threadIdx.x + blockDim.x*threadIdx.y;
  const int blidx   = tidx / N_bst;                  // Block number for this thread
  const int bidx    = tidx % N_bt;                   // Basis function mapped to this thread
  const int cidx    = tidx % N_comp;                 // Basis component mapped to this thread
  const int qidx    = tidx % N_q;                    // Quadrature point mapped to this thread
  const int blbidx  = tidx % N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase
  const int blqidx  = tidx % N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase
  const int gidx    = blockIdx.y*gridDim.x + blockIdx.x;
  const int Goffset = gidx*N_c;
  const int Coffset = gidx*N_c*N_bt;
  const int Poffset = gidx*N_c*N_q;
  const int Eoffset = gidx*N_c*N_bt;
  /* Quadrature data */
  realType             w;                   // $w_q$, Quadrature weight at $x_q$
//__shared__ realType  phi_i[N_bt*N_q];     // $\phi_i(x_q)$, Value of the basis function $i$ at $x_q$
  __shared__ vecType   phiDer_i[N_bt*N_q];  // $\frac{\partial\phi_i(x_q)}{\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$
  /* Geometric data */
  __shared__ realType  detJ[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$
  __shared__ realType  invJ[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$
  /* FEM data */
  __shared__ realType  u_i[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\mathcal{T}} = \sum_i u_i \phi_i$
  /* Physical coefficient data */
  realType             kappa;               // Physical coefficient $\kappa$ in the equation
  /* Intermediate calculations */
//__shared__ realType  f_0[N_t*N_sqc];      // $f_0(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
  __shared__ vecType   f_1[N_t*N_sqc];      // $f_1(u(x_q), \nabla u(x_q)) |J(x_q)| w_q$
  /* Output data */
  realType             e_i;                 // Coefficient $e_i$ of the residual

  /* These should be generated inline */
  /* Load quadrature weights */
  w = weights_0[qidx];
  /* Load basis tabulation \phi_i for this cell */
  if (tidx < N_bt*N_q) {
 // phi_i[tidx]    = Basis_0[tidx];
    phiDer_i[tidx] = BasisDerivatives_0[tidx];
  }

  for (int batch = 0; batch < N_cb; ++batch) {
    /* Load geometry */
    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];
    for (int n = 0; n < dim*dim; ++n) {
      const int offset = n*N_t;
      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];
    }
    /* Load coefficients u_i for this cell */
    for (int n = 0; n < N_bt; ++n) {
      const int offset = n*N_t;
      u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];
    }
    /* Load physical coefficient for this cell */
    kappa = physCoefficients[Poffset+batch*N_t*N_q+tidx];

    /* Map coefficients to values at quadrature points */
    for (int c = 0; c < N_sqc; ++c) {
      realType u[N_comp];      // $u(x_q)$, Value of the field at $x_q$
      vecType  gradU[N_comp];  // $\nabla u(x_q)$, Value of the field gradient at $x_q$
      // vecType   x             = {0.0, 0.0};           // Quadrature point $x_q$
      const int cell = c*N_bl*N_b + blqidx;
      const int fidx = (cell*N_q + qidx)*N_comp + cidx;

      for (int comp = 0; comp < N_comp; ++comp) {
        //u[comp] = 0.0;
#if SPATIAL_DIM_0 == 2
        gradU[comp].x = 0.0; gradU[comp].y = 0.0;
#elif  SPATIAL_DIM_0 == 3
        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;
#endif
      }
      /* Get field and derivatives at this quadrature point */
      for (int i = 0; i < N_b; ++i) {
        for (int comp = 0; comp < N_comp; ++comp) {
          const int b    = i*N_comp+comp;
          const int pidx = qidx*N_bt + b;
          const int uidx = cell*N_bt + b;
          vecType   realSpaceDer;

          // u[comp] += u_i[uidx]*phi_i[qidx*N_bt+bbidx];
          /* Transform reference derivatives to real space: $\nabla\phi = J^{-T} \hat\nabla\hat\phi$ */
#if SPATIAL_DIM_0 == 2
          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
          gradU[comp].x += u_i[uidx]*realSpaceDer.x;
          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
          gradU[comp].y += u_i[uidx]*realSpaceDer.y;
#elif  SPATIAL_DIM_0 == 3
          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
          gradU[comp].x += u_i[uidx]*realSpaceDer.x;
          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
          gradU[comp].y += u_i[uidx]*realSpaceDer.y;
          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
          gradU[comp].z += u_i[uidx]*realSpaceDer.z;
#endif
        }
      }
      /* Process values at quadrature points */
      f_1[fidx] = f1_coef_func(u, gradU, kappa, cidx);
#if SPATIAL_DIM_0 == 2
      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;
#elif  SPATIAL_DIM_0 == 3
      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;
#endif
    }

    /* ==== TRANSPOSE THREADS ==== */
    __syncthreads();
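    /* Threads now change roles: above, each thread owned one quadrature point of
       a cell (blqidx); below, it owns one basis function of a cell (blbidx). The
       barrier makes every f_1 value written in the quadrature phase visible
       before it is read in the basis phase. */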

    /* Map values at quadrature points to coefficients */
    for (int c = 0; c < N_sbc; ++c) {
      const int cell = c*N_bl*N_q + blbidx;

      e_i = 0.0;
      for (int q = 0; q < N_q; ++q) {
        const int pidx = q*N_bt + bidx;
        const int fidx = (cell*N_q + q)*N_comp + cidx;
        vecType   realSpaceDer;

        // e_i += phi_i[pidx]*f_0[fidx];
#if SPATIAL_DIM_0 == 2
        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;
        e_i           += realSpaceDer.x*f_1[fidx].x;
        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;
        e_i           += realSpaceDer.y*f_1[fidx].y;
#elif  SPATIAL_DIM_0 == 3
        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.x*f_1[fidx].x;
        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.y*f_1[fidx].y;
        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;
        e_i           += realSpaceDer.z*f_1[fidx].z;
#endif
      }
#if 0
      // Check f_1
      {
        const int q = 0;
        const int i = bidx/N_comp;
        // Prints f1[0].x, f1[1].x, f1[0].y, f1[1].y
        switch (i) {
        case 0:
          e_i = f_1[(cell*N_q+q)*N_comp+cidx].x; break;
        case 1:
          e_i = f_1[(cell*N_q+q)*N_comp+cidx].y; break;
        //case 2:
        //  e_i = f_1[(cell*N_q+q)*N_comp+cidx].z; break;
        default:
          e_i = 0.0;
        }
      }
      // Check that u_i is being used correctly
      //e_i = u_i[cell*N_bt+bidx];
      e_i = detJ[cell];
      //e_i = coefficients[Coffset+(batch*N_sbc+c)*N_t+tidx];
      //e_i = Coffset+(batch*N_sbc+c)*N_t+tidx;
      //e_i = cell*N_bt+bidx;
#endif
      /* Write element vector for N_{cbc} cells at a time */
      elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;
    }
    /* ==== Could do one write per batch ==== */
  }
  return;
}

// Calculate a conforming thread grid for N kernels
PetscErrorCode calculateGridCoef(const int N, const int blockSize, unsigned int& x, unsigned int& y, unsigned int& z)
{
  z = 1;
  if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
  const int Nblocks = N/blockSize;
  for (x = (int) (sqrt(Nblocks) + 0.5); x > 0; --x) {
    y = Nblocks/x;
    if (x*y == Nblocks) break;
  }
  if (x*y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
  return(0);
}
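/* Example (illustrative): N = 1200 elements with blockSize = 12 gives
   Nblocks = 100; the search starts at x = (int)(sqrt(100) + 0.5) = 10 and
   immediately finds y = 10, so the launch grid is 10 x 10 x 1. Starting the
   search near sqrt(Nblocks) keeps the grid as close to square as possible. */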

/*
  IntegrateElementCoefBatchGPU - Produces element vectors from input element solution and geometric information via quadrature

  Input Parameters:
+ Ne - The total number of cells, Nchunk * Ncb * Nbc
. Ncb - The number of serial cell batches
. Nbc - The number of cells per batch
. Nbl - The number of concurrent cell blocks per thread block
. coefficients - An array of the solution vector for each cell
. physCoefficients - An array of the physical coefficient values at quadrature points for each cell
. jacobianInverses - An array of the inverse Jacobian for each cell
. jacobianDeterminants - An array of the Jacobian determinant for each cell
. event - A PetscLogEvent, used to log flops
- debug - A flag for debugging information

  Output Parameter:
. elemVec - An array of the element vectors for each cell
*/
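/* Illustrative call (hypothetical variable names; the generated header fixes
   N_b, N_q, N_comp, and N_bl, and the checks below require Nbc*N_comp == N_t):

     ierr = IntegrateElementCoefBatchGPU(Ne, Ncb, Nbc, Nbl, u, kappa, invJ, detJ,
                                         elemVec, integrateEvent, 0);CHKERRQ(ierr);
*/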
PETSC_EXTERN PetscErrorCode IntegrateElementCoefBatchGPU(PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt Nbl, const PetscScalar coefficients[], const PetscScalar physCoefficients[],
                                            const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
                                            PetscLogEvent event, PetscInt debug)
{
  #include "ex52_gpu_inline.h"
  const int dim    = SPATIAL_DIM_0;
  const int N_b    = numBasisFunctions_0;   // The number of basis functions
  const int N_comp = numBasisComponents_0;  // The number of basis function components
  const int N_bt   = N_b*N_comp;            // The total number of scalar basis functions
  const int N_q    = numQuadraturePoints_0; // The number of quadrature points
  const int N_bst  = N_bt*N_q;              // The block size, LCM(N_bt, N_q); note that a block is not processed simultaneously
  const int N_t    = N_bst*N_bl;            // The number of threads, N_bst * N_bl

  realType       *d_coefficients;
  realType       *d_physCoefficients;
  realType       *d_jacobianInverses;
  realType       *d_jacobianDeterminants;
  realType       *d_elemVec;

  if (Nbl != N_bl) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsistent block size %d should be %d", Nbl, N_bl);
  if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
  if (!Ne) {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    PetscLogGetStageLog(&stageLog);
    PetscStageLogGetCurrent(stageLog, &stage);
    PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += 0.0;
    eventLog->eventInfo[event].flops += 0;
    return(0);
  }
  // Marshalling
  cudaMalloc((void**) &d_coefficients,         Ne*N_bt * sizeof(realType));
  cudaMalloc((void**) &d_physCoefficients,     Ne*N_q * sizeof(realType));
  cudaMalloc((void**) &d_jacobianInverses,     Ne*dim*dim * sizeof(realType));
  cudaMalloc((void**) &d_jacobianDeterminants, Ne * sizeof(realType));
  cudaMalloc((void**) &d_elemVec,              Ne*N_bt * sizeof(realType));
  if (sizeof(PetscReal) == sizeof(realType)) {
    cudaMemcpy(d_coefficients,         coefficients,         Ne*N_bt    * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_physCoefficients,     physCoefficients,     Ne*N_q     * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianInverses,     jacobianInverses,     Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianDeterminants, jacobianDeterminants, Ne         * sizeof(realType), cudaMemcpyHostToDevice);
  } else {
    realType *c, *pc, *jI, *jD;
    PetscInt i;

    PetscMalloc4(Ne*N_bt,realType,&c,Ne*N_q,realType,&pc,Ne*dim*dim,realType,&jI,Ne,realType,&jD);
    for (i = 0; i < Ne*N_bt;    ++i) c[i]  = coefficients[i];
    for (i = 0; i < Ne*N_q;     ++i) pc[i] = physCoefficients[i];
    for (i = 0; i < Ne*dim*dim; ++i) jI[i] = jacobianInverses[i];
    for (i = 0; i < Ne;         ++i) jD[i] = jacobianDeterminants[i];
    cudaMemcpy(d_coefficients,         c,  Ne*N_bt    * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_physCoefficients,     pc, Ne*N_q     * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianInverses,     jI, Ne*dim*dim * sizeof(realType), cudaMemcpyHostToDevice);
    cudaMemcpy(d_jacobianDeterminants, jD, Ne         * sizeof(realType), cudaMemcpyHostToDevice);
    PetscFree4(c,pc,jI,jD);
  }
  // Kernel launch
  unsigned int x, y, z;
  calculateGridCoef(Ne, Ncb*Nbc, x, y, z);
  dim3 grid(x, y, z);
  dim3 block(Nbc * N_comp, 1, 1);
  cudaEvent_t start, stop;
  float       msElapsedTime;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  if (debug) {
    PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
                grid.x, grid.y, grid.z, block.x, block.y, block.z, Ncb);
    PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
  }
  cudaEventRecord(start, 0);
  integrateElementCoefQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_physCoefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&msElapsedTime, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  // Marshalling
  if (sizeof(PetscReal) == sizeof(realType)) {
    cudaMemcpy(elemVec, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
  } else {
    realType *eV;
    PetscInt i;

    PetscMalloc(Ne*N_bt * sizeof(realType), &eV);
    cudaMemcpy(eV, d_elemVec, Ne*N_bt * sizeof(realType), cudaMemcpyDeviceToHost);
    for (i = 0; i < Ne*N_bt; ++i) elemVec[i] = eV[i];
    PetscFree(eV);
  }
  cudaFree(d_coefficients);
  cudaFree(d_physCoefficients);
  cudaFree(d_jacobianInverses);
  cudaFree(d_jacobianDeterminants);
  cudaFree(d_elemVec);
  {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    PetscLogGetStageLog(&stageLog);
    PetscStageLogGetCurrent(stageLog, &stage);
    PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += msElapsedTime*1.0e-3;
    eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
  }
  return(0);
}
410: }