Actual source code: ex4.c

petsc-3.3-p7 2013-05-11
static char help[] = "Test MatSetValuesBatch: setting batches of elements using the GPU.\n\
This works with SeqAIJCUSP and MPIAIJCUSP matrices.\n\n";

#include <petscdmda.h>
#include <petscksp.h>

/* We will use a structured mesh for this assembly test. Each square will be divided into two triangles:
  C       D
   _______
  |\      | The matrix for 0 and 1 is /   1  -0.5 -0.5 \
  | \   1 |                           | -0.5  0.5  0.0 |
  |  \    |                           \ -0.5  0.0  0.5 /
  |   \   |
  |    \  |
  |  0  \ |
  |      \|
  ---------
  A       B
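
  Where this matrix comes from (a sketch, assuming standard piecewise-linear (P1) elements on the
  reference triangle A=(0,0), B=(1,0), C=(0,1); the element type is not stated explicitly here):

    phi_A = 1 - x - y  =>  grad phi_A = (-1,-1)
    phi_B = x          =>  grad phi_B = ( 1, 0)
    phi_C = y          =>  grad phi_C = ( 0, 1)

  With |T| = 1/2 and constant gradients, K_ij = \int_T grad phi_i . grad phi_j
                                              = |T| (grad phi_i . grad phi_j), so

    K_AA = 1/2 * 2 = 1,  K_AB = K_AC = -1/2,  K_BB = K_CC = 1/2,  K_BC = 0,

  which reproduces the element matrix above.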

TO ADD:
  DONE 1) Build and run on baconost
    - Gather data for CPU/GPU up to da_grid_x 1300
      - Looks about 6x faster than the CPU
    - Make plot

  DONE 2) Solve the Neumann Poisson problem

  3) Multi-GPU Assembly
    - MPIAIJCUSP: Just have two SeqAIJCUSP matrices, nothing else special
    a) Filter rows to be sent to other procs (normally stashed)
    b) Send/recv rows; might as well do it with a VecScatter
    c) Potential to overlap this communication with the GPU (talk to Nathan)
    c') Or just shove these rows in after the local ones
    d) Have an implicit COO representation from repeated/tiled_range
    e) Do a filtered copy, decrementing rows and remapping columns, which splits into two sets
    f) Make two COO matrices and do separate aggregation on each one

  4) Solve the Neumann Poisson problem in parallel
    - Try it on the GPU machine at Brown (they need another GNU install)

  5) GPU FEM integration
    - Move launch code to PETSc, or try again now that assembly is in PETSc
    - Move build code to PETSc

  6) Try out CUSP PCs
*/
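
/* Example runs (a sketch only: -gpu, -cpu, -view, and -solve are the options read in main() below;
   -da_grid_x, -da_grid_y, and -ksp_monitor are standard PETSc options; the GPU path assumes a
   CUSP-enabled PETSc build):

     ./ex4 -da_grid_x 10 -da_grid_y 10                assemble on GPU and CPU, view both matrices
     ./ex4 -da_grid_x 100 -da_grid_y 100 -gpu 0 -view 0 -solve -ksp_monitor
                                                      CPU assembly only, then solve with a random rhs
*/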

PetscErrorCode IntegrateCells(DM dm, PetscInt *Ne, PetscInt *Nl, PetscInt **elemRows, PetscScalar **elemMats) {
  DMDALocalInfo  info;
  PetscInt      *er;
  PetscScalar   *em;
  PetscInt       X, Y, dof;
  PetscInt       nl, nxe, nye, ne;
  PetscInt       k  = 0, m  = 0;
  PetscInt       i, j;
  PetscLogEvent  integrationEvent;

  PetscLogEventRegister("ElemIntegration", DM_CLASSID, &integrationEvent);
  PetscLogEventBegin(integrationEvent,0,0,0,0);
  DMDAGetInfo(dm, 0, &X, &Y,0,0,0,0, &dof,0,0,0,0,0);
  DMDAGetLocalInfo(dm, &info);
  nl   = dof*3;
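  /* Each grid square is split into two triangles and is assigned to the process that owns its
     lower-left vertex, so a process owning the last vertex row/column of the global grid does
     not start new squares there. */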
  nxe  = info.xm; if (info.xs+info.xm == X) nxe--;
  nye  = info.ym; if (info.ys+info.ym == Y) nye--;
  ne   = 2 * nxe * nye;
  *Ne  = ne;
  *Nl  = nl;
  PetscMalloc2(ne*nl, PetscInt, elemRows, ne*nl*nl, PetscScalar, elemMats);
  er   = *elemRows;
  em   = *elemMats;
  /* Example decomposition of the default 3x3 vertex grid on two processes:
       Proc 0        Proc 1
       xs: 0  xm: 3  xs: 0 xm: 3
       ys: 0  ym: 2  ys: 2 ym: 1
     8 elements x 3 vertices = 24 element matrix rows and 72 entries
        6 offproc rows containing 18 element matrix entries
       18 onproc rows containing 54 element matrix entries
        3 offproc columns in 8 element matrix entries
     so we should have 46 diagonal matrix entries */
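  /* Packing convention for the MatSetValuesBatch() calls in main(): elemRows holds the nl row
     indices of every element back to back (ne*nl entries), and elemMats holds the matching dense
     nl x nl element matrices back to back, each stored by rows (ne*nl*nl entries). */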
  for(j = info.ys; j < info.ys+nye; ++j) {
    for(i = info.xs; i < info.xs+nxe; ++i) {
      PetscInt rowA = j*X     + i, rowB = j*X     + i+1;
      PetscInt rowC = (j+1)*X + i, rowD = (j+1)*X + i+1;

      /* Lower triangle */
      er[k+0] = rowA; em[m+0*nl+0] =  1.0; em[m+0*nl+1] = -0.5; em[m+0*nl+2] = -0.5;
      er[k+1] = rowB; em[m+1*nl+0] = -0.5; em[m+1*nl+1] =  0.5; em[m+1*nl+2] =  0.0;
      er[k+2] = rowC; em[m+2*nl+0] = -0.5; em[m+2*nl+1] =  0.0; em[m+2*nl+2] =  0.5;
      k += nl; m += nl*nl;
      /* Upper triangle */
      er[k+0] = rowD; em[m+0*nl+0] =  1.0; em[m+0*nl+1] = -0.5; em[m+0*nl+2] = -0.5;
      er[k+1] = rowC; em[m+1*nl+0] = -0.5; em[m+1*nl+1] =  0.5; em[m+1*nl+2] =  0.0;
      er[k+2] = rowB; em[m+2*nl+0] = -0.5; em[m+2*nl+1] =  0.0; em[m+2*nl+2] =  0.5;
      k += nl; m += nl*nl;
    }
  }
  PetscLogEventEnd(integrationEvent,0,0,0,0);
  return(0);
}

int main(int argc, char **argv)
{
  KSP            ksp;
  MatNullSpace   nullsp;
  DM             dm;
  Mat            A = PETSC_NULL; /* initialize so the final MatDestroy() is safe even if both assembly paths are disabled */
  Vec            x, b;
  PetscViewer    viewer;
  PetscInt       Nl, Ne;
  PetscInt      *elemRows;
  PetscScalar   *elemMats;
  PetscBool      doGPU = PETSC_TRUE, doCPU = PETSC_TRUE, doSolve = PETSC_FALSE, doView = PETSC_TRUE;
  PetscLogStage  gpuStage, cpuStage;

  PetscInitialize(&argc, &argv, 0, help);
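  /* A negative grid size (-3 here) means the absolute value is the default and the user can
     override it at runtime with -da_grid_x/-da_grid_y. */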
  DMDACreate2d(PETSC_COMM_WORLD, DMDA_BOUNDARY_NONE, DMDA_BOUNDARY_NONE, DMDA_STENCIL_BOX, -3, -3, PETSC_DECIDE, PETSC_DECIDE, 1, 1, PETSC_NULL, PETSC_NULL, &dm);
  IntegrateCells(dm, &Ne, &Nl, &elemRows, &elemMats);
  PetscOptionsGetBool(PETSC_NULL, "-view", &doView, PETSC_NULL);
  /* Construct matrix using GPU */
  PetscOptionsGetBool(PETSC_NULL, "-gpu", &doGPU, PETSC_NULL);
  if (doGPU) {
    PetscLogStageRegister("GPU Stage", &gpuStage);
    PetscLogStagePush(gpuStage);
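    /* GPU path: take the AIJ matrix from the DM, retype it as AIJCUSP, and assemble it in one
       shot with MatSetValuesBatch(). The zero preallocation calls below presumably just mark the
       retyped matrix as preallocated, since the batched values are aggregated on the GPU rather
       than stashed row by row on the host (this rationale is an assumption, not stated here). */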
    DMCreateMatrix(dm, MATAIJ, &A);
    MatSetType(A, MATAIJCUSP);
    MatSeqAIJSetPreallocation(A, 0, PETSC_NULL);
    MatMPIAIJSetPreallocation(A, 0, PETSC_NULL, 0, PETSC_NULL);
    MatSetValuesBatch(A, Ne, Nl, elemRows, elemMats);
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
    if (doView) {
      PetscViewerASCIIOpen(PETSC_COMM_WORLD, PETSC_NULL, &viewer);
      if (Ne > 500) {PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_INFO);}
      MatView(A, viewer);
      PetscViewerDestroy(&viewer);
    }
    PetscLogStagePop();
    MatDestroy(&A);
  }
  /* Construct matrix using CPU */
  PetscOptionsGetBool(PETSC_NULL, "-cpu", &doCPU, PETSC_NULL);
  if (doCPU) {
    PetscLogStageRegister("CPU Stage", &cpuStage);
    PetscLogStagePush(cpuStage);
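    /* CPU path: the DM-created AIJ matrix keeps its preallocation and the same batched element
       data is inserted with MatSetValuesBatch(), which for plain AIJ presumably reduces to
       ordinary MatSetValues() insertion (an assumption; the fallback is not shown in this file). */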
    DMCreateMatrix(dm, MATAIJ, &A);
    MatZeroEntries(A);
    MatSetValuesBatch(A, Ne, Nl, elemRows, elemMats);
    MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
    MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
    if (doView) {
      PetscViewerASCIIOpen(PETSC_COMM_WORLD, PETSC_NULL, &viewer);
      if (Ne > 500) {PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_INFO);}
      MatView(A, viewer);
      PetscViewerDestroy(&viewer);
    }
    PetscLogStagePop();
  }
  /* Solve a simple system with a random rhs; this uses the CPU-assembled matrix, since the GPU
     path destroys its matrix above */
  PetscOptionsGetBool(PETSC_NULL, "-solve", &doSolve, PETSC_NULL);
  if (doSolve) {
    MatGetVecs(A, &x, &b);
    VecSetRandom(b, PETSC_NULL);
    KSPCreate(PETSC_COMM_WORLD, &ksp);
    KSPSetOperators(ksp, A, A, DIFFERENT_NONZERO_PATTERN);
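    /* The assembled operator is a pure Neumann Laplacian, so the constant vector is in its null
       space; attach that null space so the Krylov method can solve the singular system. */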
    MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, PETSC_NULL, &nullsp);
    KSPSetNullSpace(ksp, nullsp);
    MatNullSpaceDestroy(&nullsp);
    KSPSetFromOptions(ksp);
    KSPSolve(ksp, b, x);
    VecDestroy(&x);
    VecDestroy(&b);
    /* Solve the physical system:

         -\Delta u = -6 (x + y - 1)

       where u = x^3 - 3/2 x^2 + y^3 - 3/2 y^2 + 1/2,
       so that \Delta u = (6 x - 3) + (6 y - 3), and the Neumann data vanish:

         \frac{\partial u}{\partial n} = (3x (x - 1), 3y (y - 1)) \cdot n
                                       = \pm 3x (x - 1) = 0 at x = 0, 1
                                       = \pm 3y (y - 1) = 0 at y = 0, 1
    */
  }
  /* Cleanup */
  MatDestroy(&A);
  PetscFree2(elemRows, elemMats);
  DMDestroy(&dm);
  PetscFinalize();
  return 0;
}