Actual source code: ex2.c

petsc-3.3-p7 2013-05-11
  2: static char help[] = "Reads a simple unstructured grid from a file. Partitions it,\n\
  3: and distributes the grid data accordingly\n\n";

  5: /*T
  6:    Concepts: Mat^partitioning a matrix;
  7:    Processors: n
  8: T*/

 10: /*
 11:     Updates of this example MAY be found at 
 12:        http://www.mcs.anl.gov/petsc/src/dm/ao/examples/tutorials/ex2.c

 14:     This is a very basic, even crude, example of managing an unstructured
 15:     grid in parallel.

 17:     This particular code is for a Galerkin-style finite element method. 

 19:     After the calls below, each processor will have 
 20:       1) a list of elements it "owns"; for each "owned" element it will have the global 
 21:          numbers of its three vertices, stored in gdata->ele;
 22:       2) a list of vertices it "owns"; for each "owned" vertex it will have the x and y 
 23:          coordinates, stored in gdata->vert.

 25:     It will not have 
 26:       1) a list of ghost elements (these are not needed for traditional 
 27:          Galerkin-style finite element methods). For various finite volume methods
 28:          you may need the ghost element lists; these may be generated using the 
 29:          element neighbor information given in the file database.

 31:     To compute the local element stiffness matrix and load vector, each processor
 32:     will need the vertex coordinates for all of the vertices on the locally 
 33:     "owned" elements.  This data could be obtained by doing the appropriate vector
 34:     scatter on the data stored in gdata->vert; a sketch of this (not used by the example itself) is given after the GridData declaration below.

 36:     Clearly writing a complete parallel unstructured grid code with PETSc is still
 37:     a good deal of work and requires a lot of application level coding.  BUT, at least
 38:     PETSc can manage all of the nonlinear and linear solvers (including matrix assembly 
 39:     etc.), which allows the programmer to concentrate his or her efforts on managing 
 40:     the unstructured grid. The PETSc team is developing additional library objects
 41:     to help manage parallel unstructured grid computations.  Unfortunately we have 
 42:     not had time to complete these yet, so the application programmer still must
 43:     manage much of the parallel grid manipulation as indicated below.

 45: */

 47: /* 
 48:   Include "petscmat.h" so that we can use matrices.
 49:   It automatically includes:
 50:      petscsys.h    - base PETSc routines      petscvec.h    - vectors
 51:      petscis.h     - index sets               petscviewer.h - viewers

 54:   Include "petscao.h" allows use of the AO (application ordering) commands,
 55:   used below for renumbering the vertex numbers after the partitioning.

 57:   Include "petscbt.h" for managing logical bit arrays that are used to 
 58:   conserve space. Note that the code does use order N bit arrays on each 
 59:   processor so is theoretically not scalable, but even with 64 million 
 60:   vertices it will only need temporarily 8 megabytes of memory for the 
 61:   bit array so one can still do very large problems with this approach, 
 62:   since the bit arrays are freed before the vectors and matrices are
 63:   created.
 64: */
 65: #include <petscmat.h>
 66: #include <petscao.h>
 67: #include <petscbt.h>

 69: /* 
 70:     This is the user-defined grid data context 
 71: */
 72: typedef struct {
 73:   PetscInt    n_vert,n_ele;
 74:   PetscInt    mlocal_vert,mlocal_ele;
 75:   PetscInt    *ele;
 76:   PetscScalar *vert;
 77:   PetscInt    *ia,*ja;
 78:   IS     isnewproc;
 79:   PetscInt    *localvert,nlocal; /* used to temporarily stash the old global numbers of the vertices this process now owns */
 80: } GridData;
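
/*
    A minimal sketch (not called anywhere in this example) of the vertex-coordinate
  scatter mentioned in the introductory comments: after DataMoveVertices() has been
  called, it gathers, for each locally owned element, the x,y coordinates of its
  three vertices into a local array of length 2*3*mlocal_ele (x0,y0,x1,y1,x2,y2 for
  the first element, then the second, and so on).  The function name and the output
  layout are illustrative only; it assumes the data layout produced by the routines
  below.
*/
static PetscErrorCode GatherElementVertexCoords(GridData *gdata,PetscScalar **elcoords)
{
  Vec         ownedv,elv;
  IS          is;
  VecScatter  scat;
  PetscScalar *a;
  PetscInt    i,nel = gdata->mlocal_ele;

  /* view the owned vertex coordinates as a parallel vector in the new vertex numbering */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,1,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&ownedv);
  /* one block of 2 coordinates is gathered for each vertex of each local element */
  VecCreateSeq(PETSC_COMM_SELF,2*3*nel,&elv);
  ISCreateBlock(PETSC_COMM_WORLD,2,3*nel,gdata->ele,PETSC_COPY_VALUES,&is);
  VecScatterCreate(ownedv,is,elv,PETSC_NULL,&scat);
  VecScatterBegin(scat,ownedv,elv,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(scat,ownedv,elv,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterDestroy(&scat);
  ISDestroy(&is);
  VecDestroy(&ownedv);

  /* copy the gathered coordinates into a plain array owned by the caller */
  PetscMalloc((2*3*nel+1)*sizeof(PetscScalar),elcoords);
  VecGetArray(elv,&a);
  for (i=0; i<2*3*nel; i++) (*elcoords)[i] = a[i];
  VecRestoreArray(elv,&a);
  VecDestroy(&elv);
  return 0;
}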

 82: /*
 83:    Variables on all processors:
 84:       n_vert          - total number of vertices
 85:       mlocal_vert     - number of vertices on this processor
 86:       vert            - x,y coordinates of local vertices

 88:       n_ele           - total number of elements
 89:       mlocal_ele      - number of elements on this processor
 90:       ele             - vertices of elements on this processor

 92:       ia, ja          - adjacency graph of elements (for partitioning)
 93:     
 94:    Variables on processor 0 during data reading from file:
 95:       mmlocal_vert[i] - number of vertices on each processor
 96:       tmpvert         - x,y coordinates of vertices for the other processors (as read in)

 98:       mmlocal_ele[i]  - number of elements on each processor

100:       iatmp, jatmp    - adjacency graph of elements for the other processors

102:    Notes:
103:    The code below has a great deal of IO (print statements). This is to allow one to track 
104:    the renumbering and movement of data among processors. In an actual 
105:    production run, IO of this type would be deactivated.

107:    To use the ParMETIS partitioner, run with the option -mat_partitioning_type parmetis;
108:    otherwise the code defaults to the initial element partitioning induced when the data 
109:    is read in (see the sample command line after this comment block).

111:    To understand the parallel performance of this type of code, it is important
112:    to profile the time spent in various events in the code; running with the
113:    option -log_summary will indicate how much time is spent in the routines 
114:    below.   Of course, for very small problems, such as the sample grid used here,
115:    the profiling results are meaningless.
116: */
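/*
   For example, one might run the example with (hypothetical command line; the MPI
   launcher and executable name depend on the local installation and on how the
   example was built):

       mpiexec -n 4 ./ex2 -mat_partitioning_type parmetis -log_summary
*/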

118: extern PetscErrorCode DataRead(GridData *);
119: extern PetscErrorCode DataPartitionElements(GridData *);
120: extern PetscErrorCode DataMoveElements(GridData *);
121: extern PetscErrorCode DataPartitionVertices(GridData *);
122: extern PetscErrorCode DataMoveVertices(GridData *);
123: extern PetscErrorCode DataDestroy(GridData *);

127: int main(int argc,char **args)
128: {
130: #if defined(PETSC_USE_LOG)
131:   PetscLogEvent  READ_EVENT,PARTITION_ELEMENT_EVENT,MOVE_ELEMENT_EVENT;
132:   PetscLogEvent  PARTITION_VERTEX_EVENT,MOVE_VERTEX_EVENT;
133: #endif
134:   GridData       gdata;

136:   PetscInitialize(&argc,&args,(char *)0,help);

138:   PetscLogEventRegister("Read Data",0,&READ_EVENT);
139:   PetscLogEventRegister("Partition elemen",0,&PARTITION_ELEMENT_EVENT);
140:   PetscLogEventRegister("Move elements",0,&MOVE_ELEMENT_EVENT);
141:   PetscLogEventRegister("Partition vertic",0,&PARTITION_VERTEX_EVENT);
142:   PetscLogEventRegister("Move vertices",0,&MOVE_VERTEX_EVENT);

144:   PetscLogEventBegin(READ_EVENT,0,0,0,0);
145:   DataRead(&gdata);
146:   PetscLogEventEnd(READ_EVENT,0,0,0,0);
147:   PetscLogEventBegin(PARTITION_ELEMENT_EVENT,0,0,0,0);
148:   DataPartitionElements(&gdata);
149:   PetscLogEventEnd(PARTITION_ELEMENT_EVENT,0,0,0,0);
150:   PetscLogEventBegin(MOVE_ELEMENT_EVENT,0,0,0,0);
151:   DataMoveElements(&gdata);
152:   PetscLogEventEnd(MOVE_ELEMENT_EVENT,0,0,0,0);
153:   PetscLogEventBegin(PARTITION_VERTEX_EVENT,0,0,0,0);
154:   DataPartitionVertices(&gdata);
155:   PetscLogEventEnd(PARTITION_VERTEX_EVENT,0,0,0,0);
156:   PetscLogEventBegin(MOVE_VERTEX_EVENT,0,0,0,0);
157:   DataMoveVertices(&gdata);
158:   PetscLogEventEnd(MOVE_VERTEX_EVENT,0,0,0,0);
159:   DataDestroy(&gdata);

161:   PetscFinalize();
162:   return 0;
163: }


168: /*
169:      Reads in the grid data from a file; each processor is naively 
170:   assigned a contiguous chunk of vertex and element data. Later the data
171:   will be partitioned and moved to the appropriate processor.
172: */
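/*
    A hypothetical input file consistent with the fscanf() calls below (the actual
  "usgdata" file distributed with PETSc may differ).  The first line and the line
  introducing the neighbor lists are free-form headers read with fgets(); a
  neighbor entry of -1 means "no neighbor across that edge":

      A simple two-triangle grid
      Number Vertices = 4
      0 0.0 0.0
      1 1.0 0.0
      2 1.0 1.0
      3 0.0 1.0
      Number Elements = 2
      0 0 1 2
      1 0 2 3
      Element neighbors
      0 1 -1 -1
      1 0 -1 -1
*/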
173: PetscErrorCode DataRead(GridData *gdata)
174: {
175:   PetscMPIInt    rank,size;
176:   PetscInt       n_vert,*mmlocal_vert,mlocal_vert,i,*ia,*ja,cnt,j;
177:   PetscInt       mlocal_ele,*mmlocal_ele,*ele,*tmpele,n_ele,net,a1,a2,a3;
178:   PetscInt       *iatmp,*jatmp;
180:   char           msg[128];
181:   PetscScalar    *vert,*tmpvert;
182:   MPI_Status     status;

185:   /*
186:      Processor 0 opens the file, reads in chunks of data and sends a portion off to
187:    each other processor.

189:      Note: For a truly scalable IO portion of the code, one would store
190:    the grid data in a binary file and use MPI-IO commands to have each 
191:    processor read in the parts that it needs (a sketch of this is given after
192:    this routine). However, in most circumstances involving up to, say, a million
193:    nodes and 100 processors the approach used here is fine.
194:   */
195:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
196:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

198:   if (!rank) {
199:     FILE *fd;
200:     fd = fopen("usgdata","r"); if (!fd) SETERRQ(PETSC_COMM_SELF,1,"Cannot open grid file");

202:     /* read in number of vertices */
203:     fgets(msg,128,fd);
204:     printf("File msg:%s",msg);
205:     fscanf(fd,"Number Vertices = %d\n",&n_vert);
206:     printf("Number of grid vertices %d\n",n_vert);

208:     /* broadcast number of vertices to all processors */
209:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
210:     mlocal_vert  = n_vert/size + ((n_vert % size) > 0);

212:     /* 
213:       allocate enough room for the first processor to keep track of how many 
214:       vertices are assigned to each processor. The vertices are split as evenly
215:       as possible among all processors.
216:     */
217:     PetscMalloc(size*sizeof(PetscInt),&mmlocal_vert);
218:     for (i=0; i<size; i++) {
219:       mmlocal_vert[i] = n_vert/size + ((n_vert % size) > i);
220:       printf("Processor %d assigned %d vertices\n",i,mmlocal_vert[i]);
221:     }

223:     /*
224:        Read in vertices assigned to first processor
225:     */
226:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&vert);
227:     printf("Vertices assigned to processor 0\n");
228:     for (i=0; i<mlocal_vert; i++) {
229:       fscanf(fd,"%d %lf %lf\n",&cnt,vert+2*i,vert+2*i+1);
230:       printf("%d %g %g\n",cnt,PetscRealPart(vert[2*i]),PetscRealPart(vert[2*i+1]));
231:     }

233:     /* 
234:        Read in vertices for all the other processors 
235:     */
236:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&tmpvert);
237:     for (j=1; j<size; j++) {
238:       printf("Vertices assigned to processor %d\n",j);
239:       for (i=0; i<mmlocal_vert[j]; i++) {
240:         fscanf(fd,"%d %lf %lf\n",&cnt,tmpvert+2*i,tmpvert+2*i+1);
241:         printf("%d %g %g\n",cnt,PetscRealPart(tmpvert[2*i]),PetscRealPart(tmpvert[2*i+1]));
242:       }
243:       MPI_Send(tmpvert,2*mmlocal_vert[j],MPIU_SCALAR,j,0,PETSC_COMM_WORLD);
244:     }
245:     PetscFree(tmpvert);
246:     PetscFree(mmlocal_vert);

248:     fscanf(fd,"Number Elements = %d\n",&n_ele);
249:     printf("Number of grid elements %d\n",n_ele);

251:     /* 
252:        Broadcast number of elements to all processors
253:     */
254:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
255:     mlocal_ele  = n_ele/size + ((n_ele % size) > 0);

257:     /* 
258:       Allocate enough room for the first processor to keep track of how many 
259:       elements are assigned to each processor.
260:     */
261:     PetscMalloc(size*sizeof(PetscInt),&mmlocal_ele);
262:     for (i=0; i<size; i++) {
263:       mmlocal_ele[i] = n_ele/size + ((n_ele % size) > i);
264:       printf("Processor %d assigned %d elements\n",i,mmlocal_ele[i]);
265:     }
266: 
267:     /*
268:         read in element information for the first processor
269:     */
270:     PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&ele);
271:     printf("Elements assigned to processor 0\n");
272:     for (i=0; i<mlocal_ele; i++) {
273:       fscanf(fd,"%d %d %d %d\n",&cnt,ele+3*i,ele+3*i+1,ele+3*i+2);
274:       printf("%d %d %d %d\n",cnt,ele[3*i],ele[3*i+1],ele[3*i+2]);
275:     }

277:     /* 
278:        Read in elements for all the other processors 
279:     */
280:     PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&tmpele);
281:     for (j=1; j<size; j++) {
282:       printf("Elements assigned to processor %d\n",j);
283:       for (i=0; i<mmlocal_ele[j]; i++) {
284:         fscanf(fd,"%d %d %d %d\n",&cnt,tmpele+3*i,tmpele+3*i+1,tmpele+3*i+2);
285:         printf("%d %d %d %d\n",cnt,tmpele[3*i],tmpele[3*i+1],tmpele[3*i+2]);
286:       }
287:       MPI_Send(tmpele,3*mmlocal_ele[j],MPI_INT,j,0,PETSC_COMM_WORLD);
288:     }
289:     PetscFree(tmpele);

291:     /* 
292:          Read in element neighbors for processor 0 
293:          We don't know how many entries in ja[] to allocate, so we allocate 
294:        3*(the number of local elements), which is the maximum it could be
295:     */
296:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
297:     PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&ja);
298:     net   = 0;
299:     ia[0] = 0;
300:     printf("Element neighbors on processor 0\n");
301:     fgets(msg,128,fd);
302:     for (i=0; i<mlocal_ele; i++) {
303:       fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
304:       printf("%d %d %d %d\n",cnt,a1,a2,a3);
305:       if (a1 >= 0) {ja[net++] = a1;}
306:       if (a2 >= 0) {ja[net++] = a2;}
307:       if (a3 >= 0) {ja[net++] = a3;}
308:       ia[i+1] = net;
309:     }

311:     printf("ia values for processor 0\n");
312:     for (i=0; i<mlocal_ele+1; i++) {
313:       printf("%d ",ia[i]);
314:     }
315:     printf("\n");
316:     printf("ja values for processor 0\n");
317:     for (i=0; i<ia[mlocal_ele]; i++) {
318:       printf("%d ",ja[i]);
319:     }
320:     printf("\n");

322:     /*
323:        Read in element neighbor information for all other processors
324:     */
325:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&iatmp);
326:     PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&jatmp);
327:     for (j=1; j<size; j++) {
328:       net   = 0;
329:       iatmp[0] = 0;
330:       printf("Element neighbors on processor %d\n",j);
331:       for (i=0; i<mmlocal_ele[j]; i++) {
332:         fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
333:         printf("%d %d %d %d\n",cnt,a1,a2,a3);
334:         if (a1 >= 0) {jatmp[net++] = a1;}
335:         if (a2 >= 0) {jatmp[net++] = a2;}
336:         if (a3 >= 0) {jatmp[net++] = a3;}
337:         iatmp[i+1] = net;
338:       }

340:       printf("ia values for processor %d\n",j);
341:       for (i=0; i<mmlocal_ele[j]+1; i++) {
342:         printf("%d ",iatmp[i]);
343:       }
344:       printf("\n");
345:       printf("ja values for processor %d\n",j);
346:       for (i=0; i<iatmp[mmlocal_ele[j]]; i++) {
347:         printf("%d ",jatmp[i]);
348:       }
349:       printf("\n");

351:       /* send graph off to appropriate processor */
352:       MPI_Send(iatmp,mmlocal_ele[j]+1,MPI_INT,j,0,PETSC_COMM_WORLD);
353:       MPI_Send(jatmp,iatmp[mmlocal_ele[j]],MPI_INT,j,0,PETSC_COMM_WORLD);
354:     }
355:     PetscFree(iatmp);
356:     PetscFree(jatmp);
357:     PetscFree(mmlocal_ele);

359:     fclose(fd);
360:   } else {
361:     /*
362:         We are not processor 0, so we do not open the file;
363:       rather, we wait for processor 0 to send us our data.
364:     */

366:     /* receive total number of vertices */
367:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
368:     mlocal_vert = n_vert/size + ((n_vert % size) > rank);

370:     /* receive vertices */
371:     PetscMalloc(2*(mlocal_vert+1)*sizeof(PetscScalar),&vert);
372:     MPI_Recv(vert,2*mlocal_vert,MPIU_SCALAR,0,0,PETSC_COMM_WORLD,&status);

374:     /* receive total number of elements */
375:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
376:     mlocal_ele = n_ele/size + ((n_ele % size) > rank);

378:     /* receive elements */
379:     PetscMalloc(3*(mlocal_ele+1)*sizeof(PetscInt),&ele);
380:     MPI_Recv(ele,3*mlocal_ele,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

382:     /* receive element adjacency graph */
383:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
384:     MPI_Recv(ia,mlocal_ele+1,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

386:     PetscMalloc((ia[mlocal_ele]+1)*sizeof(PetscInt),&ja);
387:     MPI_Recv(ja,ia[mlocal_ele],MPI_INT,0,0,PETSC_COMM_WORLD,&status);
388:   }

390:   gdata->n_vert      = n_vert;
391:   gdata->n_ele       = n_ele;
392:   gdata->mlocal_vert = mlocal_vert;
393:   gdata->mlocal_ele  = mlocal_ele;
394:   gdata->ele         = ele;
395:   gdata->vert        = vert;

397:   gdata->ia          = ia;
398:   gdata->ja          = ja;

400:   return(0);
401: }
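
/*
    A minimal sketch (not used by this example) of the MPI-IO approach mentioned at
  the top of DataRead(): if the vertex coordinates were stored contiguously as plain
  C doubles in a binary file, each process could read just its own chunk in parallel.
  The file name "usgdata_vert.bin" and the file layout are hypothetical, a real
  (non-complex) PETSc build is assumed, and n_vert would still have to be obtained
  separately (for example from a small header record).
*/
static PetscErrorCode DataReadVerticesBinary(PetscInt n_vert,PetscScalar **vert,PetscInt *mlocal_vert)
{
  PetscMPIInt rank,size;
  PetscInt    i,mlocal,start = 0;
  MPI_File    fh;
  MPI_Offset  offset;

  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  /* the same block distribution of vertices as used by DataRead() above */
  mlocal = n_vert/size + ((n_vert % size) > rank);
  for (i=0; i<rank; i++) start += n_vert/size + ((n_vert % size) > i);

  PetscMalloc(2*(mlocal+1)*sizeof(PetscScalar),vert);

  /* each process reads its own x,y pairs directly from the binary file */
  offset = (MPI_Offset)(2*start)*(MPI_Offset)sizeof(double);
  MPI_File_open(PETSC_COMM_WORLD,"usgdata_vert.bin",MPI_MODE_RDONLY,MPI_INFO_NULL,&fh);
  MPI_File_read_at_all(fh,offset,*vert,(int)(2*mlocal),MPI_DOUBLE,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);

  *mlocal_vert = mlocal;
  return 0;
}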


406: /*
407:          Given the grid data spread across the processors, determines a
408:    new partitioning of the CELLS (elements) to reduce the number of cut edges between
409:    cells (elements).
410: */
411: PetscErrorCode DataPartitionElements(GridData *gdata)
412: {
413:   Mat             Adj;                /* adjacency matrix */
414:   PetscInt        *ia,*ja;
415:   PetscInt        mlocal_ele,n_ele;
416:   PetscErrorCode  ierr;
417:   MatPartitioning part;
418:   IS              isnewproc;

421:   n_ele       = gdata->n_ele;
422:   mlocal_ele  = gdata->mlocal_ele;

424:   ia          = gdata->ia;
425:   ja          = gdata->ja;

427:   /*
428:       Create the adjacency graph matrix
429:   */
430:   MatCreateMPIAdj(PETSC_COMM_WORLD,mlocal_ele,n_ele,ia,ja,PETSC_NULL,&Adj);

432:   /*
433:       Create the partitioning object
434:   */
435:   MatPartitioningCreate(PETSC_COMM_WORLD,&part);
436:   MatPartitioningSetAdjacency(part,Adj);
437:   MatPartitioningSetFromOptions(part);
438:   MatPartitioningApply(part,&isnewproc);
439:   MatPartitioningDestroy(&part);

441:   /*
442:        isnewproc - indicates for each local element the new processor it is assigned to
443:   */
444:   PetscPrintf(PETSC_COMM_WORLD,"New processor assignment for each element\n");
445:   ISView(isnewproc,PETSC_VIEWER_STDOUT_WORLD);
446:   gdata->isnewproc = isnewproc;

448:   /*
449:       Free the adjacency graph data structures
450:   */
451:   MatDestroy(&Adj);


454:   return(0);
455: }

459: /*
460:       Moves the grid element data to be on the correct processor for the new
461:    element partitioning.
462: */
463: PetscErrorCode DataMoveElements(GridData *gdata)
464: {
466:   PetscInt       *counts,i,*tidx;
467:   const PetscInt *idx;
468:   PetscMPIInt    rank,size;
469:   Vec            vele,veleold;
470:   PetscScalar    *array;
471:   IS             isscat,isnum;
472:   VecScatter     vecscat;


476:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
477:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

479:   /* 
480:       Determine how many elements are assigned to each processor 
481:   */
482:   PetscMalloc(size*sizeof(PetscInt),&counts);
483:   ISPartitioningCount(gdata->isnewproc,size,counts);

485:   /* 
486:      Create a vector to contain the newly ordered element information 

488:      Note: we use vectors to communicate this data since we can use the powerful
489:      VecScatter routines to get the data to the correct location. This is a little
490:      wasteful since the vectors hold double precision numbers instead of integers,
491:      but since this is just a setup phase of the numerical computation that 
492:      is executed only once, it is not a measurable performance bottleneck.
493:   */
494:   VecCreate(PETSC_COMM_WORLD,&vele);
495:   VecSetSizes(vele,3*counts[rank],PETSC_DECIDE);
496:   VecSetFromOptions(vele);

498:   /* 
499:       Create an index set from the isnewproc index set to indicate the mapping TO 
500:   */
501:   ISPartitioningToNumbering(gdata->isnewproc,&isnum);
502:   ISDestroy(&gdata->isnewproc);
503:   /* 
504:       There are three data items per cell (element): the integer global numbers of its three 
505:     vertices (we convert them to double to use the scatter). One can think 
506:     of the vectors as having a block size of 3; then there is one index in idx[] for each element.
507:   */
508:   ISGetIndices(isnum,&idx);
509:   PetscMalloc(gdata->mlocal_ele*sizeof(PetscInt),&tidx);
510:   for (i=0; i<gdata->mlocal_ele; i++) {
511:     tidx[i] = idx[i];
512:   }
513:   ISCreateBlock(PETSC_COMM_WORLD,3,gdata->mlocal_ele,tidx,PETSC_COPY_VALUES,&isscat);
514:   ISRestoreIndices(isnum,&idx);
515:   PetscFree(tidx);
516:   ISDestroy(&isnum);

518:   /* 
519:      Create a vector to contain the original vertex information for each element 
520:   */
521:   VecCreateSeq(PETSC_COMM_SELF,3*gdata->mlocal_ele,&veleold);
522:   VecGetArray(veleold,&array);
523:   for (i=0; i<3*gdata->mlocal_ele; i++) {
524:     array[i] = gdata->ele[i];
525:   }
526:   VecRestoreArray(veleold,&array);
527:   /* 
528:      Scatter the element vertex information (still in the original vertex ordering) to the correct processor
529:   */
530:   VecScatterCreate(veleold,PETSC_NULL,vele,isscat,&vecscat);
531:   ISDestroy(&isscat);
532:   VecScatterBegin(vecscat,veleold,vele,INSERT_VALUES,SCATTER_FORWARD);
533:   VecScatterEnd(vecscat,veleold,vele,INSERT_VALUES,SCATTER_FORWARD);
534:   VecScatterDestroy(&vecscat);
535:   VecDestroy(&veleold);

537:   /* 
538:      Put the element vertex data into a new allocation of gdata->ele 
539:   */
540:   PetscFree(gdata->ele);
541:   gdata->mlocal_ele = counts[rank];
542:   PetscFree(counts);
543:   PetscMalloc(3*gdata->mlocal_ele*sizeof(PetscInt),&gdata->ele);
544:   VecGetArray(vele,&array);
545:   for (i=0; i<3*gdata->mlocal_ele; i++) {
546:     gdata->ele[i] = (PetscInt)PetscRealPart(array[i]);
547:   }
548:   VecRestoreArray(vele,&array);
549:   VecDestroy(&vele);

551:   PetscPrintf(PETSC_COMM_WORLD,"Old vertex numbering in new element ordering\n");
552:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
553:   for (i=0; i<gdata->mlocal_ele; i++) {
554:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
555:                             gdata->ele[3*i+2]);
556:   }
557:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

559:   return(0);
560: }

564: /*
565:          Given the newly partitioned cells (elements), this routine partitions the 
566:      vertices.

568:      The code is not completely scalable since
569:      1) it requires O(n_vert) bits of memory per processor
570:      2) it uses O(size) stages of communication, each of size O(n_vert) bits
571:      3) it is sequential (each processor marks its vertices ONLY after all processors
572:         to the left have marked theirs)
573:      4) the amount of work on the last processor is O(n_vert)

575:      The algorithm also does not take advantage of vertices that are "interior" to a
576:      processor's elements (that is, not contained in any element on another processor).
577:      A better algorithm would first have all processors determine "interior" vertices and
578:      make sure they are retained on that processor before listing "boundary" vertices.

580:      The algorithm is:
581:      a) each processor waits for a message from the left containing the mask of all marked vertices
582:      b) it loops over all local elements, generating a list of vertices it will 
583:         claim (not claiming ones that have already been marked in the bit array);
584:         it claims at most n_vert/size vertices
585:      c) it sends the mask on to the right

587:      This is a quick-and-dirty implementation; it should work fine for many problems,
588:      but will need to be replaced once profiling shows that it takes a large amount of
589:      time. An advantage is it requires no searching or sorting.
590:      
591: */
592: PetscErrorCode DataPartitionVertices(GridData *gdata)
593: {
594:   PetscInt       n_vert = gdata->n_vert,*localvert;
595:   PetscInt       mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele,i,j,nlocal = 0,nmax;
596:   PetscMPIInt    rank,size;
598:   PetscBT        mask;
599:   MPI_Status     status;

602:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
603:   MPI_Comm_size(PETSC_COMM_WORLD,&size);

605:   /*
606:       Allocate space for a bit array indicating which vertices have been marked
607:   */
608:   PetscBTCreate(n_vert,&mask);

610:   /*
611:      All processors except the last can be assigned at most n_vert/size vertices
612:      (because the last processor needs to handle any leftovers)
613:   */
614:   nmax = n_vert/size;
615:   if (rank == size-1) {
616:     nmax = n_vert;
617:   }

619:   /* 
620:      Receive list of marked vertices from left 
621:   */
622:   if (rank) {
623:     MPI_Recv(mask,PetscBTLength(n_vert),MPI_CHAR,rank-1,0,PETSC_COMM_WORLD,&status);
624:   }

626:   if (rank == size-1) {
627:     /* last processor gets all the rest */
628:     for (i=0; i<n_vert; i++) {
629:       if (!PetscBTLookup(mask,i)) {
630:         nlocal++;
631:       }
632:     }
633:     nmax = nlocal;
634:   }

636:   /* 
637:      Now that we know how many are local, allocate enough space for them and mark them 
638:   */
639:   PetscMalloc((nmax+1)*sizeof(PetscInt),&localvert);

641:   /* generate local list and fill in mask */
642:   nlocal = 0;
643:   if (rank < size-1) {
644:     /* count my vertices */
645:     for (i=0; i<mlocal_ele; i++) {
646:       for (j=0; j<3; j++) {
647:         if (!PetscBTLookupSet(mask,ele[3*i+j])) {
648:           localvert[nlocal++] = ele[3*i+j];
649:           if (nlocal >= nmax) goto foundenough2;
650:         }
651:       }
652:     }
653:     foundenough2:;
654:   } else {
655:     /* last processor gets all the rest */
656:     for (i=0; i<n_vert; i++) {
657:       if (!PetscBTLookup(mask,i)) {
658:         localvert[nlocal++] = i;
659:       }
660:     }
661:   }
662:   /* 
663:       Send bit mask on to next processor
664:   */
665:   if (rank < size-1) {
666:     MPI_Send(mask,PetscBTLength(n_vert),MPI_CHAR,rank+1,0,PETSC_COMM_WORLD);
667:   }
668:   PetscBTDestroy(&mask);

670:   gdata->localvert = localvert;
671:   gdata->nlocal    = nlocal;

673:   /* print lists of owned vertices */
674:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Number vertices assigned %d\n",rank,nlocal);
675:   PetscSynchronizedFlush(PETSC_COMM_WORLD);
676:   PetscIntView(nlocal,localvert,PETSC_VIEWER_STDOUT_WORLD);

678:   return(0);
679: }

683: /*
684:      Given the partitioning of the vertices; renumbers the element vertex lists for the 
685:      new vertex numbering and moves the vertex coordinate values to the correct processor
686: */
687: PetscErrorCode DataMoveVertices(GridData *gdata)
688: {
689:   AO             ao;
691:   PetscMPIInt    rank;
692:   PetscInt       i;
693:   Vec            vert,overt;
694:   VecScatter     vecscat;
695:   IS             isscat;
696:   PetscScalar    *avert;

699:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

701:   /* ---------------------------------------------------------------------
702:       Create a global reordering of the vertex numbers
703:   */
704:   AOCreateBasic(PETSC_COMM_WORLD,gdata->nlocal,gdata->localvert,PETSC_NULL,&ao);

706:   /*
707:      Change the element vertex information to the new vertex numbering
708:   */
709:   AOApplicationToPetsc(ao,3*gdata->mlocal_ele,gdata->ele);
710:   PetscPrintf(PETSC_COMM_WORLD,"New vertex numbering in new element ordering\n");
711:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
712:   for (i=0; i<gdata->mlocal_ele; i++) {
713:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
714:                             gdata->ele[3*i+2]);
715:   }
716:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

718:   /*
719:      Destroy the AO that is no longer needed
720:   */
721:   AODestroy(&ao);

723:   /* --------------------------------------------------------------------
724:       Finally, ship the vertex coordinate information to its owning process;
725:       note that we do this in a way very similar to what was done for the element info
726:   */
727:   /* create a vector to contain the newly ordered vertex information */
728:   VecCreateSeq(PETSC_COMM_SELF,2*gdata->nlocal,&vert);

730:   /* create a vector to contain the old ordered vertex information */
731:   VecCreateMPIWithArray(PETSC_COMM_WORLD,1,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);

733:   /* 
734:       There are two data items per vertex, the x and y coordinates (i.e. one can think 
735:     of the vectors as having a block size of 2, and there is one index in localvert[] for each block)
736:   */
737:   ISCreateBlock(PETSC_COMM_WORLD,2,gdata->nlocal,gdata->localvert,PETSC_COPY_VALUES,&isscat);
738:   PetscFree(gdata->localvert);

740:   /* 
741:       Scatter the vertex coordinate information to the correct processor
742:   */
743:   VecScatterCreate(overt,isscat,vert,PETSC_NULL,&vecscat);
744:   ISDestroy(&isscat);
745:   VecScatterBegin(vecscat,overt,vert,INSERT_VALUES,SCATTER_FORWARD);
746:   VecScatterEnd(vecscat,overt,vert,INSERT_VALUES,SCATTER_FORWARD);
747:   VecScatterDestroy(&vecscat);

749:   VecDestroy(&overt);
750:   PetscFree(gdata->vert);
751: 
752:   /*
753:         Put resulting vertex information into gdata->vert array
754:   */
755:   PetscMalloc(2*gdata->nlocal*sizeof(PetscScalar),&gdata->vert);
756:   VecGetArray(vert,&avert);
757:   PetscMemcpy(gdata->vert,avert,2*gdata->nlocal*sizeof(PetscScalar));
758:   VecRestoreArray(vert,&avert);
759:   gdata->mlocal_vert = gdata->nlocal;
760:   VecDestroy(&vert);

762:   PetscPrintf(PETSC_COMM_WORLD,"Vertex coordinates in new numbering\n");
763:   for (i=0; i<gdata->mlocal_vert; i++) {
764:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"(%g,%g)\n",PetscRealPart(gdata->vert[2*i]),PetscRealPart(gdata->vert[2*i+1]));
765:   }
766:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

768:   return(0);
769: }


774: PetscErrorCode DataDestroy(GridData *gdata)
775: {

779:   PetscFree(gdata->ele);
780:   PetscFree(gdata->vert);
781:   return(0);
782: }