Actual source code: gs.c

  1: /***********************************gs.c***************************************

  3: Author: Henry M. Tufo III

  5: e-mail: hmt@cs.brown.edu

  7: snail-mail:
  8: Division of Applied Mathematics
  9: Brown University
 10: Providence, RI 02912

 12: Last Modification:
 13: 6.21.97
 14: ************************************gs.c**************************************/

 16: /***********************************gs.c***************************************
 17: File Description:
 18: -----------------

 20: ************************************gs.c**************************************/

 22: #include <../src/ksp/pc/impls/tfs/tfs.h>

 24: /* default length of number of items via tree - doubles if exceeded */
 25: #define TREE_BUF_SZ 2048
 26: #define GS_VEC_SZ   1

 28: /***********************************gs.c***************************************
 29: Type: struct gather_scatter_id
 30: ------------------------------

 32: ************************************gs.c**************************************/
 33: typedef struct gather_scatter_id {
 34:   PetscInt     id;
 35:   PetscInt     nel_min;
 36:   PetscInt     nel_max;
 37:   PetscInt     nel_sum;
 38:   PetscInt     negl;
 39:   PetscInt     gl_max;
 40:   PetscInt     gl_min;
 41:   PetscInt     repeats;
 42:   PetscInt     ordered;
 43:   PetscInt     positive;
 44:   PetscScalar *vals;

 46:   /* bit mask info */
 47:   PetscInt *my_proc_mask;
 48:   PetscInt  mask_sz;
 49:   PetscInt *ngh_buf;
 50:   PetscInt  ngh_buf_sz;
 51:   PetscInt *nghs;
 52:   PetscInt  num_nghs;
 53:   PetscInt  max_nghs;
 54:   PetscInt *pw_nghs;
 55:   PetscInt  num_pw_nghs;
 56:   PetscInt *tree_nghs;
 57:   PetscInt  num_tree_nghs;

 59:   PetscInt num_loads;

 61:   /* repeats == true -> local info */
 62:   PetscInt  nel;  /* number of unique elements */
 63:   PetscInt *elms; /* of size nel */
 64:   PetscInt  nel_total;
 65:   PetscInt *local_elms; /* of size nel_total */
 66:   PetscInt *companion;  /* of size nel_total */

 68:   /* local info */
 69:   PetscInt   num_local_total;
 70:   PetscInt   local_strength;
 71:   PetscInt   num_local;
 72:   PetscInt  *num_local_reduce;
 73:   PetscInt **local_reduce;
 74:   PetscInt   num_local_gop;
 75:   PetscInt  *num_gop_local_reduce;
 76:   PetscInt **gop_local_reduce;

 78:   /* pairwise info */
 79:   PetscInt     level;
 80:   PetscInt     num_pairs;
 81:   PetscInt     max_pairs;
 82:   PetscInt     loc_node_pairs;
 83:   PetscInt     max_node_pairs;
 84:   PetscInt     min_node_pairs;
 85:   PetscInt     avg_node_pairs;
 86:   PetscInt    *pair_list;
 87:   PetscInt    *msg_sizes;
 88:   PetscInt   **node_list;
 89:   PetscInt     len_pw_list;
 90:   PetscInt    *pw_elm_list;
 91:   PetscScalar *pw_vals;

 93:   MPI_Request *msg_ids_in;
 94:   MPI_Request *msg_ids_out;

 96:   PetscScalar *out;
 97:   PetscScalar *in;
 98:   PetscInt     msg_total;

100:   /* tree - crystal accumulator info */
101:   PetscInt   max_left_over;
102:   PetscInt  *pre;
103:   PetscInt  *in_num;
104:   PetscInt  *out_num;
105:   PetscInt **in_list;
106:   PetscInt **out_list;

108:   /* new tree work*/
109:   PetscInt     tree_nel;
110:   PetscInt    *tree_elms;
111:   PetscScalar *tree_buf;
112:   PetscScalar *tree_work;

114:   PetscInt  tree_map_sz;
115:   PetscInt *tree_map_in;
116:   PetscInt *tree_map_out;

118:   /* current memory status */
119:   PetscInt gl_bss_min;
120:   PetscInt gl_perm_min;

122:   /* max segment size for PCTFS_gs_gop_vec() */
123:   PetscInt vec_sz;

125:   /* hack to make paul happy */
126:   MPI_Comm PCTFS_gs_comm;

128: } PCTFS_gs_id;

130: static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
131: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
132: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
133: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
134: static PCTFS_gs_id   *gsi_new(void);
135: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

137: /* same for all but vector flavor */
138: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
139: /* vector flavor */
140: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

142: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
143: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

148: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
149: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

151: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
152: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

155: /* global vars */
156: /* from comm.c module */

158: static PetscInt num_gs_ids = 0;

160: /* should make this dynamic ... later */
161: static PetscInt  msg_buf     = MAX_MSG_BUF;
162: static PetscInt  vec_sz      = GS_VEC_SZ;
163: static PetscInt *tree_buf    = NULL;
164: static PetscInt  tree_buf_sz = 0;
165: static PetscInt  ntree       = 0;

167: /***************************************************************************/
168: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
169: {
170:   PetscFunctionBegin;
171:   vec_sz = size;
172:   PetscFunctionReturn(PETSC_SUCCESS);
173: }

175: /******************************************************************************/
176: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177: {
178:   PetscFunctionBegin;
179:   msg_buf = buf_size;
180:   PetscFunctionReturn(PETSC_SUCCESS);
181: }

183: /******************************************************************************/
184: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
185: {
186:   PCTFS_gs_id *gs;
187:   MPI_Group    PCTFS_gs_group;
188:   MPI_Comm     PCTFS_gs_comm;

190:   /* ensure that communication package has been initialized */
191:   PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());

193:   /* determines if we have enough dynamic/semi-static memory */
194:   /* checks input, allocs and sets gd_id template            */
195:   gs = gsi_check_args(elms, nel, level);

197:   /* only bit mask version up and working for the moment    */
198:   /* LATER :: get int list version working for sparse pblms */
199:   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

201:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
202:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
203:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);

205:   gs->PCTFS_gs_comm = PCTFS_gs_comm;

207:   return gs;
208: }

210: /******************************************************************************/
211: static PCTFS_gs_id *gsi_new(void)
212: {
213:   PCTFS_gs_id *gs;
214:   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
215:   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
216:   return gs;
217: }

219: /******************************************************************************/
220: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
221: {
222:   PetscInt     i, j, k, t2;
223:   PetscInt    *companion, *elms, *unique, *iptr;
224:   PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
225:   PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
226:   PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
227:   PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
228:   PCTFS_gs_id *gs;

230:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
231:   if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

233:   if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

235:   /* get space for gs template */
236:   gs     = gsi_new();
237:   gs->id = ++num_gs_ids;

239:   /* hmt 6.4.99                                            */
240:   /* caller can set global ids that don't participate to 0 */
241:   /* PCTFS_gs_init ignores all zeros in elm list                 */
242:   /* negative global ids are still invalid                 */
243:   for (i = j = 0; i < nel; i++) {
244:     if (in_elms[i] != 0) j++;
245:   }

247:   k   = nel;
248:   nel = j;

250:   /* copy over in_elms list and create inverse map */
251:   elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
252:   companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

254:   for (i = j = 0; i < k; i++) {
255:     if (in_elms[i] != 0) {
256:       elms[j]        = in_elms[i];
257:       companion[j++] = i;
258:     }
259:   }

261:   if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

263:   /* pre-pass ... check to see if sorted */
264:   elms[nel] = INT_MAX;
265:   iptr      = elms;
266:   unique    = elms + 1;
267:   j         = 0;
268:   while (*iptr != INT_MAX) {
269:     if (*iptr++ > *unique++) {
270:       j = 1;
271:       break;
272:     }
273:   }

275:   /* set up inverse map */
276:   if (j) {
277:     PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
278:     PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
279:   } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
280:   elms[nel] = INT_MIN;

282:   /* first pass */
283:   /* determine number of unique elements, check pd */
284:   for (i = k = 0; i < nel; i += j) {
285:     t2 = elms[i];
286:     j  = ++i;

288:     /* clump 'em for now */
289:     while (elms[j] == t2) j++;

291:     /* how many together and num local */
292:     if (j -= i) {
293:       num_local++;
294:       k += j;
295:     }
296:   }

298:   /* how many unique elements? */
299:   gs->repeats = k;
300:   gs->nel     = nel - k;

302:   /* number of repeats? */
303:   gs->num_local = num_local;
304:   num_local += 2;
305:   gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
306:   gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

308:   unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
309:   gs->elms       = unique;
310:   gs->nel_total  = nel;
311:   gs->local_elms = elms;
312:   gs->companion  = companion;

314:   /* compess map as well as keep track of local ops */
315:   for (num_local = i = j = 0; i < gs->nel; i++) {
316:     k  = j;
317:     t2 = unique[i] = elms[j];
318:     companion[i]   = companion[j];

320:     while (elms[j] == t2) j++;

322:     if ((t2 = (j - k)) > 1) {
323:       /* number together */
324:       num_to_reduce[num_local] = t2++;

326:       iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

328:       /* to use binary searching don't remap until we check intersection */
329:       *iptr++ = i;

331:       /* note that we're skipping the first one */
332:       while (++k < j) *(iptr++) = companion[k];
333:       *iptr = -1;
334:     }
335:   }

337:   /* sentinel for ngh_buf */
338:   unique[gs->nel] = INT_MAX;

340:   /* for two partition sort hack */
341:   num_to_reduce[num_local]   = 0;
342:   local_reduce[num_local]    = NULL;
343:   num_to_reduce[++num_local] = 0;
344:   local_reduce[num_local]    = NULL;

346:   /* load 'em up */
347:   /* note one extra to hold NON_UNIFORM flag!!! */
348:   vals[2] = vals[1] = vals[0] = nel;
349:   if (gs->nel > 0) {
350:     vals[3] = unique[0];
351:     vals[4] = unique[gs->nel - 1];
352:   } else {
353:     vals[3] = INT_MAX;
354:     vals[4] = INT_MIN;
355:   }
356:   vals[5] = level;
357:   vals[6] = num_gs_ids;

359:   /* GLOBAL: send 'em out */
360:   PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

362:   /* must be semi-pos def - only pairwise depends on this */
363:   /* LATER - remove this restriction */
364:   if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
365:   if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

367:   gs->nel_min = vals[0];
368:   gs->nel_max = vals[1];
369:   gs->nel_sum = vals[2];
370:   gs->gl_min  = vals[3];
371:   gs->gl_max  = vals[4];
372:   gs->negl    = vals[4] - vals[3] + 1;

374:   if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

376:   /* LATER :: add level == -1 -> program selects level */
377:   if (vals[5] < 0) vals[5] = 0;
378:   else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
379:   gs->level = vals[5];

381:   return gs;
382: }

384: /******************************************************************************/
385: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
386: {
387:   PetscInt   i, nel, *elms;
388:   PetscInt   t1;
389:   PetscInt **reduce;
390:   PetscInt  *map;

392:   PetscFunctionBegin;
393:   /* totally local removes ... PCTFS_ct_bits == 0 */
394:   PetscCall(get_ngh_buf(gs));

396:   if (gs->level) PetscCall(set_pairwise(gs));
397:   if (gs->max_left_over) PetscCall(set_tree(gs));

399:   /* intersection local and pairwise/tree? */
400:   gs->num_local_total      = gs->num_local;
401:   gs->gop_local_reduce     = gs->local_reduce;
402:   gs->num_gop_local_reduce = gs->num_local_reduce;

404:   map = gs->companion;

406:   /* is there any local compression */
407:   if (!gs->num_local) {
408:     gs->local_strength = NONE;
409:     gs->num_local_gop  = 0;
410:   } else {
411:     /* ok find intersection */
412:     map    = gs->companion;
413:     reduce = gs->local_reduce;
414:     for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
415:       if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
416:         t1++;
417:         PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
418:         gs->num_local_reduce[i] *= -1;
419:       }
420:       **reduce = map[**reduce];
421:     }

423:     /* intersection is empty */
424:     if (!t1) {
425:       gs->local_strength = FULL;
426:       gs->num_local_gop  = 0;
427:     } else { /* intersection not empty */
428:       gs->local_strength = PARTIAL;

430:       PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

432:       gs->num_local_gop   = t1;
433:       gs->num_local_total = gs->num_local;
434:       gs->num_local -= t1;
435:       gs->gop_local_reduce     = gs->local_reduce;
436:       gs->num_gop_local_reduce = gs->num_local_reduce;

438:       for (i = 0; i < t1; i++) {
439:         PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
440:         gs->num_gop_local_reduce[i] *= -1;
441:         gs->local_reduce++;
442:         gs->num_local_reduce++;
443:       }
444:       gs->local_reduce++;
445:       gs->num_local_reduce++;
446:     }
447:   }

449:   elms = gs->pw_elm_list;
450:   nel  = gs->len_pw_list;
451:   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

453:   elms = gs->tree_map_in;
454:   nel  = gs->tree_map_sz;
455:   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

457:   /* clean up */
458:   free((void *)gs->local_elms);
459:   free((void *)gs->companion);
460:   free((void *)gs->elms);
461:   free((void *)gs->ngh_buf);
462:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
463:   PetscFunctionReturn(PETSC_SUCCESS);
464: }

466: /******************************************************************************/
467: static PetscErrorCode place_in_tree(PetscInt elm)
468: {
469:   PetscInt *tp, n;

471:   PetscFunctionBegin;
472:   if (ntree == tree_buf_sz) {
473:     if (tree_buf_sz) {
474:       tp = tree_buf;
475:       n  = tree_buf_sz;
476:       tree_buf_sz <<= 1;
477:       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
478:       PCTFS_ivec_copy(tree_buf, tp, n);
479:       free(tp);
480:     } else {
481:       tree_buf_sz = TREE_BUF_SZ;
482:       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
483:     }
484:   }

486:   tree_buf[ntree++] = elm;
487:   PetscFunctionReturn(PETSC_SUCCESS);
488: }

490: /******************************************************************************/
491: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
492: {
493:   PetscInt  i, j, npw = 0, ntree_map = 0;
494:   PetscInt  p_mask_size, ngh_buf_size, buf_size;
495:   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
496:   PetscInt *ngh_buf, *buf1, *buf2;
497:   PetscInt  offset, per_load, num_loads, or_ct, start, end;
498:   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
499:   PetscInt  oper = GL_B_OR;
500:   PetscInt *ptr3, *t_mask, level, ct1, ct2;

502:   PetscFunctionBegin;
503:   /* to make life easier */
504:   nel   = gs->nel;
505:   elms  = gs->elms;
506:   level = gs->level;

508:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
509:   p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
510:   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

512:   /* allocate space for masks and info bufs */
513:   gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
514:   gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
515:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
516:   t_mask                        = (PetscInt *)malloc(p_mask_size);
517:   gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

519:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
520:   /* had thought I could exploit rendezvous threshold */

522:   /* default is one pass */
523:   per_load = negl = gs->negl;
524:   gs->num_loads = num_loads = 1;
525:   i                         = p_mask_size * negl;

527:   /* possible overflow on buffer size */
528:   /* overflow hack                    */
529:   if (i < 0) i = INT_MAX;

531:   buf_size = PetscMin(msg_buf, i);

533:   /* can we do it? */
534:   PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

536:   /* get PCTFS_giop buf space ... make *only* one malloc */
537:   buf1 = (PetscInt *)malloc(buf_size << 1);

539:   /* more than one gior exchange needed? */
540:   if (buf_size != i) {
541:     per_load      = buf_size / p_mask_size;
542:     buf_size      = per_load * p_mask_size;
543:     gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
544:   }

546:   /* convert buf sizes from #bytes to #ints - 32-bit only! */
547:   p_mask_size /= sizeof(PetscInt);
548:   ngh_buf_size /= sizeof(PetscInt);
549:   buf_size /= sizeof(PetscInt);

551:   /* find PCTFS_giop work space */
552:   buf2 = buf1 + buf_size;

554:   /* hold #ints needed for processor masks */
555:   gs->mask_sz = p_mask_size;

557:   /* init buffers */
558:   PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
559:   PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
560:   PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

562:   /* HACK reset tree info */
563:   tree_buf    = NULL;
564:   tree_buf_sz = ntree = 0;

566:   /* ok do it */
567:   for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
568:     /* identity for bitwise or is 000...000 */
569:     PetscCall(PCTFS_ivec_zero(buf1, buf_size));

571:     /* load msg buffer */
572:     for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
573:       offset = (offset - start) * p_mask_size;
574:       PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
575:     }

577:     /* GLOBAL: pass buffer */
578:     PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

580:     /* unload buffer into ngh_buf */
581:     ptr2 = (elms + i_start);
582:     for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
583:       /* I own it ... may have to pairwise it */
584:       if (j == *ptr2) {
585:         /* do i share it w/anyone? */
586:         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
587:         /* guess not */
588:         if (ct1 < 2) {
589:           ptr2++;
590:           ptr1 += p_mask_size;
591:           continue;
592:         }

594:         /* i do ... so keep info and turn off my bit */
595:         PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
596:         PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
597:         PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

599:         /* is it to be done pairwise? */
600:         if (--ct1 <= level) {
601:           npw++;

603:           /* turn on high bit to indicate pw need to process */
604:           *ptr2++ |= TOP_BIT;
605:           PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
606:           ptr1 += p_mask_size;
607:           continue;
608:         }

610:         /* get set for next and note that I have a tree contribution */
611:         /* could save exact elm index for tree here -> save a search */
612:         ptr2++;
613:         ptr1 += p_mask_size;
614:         ntree_map++;
615:       } else { /* i don't but still might be involved in tree */

617:         /* shared by how many? */
618:         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

620:         /* none! */
621:         if (ct1 < 2) continue;

623:         /* is it going to be done pairwise? but not by me of course!*/
624:         if (--ct1 <= level) continue;
625:       }
626:       /* LATER we're going to have to process it NOW */
627:       /* nope ... tree it */
628:       PetscCall(place_in_tree(j));
629:     }
630:   }

632:   free((void *)t_mask);
633:   free((void *)buf1);

635:   gs->len_pw_list = npw;
636:   gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

638:   /* expand from bit mask list to int list and save ngh list */
639:   gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
640:   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

642:   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

644:   oper = GL_MAX;
645:   ct1  = gs->num_nghs;
646:   PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
647:   gs->max_nghs = ct1;

649:   gs->tree_map_sz   = ntree_map;
650:   gs->max_left_over = ntree;

652:   free((void *)p_mask);
653:   free((void *)sh_proc_mask);
654:   PetscFunctionReturn(PETSC_SUCCESS);
655: }

657: /******************************************************************************/
658: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
659: {
660:   PetscInt  i, j;
661:   PetscInt  p_mask_size;
662:   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
663:   PetscInt *ngh_buf, *buf2;
664:   PetscInt  offset;
665:   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
666:   PetscInt *pairwise_elm_list, len_pair_list = 0;
667:   PetscInt *iptr, t1, i_start, nel, *elms;
668:   PetscInt  ct;

670:   PetscFunctionBegin;
671:   /* to make life easier */
672:   nel          = gs->nel;
673:   elms         = gs->elms;
674:   ngh_buf      = gs->ngh_buf;
675:   sh_proc_mask = gs->pw_nghs;

677:   /* need a few temp masks */
678:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
679:   p_mask        = (PetscInt *)malloc(p_mask_size);
680:   tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

682:   /* set mask to my PCTFS_my_id's bit mask */
683:   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

685:   p_mask_size /= sizeof(PetscInt);

687:   len_pair_list   = gs->len_pw_list;
688:   gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

690:   /* how many processors (nghs) do we have to exchange with? */
691:   nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

693:   /* allocate space for PCTFS_gs_gop() info */
694:   gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
695:   gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
696:   gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

698:   /* init msg_size list */
699:   PetscCall(PCTFS_ivec_zero(msg_size, nprs));

701:   /* expand from bit mask list to int list */
702:   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

704:   /* keep list of elements being handled pairwise */
705:   for (i = j = 0; i < nel; i++) {
706:     if (elms[i] & TOP_BIT) {
707:       elms[i] ^= TOP_BIT;
708:       pairwise_elm_list[j++] = i;
709:     }
710:   }
711:   pairwise_elm_list[j] = -1;

713:   gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
714:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
715:   gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
716:   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
717:   gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

719:   /* find who goes to each processor */
720:   for (i_start = i = 0; i < nprs; i++) {
721:     /* processor i's mask */
722:     PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

724:     /* det # going to processor i */
725:     for (ct = j = 0; j < len_pair_list; j++) {
726:       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
727:       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
728:       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
729:     }
730:     msg_size[i] = ct;
731:     i_start     = PetscMax(i_start, ct);

733:     /*space to hold nodes in message to first neighbor */
734:     msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

736:     for (j = 0; j < len_pair_list; j++) {
737:       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
738:       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
739:       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
740:     }
741:     *iptr = -1;
742:   }
743:   msg_nodes[nprs] = NULL;

745:   j = gs->loc_node_pairs = i_start;
746:   t1                     = GL_MAX;
747:   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
748:   gs->max_node_pairs = i_start;

750:   i_start = j;
751:   t1      = GL_MIN;
752:   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
753:   gs->min_node_pairs = i_start;

755:   i_start = j;
756:   t1      = GL_ADD;
757:   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
758:   gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

760:   i_start = nprs;
761:   t1      = GL_MAX;
762:   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
763:   gs->max_pairs = i_start;

765:   /* remap pairwise in tail of gsi_via_bit_mask() */
766:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
767:   gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
768:   gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

770:   /* reset malloc pool */
771:   free((void *)p_mask);
772:   free((void *)tmp_proc_mask);
773:   PetscFunctionReturn(PETSC_SUCCESS);
774: }

776: /* to do pruned tree just save ngh buf copy for each one and decode here!
777: ******************************************************************************/
778: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
779: {
780:   PetscInt  i, j, n, nel;
781:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

783:   PetscFunctionBegin;
784:   /* local work ptrs */
785:   elms = gs->elms;
786:   nel  = gs->nel;

788:   /* how many via tree */
789:   gs->tree_nel = n = ntree;
790:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
791:   gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
792:   gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
793:   j                                   = gs->tree_map_sz;
794:   gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
795:   gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

797:   /* search the longer of the two lists */
798:   /* note ... could save this info in get_ngh_buf and save searches */
799:   if (n <= nel) {
800:     /* bijective fct w/remap - search elm list */
801:     for (i = 0; i < n; i++) {
802:       if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
803:         *iptr_in++  = j;
804:         *iptr_out++ = i;
805:       }
806:     }
807:   } else {
808:     for (i = 0; i < nel; i++) {
809:       if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
810:         *iptr_in++  = i;
811:         *iptr_out++ = j;
812:       }
813:     }
814:   }

816:   /* sentinel */
817:   *iptr_in = *iptr_out = -1;
818:   PetscFunctionReturn(PETSC_SUCCESS);
819: }

821: /******************************************************************************/
822: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
823: {
824:   PetscInt   *num, *map, **reduce;
825:   PetscScalar tmp;

827:   PetscFunctionBegin;
828:   num    = gs->num_gop_local_reduce;
829:   reduce = gs->gop_local_reduce;
830:   while ((map = *reduce++)) {
831:     /* wall */
832:     if (*num == 2) {
833:       num++;
834:       vals[map[1]] = vals[map[0]];
835:     } else if (*num == 3) { /* corner shared by three elements */
836:       num++;
837:       vals[map[2]] = vals[map[1]] = vals[map[0]];
838:     } else if (*num == 4) { /* corner shared by four elements */
839:       num++;
840:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
841:     } else { /* general case ... odd geoms ... 3D*/
842:       num++;
843:       tmp = *(vals + *map++);
844:       while (*map >= 0) *(vals + *map++) = tmp;
845:     }
846:   }
847:   PetscFunctionReturn(PETSC_SUCCESS);
848: }

850: /******************************************************************************/
851: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
852: {
853:   PetscInt   *num, *map, **reduce;
854:   PetscScalar tmp;

856:   PetscFunctionBegin;
857:   num    = gs->num_local_reduce;
858:   reduce = gs->local_reduce;
859:   while ((map = *reduce)) {
860:     /* wall */
861:     if (*num == 2) {
862:       num++;
863:       reduce++;
864:       vals[map[1]] = vals[map[0]] += vals[map[1]];
865:     } else if (*num == 3) { /* corner shared by three elements */
866:       num++;
867:       reduce++;
868:       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
869:     } else if (*num == 4) { /* corner shared by four elements */
870:       num++;
871:       reduce++;
872:       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
873:     } else { /* general case ... odd geoms ... 3D*/
874:       num++;
875:       tmp = 0.0;
876:       while (*map >= 0) tmp += *(vals + *map++);

878:       map = *reduce++;
879:       while (*map >= 0) *(vals + *map++) = tmp;
880:     }
881:   }
882:   PetscFunctionReturn(PETSC_SUCCESS);
883: }

885: /******************************************************************************/
886: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
887: {
888:   PetscInt    *num, *map, **reduce;
889:   PetscScalar *base;

891:   PetscFunctionBegin;
892:   num    = gs->num_gop_local_reduce;
893:   reduce = gs->gop_local_reduce;
894:   while ((map = *reduce++)) {
895:     /* wall */
896:     if (*num == 2) {
897:       num++;
898:       vals[map[0]] += vals[map[1]];
899:     } else if (*num == 3) { /* corner shared by three elements */
900:       num++;
901:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
902:     } else if (*num == 4) { /* corner shared by four elements */
903:       num++;
904:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
905:     } else { /* general case ... odd geoms ... 3D*/
906:       num++;
907:       base = vals + *map++;
908:       while (*map >= 0) *base += *(vals + *map++);
909:     }
910:   }
911:   PetscFunctionReturn(PETSC_SUCCESS);
912: }

914: /******************************************************************************/
915: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
916: {
917:   PetscInt i;

919:   PetscFunctionBegin;
920:   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
921:   if (gs->nghs) free((void *)gs->nghs);
922:   if (gs->pw_nghs) free((void *)gs->pw_nghs);

924:   /* tree */
925:   if (gs->max_left_over) {
926:     if (gs->tree_elms) free((void *)gs->tree_elms);
927:     if (gs->tree_buf) free((void *)gs->tree_buf);
928:     if (gs->tree_work) free((void *)gs->tree_work);
929:     if (gs->tree_map_in) free((void *)gs->tree_map_in);
930:     if (gs->tree_map_out) free((void *)gs->tree_map_out);
931:   }

933:   /* pairwise info */
934:   if (gs->num_pairs) {
935:     /* should be NULL already */
936:     if (gs->ngh_buf) free((void *)gs->ngh_buf);
937:     if (gs->elms) free((void *)gs->elms);
938:     if (gs->local_elms) free((void *)gs->local_elms);
939:     if (gs->companion) free((void *)gs->companion);

941:     /* only set if pairwise */
942:     if (gs->vals) free((void *)gs->vals);
943:     if (gs->in) free((void *)gs->in);
944:     if (gs->out) free((void *)gs->out);
945:     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
946:     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
947:     if (gs->pw_vals) free((void *)gs->pw_vals);
948:     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
949:     if (gs->node_list) {
950:       for (i = 0; i < gs->num_pairs; i++) {
951:         if (gs->node_list[i]) free((void *)gs->node_list[i]);
952:       }
953:       free((void *)gs->node_list);
954:     }
955:     if (gs->msg_sizes) free((void *)gs->msg_sizes);
956:     if (gs->pair_list) free((void *)gs->pair_list);
957:   }

959:   /* local info */
960:   if (gs->num_local_total >= 0) {
961:     for (i = 0; i < gs->num_local_total + 1; i++) {
962:       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
963:     }
964:   }

966:   /* if intersection tree/pairwise and local isn't empty */
967:   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
968:   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

970:   free((void *)gs);
971:   PetscFunctionReturn(PETSC_SUCCESS);
972: }

974: /******************************************************************************/
975: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
976: {
977:   PetscFunctionBegin;
978:   switch (*op) {
979:   case '+':
980:     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
981:     break;
982:   default:
983:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
984:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
985:     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
986:     break;
987:   }
988:   PetscFunctionReturn(PETSC_SUCCESS);
989: }

991: /******************************************************************************/
992: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
993: {
994:   PetscFunctionBegin;
995:   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

997:   /* local only operations!!! */
998:   if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));

1000:   /* if intersection tree/pairwise and local isn't empty */
1001:   if (gs->num_local_gop) {
1002:     PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));

1004:     /* pairwise */
1005:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

1007:     /* tree */
1008:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));

1010:     PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1011:   } else { /* if intersection tree/pairwise and local is empty */
1012:     /* pairwise */
1013:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

1015:     /* tree */
1016:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1017:   }
1018:   PetscFunctionReturn(PETSC_SUCCESS);
1019: }

1021: /******************************************************************************/
1022: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1023: {
1024:   PetscInt    *num, *map, **reduce;
1025:   PetscScalar *base;

1027:   PetscFunctionBegin;
1028:   num    = gs->num_local_reduce;
1029:   reduce = gs->local_reduce;
1030:   while ((map = *reduce)) {
1031:     base = vals + map[0] * step;

1033:     /* wall */
1034:     if (*num == 2) {
1035:       num++;
1036:       reduce++;
1037:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1038:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1039:     } else if (*num == 3) { /* corner shared by three elements */
1040:       num++;
1041:       reduce++;
1042:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1043:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1044:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1045:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1046:     } else if (*num == 4) { /* corner shared by four elements */
1047:       num++;
1048:       reduce++;
1049:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1050:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1051:       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1052:       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1053:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1054:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1055:     } else { /* general case ... odd geoms ... 3D */
1056:       num++;
1057:       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));

1059:       map = *reduce;
1060:       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));

1062:       reduce++;
1063:     }
1064:   }
1065:   PetscFunctionReturn(PETSC_SUCCESS);
1066: }

1068: /******************************************************************************/
1069: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1070: {
1071:   PetscInt    *num, *map, **reduce;
1072:   PetscScalar *base;

1074:   PetscFunctionBegin;
1075:   num    = gs->num_gop_local_reduce;
1076:   reduce = gs->gop_local_reduce;
1077:   while ((map = *reduce++)) {
1078:     base = vals + map[0] * step;

1080:     /* wall */
1081:     if (*num == 2) {
1082:       num++;
1083:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1084:     } else if (*num == 3) { /* corner shared by three elements */
1085:       num++;
1086:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088:     } else if (*num == 4) { /* corner shared by four elements */
1089:       num++;
1090:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1091:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1092:       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1093:     } else { /* general case ... odd geoms ... 3D*/
1094:       num++;
1095:       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1096:     }
1097:   }
1098:   PetscFunctionReturn(PETSC_SUCCESS);
1099: }

1101: /******************************************************************************/
1102: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1103: {
1104:   PetscInt    *num, *map, **reduce;
1105:   PetscScalar *base;

1107:   PetscFunctionBegin;
1108:   num    = gs->num_gop_local_reduce;
1109:   reduce = gs->gop_local_reduce;
1110:   while ((map = *reduce++)) {
1111:     base = vals + map[0] * step;

1113:     /* wall */
1114:     if (*num == 2) {
1115:       num++;
1116:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1117:     } else if (*num == 3) { /* corner shared by three elements */
1118:       num++;
1119:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121:     } else if (*num == 4) { /* corner shared by four elements */
1122:       num++;
1123:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1124:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1125:       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1126:     } else { /* general case ... odd geoms ... 3D*/
1127:       num++;
1128:       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1129:     }
1130:   }
1131:   PetscFunctionReturn(PETSC_SUCCESS);
1132: }

1134: /******************************************************************************/
1135: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1136: {
1137:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1138:   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1139:   PetscInt    *pw, *list, *size, **nodes;
1140:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1141:   MPI_Status   status;
1142:   PetscBLASInt i1 = 1, dstep;

1144:   PetscFunctionBegin;
1145:   /* strip and load s */
1146:   msg_list = list = gs->pair_list;
1147:   msg_size = size = gs->msg_sizes;
1148:   msg_nodes = nodes = gs->node_list;
1149:   iptr = pw = gs->pw_elm_list;
1150:   dptr1 = dptr3 = gs->pw_vals;
1151:   msg_ids_in = ids_in = gs->msg_ids_in;
1152:   msg_ids_out = ids_out = gs->msg_ids_out;
1153:   dptr2                 = gs->out;
1154:   in1 = in2 = gs->in;

1156:   /* post the receives */
1157:   /*  msg_nodes=nodes; */
1158:   do {
1159:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1160:         second one *list and do list++ afterwards */
1161:     PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1162:     list++;
1163:     msg_ids_in++;
1164:     in1 += *size++ * step;
1165:   } while (*++msg_nodes);
1166:   msg_nodes = nodes;

1168:   /* load gs values into in out gs buffers */
1169:   while (*iptr >= 0) {
1170:     PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1171:     dptr3 += step;
1172:     iptr++;
1173:   }

1175:   /* load out buffers and post the sends */
1176:   while ((iptr = *msg_nodes++)) {
1177:     dptr3 = dptr2;
1178:     while (*iptr >= 0) {
1179:       PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1180:       dptr2 += step;
1181:       iptr++;
1182:     }
1183:     PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1184:     msg_size++;
1185:     msg_list++;
1186:     msg_ids_out++;
1187:   }

1189:   /* tree */
1190:   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));

1192:   /* process the received data */
1193:   msg_nodes = nodes;
1194:   while ((iptr = *nodes++)) {
1195:     PetscScalar d1 = 1.0;

1197:     /* Should I check the return value of MPI_Wait() or status? */
1198:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1199:     PetscCallMPI(MPI_Wait(ids_in, &status));
1200:     ids_in++;
1201:     while (*iptr >= 0) {
1202:       PetscCall(PetscBLASIntCast(step, &dstep));
1203:       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1204:       in2 += step;
1205:       iptr++;
1206:     }
1207:   }

1209:   /* replace vals */
1210:   while (*pw >= 0) {
1211:     PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1212:     dptr1 += step;
1213:     pw++;
1214:   }

1216:   /* clear isend message handles */
1217:   /* This changed for clarity though it could be the same */

1219:   /* Should I check the return value of MPI_Wait() or status? */
1220:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1221:   while (*msg_nodes++) {
1222:     PetscCallMPI(MPI_Wait(ids_out, &status));
1223:     ids_out++;
1224:   }
1225:   PetscFunctionReturn(PETSC_SUCCESS);
1226: }

1228: /******************************************************************************/
1229: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1230: {
1231:   PetscInt     size, *in, *out;
1232:   PetscScalar *buf, *work;
1233:   PetscInt     op[] = {GL_ADD, 0};
1234:   PetscBLASInt i1   = 1;
1235:   PetscBLASInt dstep;

1237:   PetscFunctionBegin;
1238:   /* copy over to local variables */
1239:   in   = gs->tree_map_in;
1240:   out  = gs->tree_map_out;
1241:   buf  = gs->tree_buf;
1242:   work = gs->tree_work;
1243:   size = gs->tree_nel * step;

1245:   /* zero out collection buffer */
1246:   PetscCall(PCTFS_rvec_zero(buf, size));

1248:   /* copy over my contributions */
1249:   while (*in >= 0) {
1250:     PetscCall(PetscBLASIntCast(step, &dstep));
1251:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1252:   }

1254:   /* perform fan in/out on full buffer */
1255:   /* must change PCTFS_grop to handle the blas */
1256:   PetscCall(PCTFS_grop(buf, work, size, op));

1258:   /* reset */
1259:   in  = gs->tree_map_in;
1260:   out = gs->tree_map_out;

1262:   /* get the portion of the results I need */
1263:   while (*in >= 0) {
1264:     PetscCall(PetscBLASIntCast(step, &dstep));
1265:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1266:   }
1267:   PetscFunctionReturn(PETSC_SUCCESS);
1268: }

1270: /******************************************************************************/
1271: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1272: {
1273:   PetscFunctionBegin;
1274:   switch (*op) {
1275:   case '+':
1276:     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1277:     break;
1278:   default:
1279:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1280:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1281:     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1282:     break;
1283:   }
1284:   PetscFunctionReturn(PETSC_SUCCESS);
1285: }

1287: /******************************************************************************/
1288: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1289: {
1290:   PetscFunctionBegin;
1291:   /* if there's nothing to do return */
1292:   if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);

1294:   /* can't do more dimensions then exist */
1295:   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

1297:   /* local only operations!!! */
1298:   if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));

1300:   /* if intersection tree/pairwise and local isn't empty */
1301:   if (gs->num_local_gop) {
1302:     PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));

1304:     /* pairwise will do tree inside ... */
1305:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1306:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));

1308:     PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1309:   } else { /* if intersection tree/pairwise and local is empty */
1310:     /* pairwise will do tree inside */
1311:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1312:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1313:   }
1314:   PetscFunctionReturn(PETSC_SUCCESS);
1315: }

1317: /******************************************************************************/
1318: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1319: {
1320:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1321:   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1322:   PetscInt    *pw, *list, *size, **nodes;
1323:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1324:   MPI_Status   status;
1325:   PetscInt     i, mask = 1;

1327:   PetscFunctionBegin;
1328:   for (i = 1; i < dim; i++) {
1329:     mask <<= 1;
1330:     mask++;
1331:   }

1333:   /* strip and load s */
1334:   msg_list = list = gs->pair_list;
1335:   msg_size = size = gs->msg_sizes;
1336:   msg_nodes = nodes = gs->node_list;
1337:   iptr = pw = gs->pw_elm_list;
1338:   dptr1 = dptr3 = gs->pw_vals;
1339:   msg_ids_in = ids_in = gs->msg_ids_in;
1340:   msg_ids_out = ids_out = gs->msg_ids_out;
1341:   dptr2                 = gs->out;
1342:   in1 = in2 = gs->in;

1344:   /* post the receives */
1345:   /*  msg_nodes=nodes; */
1346:   do {
1347:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1348:         second one *list and do list++ afterwards */
1349:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1350:       PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1351:       list++;
1352:       msg_ids_in++;
1353:       in1 += *size++;
1354:     } else {
1355:       list++;
1356:       size++;
1357:     }
1358:   } while (*++msg_nodes);

1360:   /* load gs values into in out gs buffers */
1361:   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

1363:   /* load out buffers and post the sends */
1364:   msg_nodes = nodes;
1365:   list      = msg_list;
1366:   while ((iptr = *msg_nodes++)) {
1367:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1368:       dptr3 = dptr2;
1369:       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1370:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1371:       /* is msg_ids_out++ correct? */
1372:       PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1373:       msg_size++;
1374:       list++;
1375:       msg_ids_out++;
1376:     } else {
1377:       list++;
1378:       msg_size++;
1379:     }
1380:   }

1382:   /* do the tree while we're waiting */
1383:   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

1385:   /* process the received data */
1386:   msg_nodes = nodes;
1387:   list      = msg_list;
1388:   while ((iptr = *nodes++)) {
1389:     if ((PCTFS_my_id | mask) == (*list | mask)) {
1390:       /* Should I check the return value of MPI_Wait() or status? */
1391:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1392:       PetscCallMPI(MPI_Wait(ids_in, &status));
1393:       ids_in++;
1394:       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1395:     }
1396:     list++;
1397:   }

1399:   /* replace vals */
1400:   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

1402:   /* clear isend message handles */
1403:   /* This changed for clarity though it could be the same */
1404:   while (*msg_nodes++) {
1405:     if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1406:       /* Should I check the return value of MPI_Wait() or status? */
1407:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1408:       PetscCallMPI(MPI_Wait(ids_out, &status));
1409:       ids_out++;
1410:     }
1411:     msg_list++;
1412:   }
1413:   PetscFunctionReturn(PETSC_SUCCESS);
1414: }

1416: /******************************************************************************/
1417: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1418: {
1419:   PetscInt     size;
1420:   PetscInt    *in, *out;
1421:   PetscScalar *buf, *work;
1422:   PetscInt     op[] = {GL_ADD, 0};

1424:   PetscFunctionBegin;
1425:   in   = gs->tree_map_in;
1426:   out  = gs->tree_map_out;
1427:   buf  = gs->tree_buf;
1428:   work = gs->tree_work;
1429:   size = gs->tree_nel;

1431:   PetscCall(PCTFS_rvec_zero(buf, size));

1433:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1435:   in  = gs->tree_map_in;
1436:   out = gs->tree_map_out;

1438:   PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));

1440:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1441:   PetscFunctionReturn(PETSC_SUCCESS);
1442: }