-: 0:Source:/home/MPI/testing/mpich2/mpich2/src/mpi/coll/bcast.c
-: 0:Graph:bcast.gcno
-: 0:Data:bcast.gcda
-: 0:Runs:4383
-: 0:Programs:1376
-: 1:/* -*- Mode: C; c-basic-offset:4 ; -*- */
-: 2:/*
-: 3: *
-: 4: * (C) 2001 by Argonne National Laboratory.
-: 5: * See COPYRIGHT in top-level directory.
-: 6: */
-: 7:
-: 8:#include "mpiimpl.h"
-: 9:
-: 10:/* -- Begin Profiling Symbol Block for routine MPI_Bcast */
-: 11:#if defined(HAVE_PRAGMA_WEAK)
-: 12:#pragma weak MPI_Bcast = PMPI_Bcast
-: 13:#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
-: 14:#pragma _HP_SECONDARY_DEF PMPI_Bcast MPI_Bcast
-: 15:#elif defined(HAVE_PRAGMA_CRI_DUP)
-: 16:#pragma _CRI duplicate MPI_Bcast as PMPI_Bcast
-: 17:#endif
-: 18:/* -- End Profiling Symbol Block */
-: 19:
-: 20:/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
-: 21: the MPI routines */
-: 22:#ifndef MPICH_MPI_FROM_PMPI
-: 23:#undef MPI_Bcast
-: 24:#define MPI_Bcast PMPI_Bcast
-: 25:
-: 26:/* FIXME move to somewhere else */
-: 27:/* Returns non-zero if val is a power of two. If ceil_pof2 is non-NULL, it sets
-: 28: *ceil_pof2 to the power of two that is just larger than or equal to val.
-: 29: That is, it rounds up to the nearest power of two. */
-: 30:static inline int MPIU_is_pof2(int val, int *ceil_pof2)
-: 31:{
23750: 32: int pof2 = 1;
-: 33:
113662: 34: while (pof2 < val)
89912: 35: pof2 *= 2;
23750: 36: if (ceil_pof2)
|
#####: 37: *ceil_pof2 = pof2;
-: 38:
|
23750: 39: if (pof2 == val)
3816: 40: return 1;
-: 41: else
19934: 42: return 0;
-: 43:}
-: 44:
-: 45:
-: 46:/* A binomial tree broadcast algorithm. Good for short messages,
-: 47: Cost = lgp.alpha + n.lgp.beta */
-: 48:#undef FUNCNAME
-: 49:#define FUNCNAME MPIR_Bcast_binomial
-: 50:#undef FCNAME
-: 51:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 52:static int MPIR_Bcast_binomial(
-: 53: void *buffer,
-: 54: int count,
-: 55: MPI_Datatype datatype,
-: 56: int root,
-: 57: MPID_Comm *comm_ptr)
2052386: 58:{
-: 59: int rank, comm_size, src, dst;
-: 60: int relative_rank, mask;
2052386: 61: int mpi_errno = MPI_SUCCESS;
2052386: 62: int nbytes=0;
-: 63: int type_size, is_contig, is_homogeneous;
-: 64: int position;
2052386: 65: void *tmp_buf=NULL;
-: 66: MPI_Comm comm;
-: 67: MPID_Datatype *dtp;
2052386: 68: MPIU_CHKLMEM_DECL(1);
-: 69:
2052386: 70: comm = comm_ptr->handle;
2052386: 71: comm_size = comm_ptr->local_size;
2052386: 72: rank = comm_ptr->rank;
-: 73:
-: 74: /* If there is only one process, return */
2052386: 75: if (comm_size == 1) goto fn_exit;
-: 76:
2022389: 77: if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
1993755: 78: is_contig = 1;
-: 79: else {
28634: 80: MPID_Datatype_get_ptr(datatype, dtp);
28634: 81: is_contig = dtp->is_contig;
-: 82: }
-: 83:
2022389: 84: is_homogeneous = 1;
-: 85:#ifdef MPID_HAS_HETERO
-: 86: if (comm_ptr->is_hetero)
-: 87: is_homogeneous = 0;
-: 88:#endif
-: 89:
-: 90: /* MPI_Type_size() might not give the accurate size of the packed
-: 91: * datatype for heterogeneous systems (because of padding, encoding,
-: 92: * etc). On the other hand, MPI_Pack_size() can become very
-: 93: * expensive, depending on the implementation, especially for
-: 94: * heterogeneous systems. We want to use MPI_Type_size() wherever
-: 95: * possible, and MPI_Pack_size() in other places.
-: 96: */
2022389: 97: if (is_homogeneous) {
2022389: 98: MPID_Datatype_get_size_macro(datatype, type_size);
-: 99: }
-: 100: else {
|
#####: 101: mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
|
#####: 102: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 103: }
|
2022389: 104: nbytes = type_size * count;
-: 105:
2022389: 106: if (!is_contig || !is_homogeneous)
-: 107: {
8826: 108: MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
-: 109:
-: 110: /* TODO: Pipeline the packing and communication */
8826: 111: position = 0;
8826: 112: if (rank == root)
1553: 113: NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
-: 114: &position, comm);
-: 115: }
-: 116:
2022389: 117: relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
-: 118:
-: 119: /* Use short message algorithm, namely, binomial tree */
-: 120:
-: 121: /* Algorithm:
-: 122: This uses a fairly basic recursive subdivision algorithm.
-: 123: The root sends to the process comm_size/2 away; the receiver becomes
-: 124: a root for a subtree and applies the same process.
-: 125:
-: 126: So that the new root can easily identify the size of its
-: 127: subtree, the (subtree) roots are all powers of two (relative
-: 128: to the root) If m = the first power of 2 such that 2^m >= the
-: 129: size of the communicator, then the subtree at root at 2^(m-k)
-: 130: has size 2^k (with special handling for subtrees that aren't
-: 131: a power of two in size).
-: 132:
-: 133: Do subdivision. There are two phases:
-: 134: 1. Wait for arrival of data. Because of the power of two nature
-: 135: of the subtree roots, the source of this message is alwyas the
-: 136: process whose relative rank has the least significant 1 bit CLEARED.
-: 137: That is, process 4 (100) receives from process 0, process 7 (111)
-: 138: from process 6 (110), etc.
-: 139: 2. Forward to my subtree
-: 140:
-: 141: Note that the process that is the tree root is handled automatically
-: 142: by this code, since it has no bits set. */
-: 143:
2022389: 144: mask = 0x1;
5547079: 145: while (mask < comm_size)
-: 146: {
2836102: 147: if (relative_rank & mask)
-: 148: {
1333801: 149: src = rank - mask;
1333801: 150: if (src < 0) src += comm_size;
1333801: 151: if (!is_contig || !is_homogeneous)
7273: 152: mpi_errno = MPIC_Recv(tmp_buf,nbytes,MPI_BYTE,src,
-: 153: MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
-: 154: else
1326528: 155: mpi_errno = MPIC_Recv(buffer,count,datatype,src,
-: 156: MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
|
1333801: 157: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 158: break;
-: 159: }
|
1502301: 160: mask <<= 1;
-: 161: }
-: 162:
-: 163: /* This process is responsible for all processes that have bits
-: 164: set from the LSB upto (but not including) mask. Because of
-: 165: the "not including", we start by shifting mask back down one.
-: 166:
-: 167: We can easily change to a different algorithm at any power of two
-: 168: by changing the test (mask > 1) to (mask > block_size)
-: 169:
-: 170: One such version would use non-blocking operations for the last 2-4
-: 171: steps (this also bounds the number of MPI_Requests that would
-: 172: be needed). */
-: 173:
2022388: 174: mask >>= 1;
5547077: 175: while (mask > 0)
-: 176: {
1502301: 177: if (relative_rank + mask < comm_size)
-: 178: {
1333800: 179: dst = rank + mask;
1333800: 180: if (dst >= comm_size) dst -= comm_size;
1333800: 181: if (!is_contig || !is_homogeneous)
7553: 182: mpi_errno = MPIC_Send(tmp_buf,nbytes,MPI_BYTE,dst,
-: 183: MPIR_BCAST_TAG,comm);
-: 184: else
1326247: 185: mpi_errno = MPIC_Send(buffer,count,datatype,dst,
-: 186: MPIR_BCAST_TAG,comm);
|
1333800: 187: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 188: }
|
1502301: 189: mask >>= 1;
-: 190: }
-: 191:
2022388: 192: if (!is_contig || !is_homogeneous)
-: 193: {
8826: 194: if (rank != root)
-: 195: {
7273: 196: position = 0;
7273: 197: NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
-: 198: datatype, comm);
-: 199: }
-: 200: }
-: 201:
-: 202:fn_exit:
8826: 203: MPIU_CHKLMEM_FREEALL();
2052386: 204: return mpi_errno;
|
-: 205:fn_fail:
-: 206: goto fn_exit;
-: 207:}
-: 208:
-: 209:/* FIXME it would be nice if we could refactor things to minimize
-: 210: duplication between this and MPIR_Scatter and friends. We can't use
-: 211: MPIR_Scatter as is without inducing an extra copy in the noncontig case. */
-: 212:/* There are additional arguments included here that are unused because we
-: 213: always assume that the noncontig case has been packed into a contig case by
-: 214: the caller for now. Once we start handling noncontig data at the upper level
-: 215: we can start handling it here.
-: 216:
-: 217: At the moment this function always scatters a buffer of nbytes starting at
-: 218: tmp_buf address. */
-: 219:#undef FUNCNAME
-: 220:#define FUNCNAME scatter_for_bcast
-: 221:#undef FCNAME
-: 222:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 223:static int scatter_for_bcast(
-: 224: void *buffer ATTRIBUTE((unused)),
-: 225: int count ATTRIBUTE((unused)),
-: 226: MPI_Datatype datatype ATTRIBUTE((unused)),
-: 227: int root,
-: 228: MPID_Comm *comm_ptr,
-: 229: int nbytes,
-: 230: void *tmp_buf,
-: 231: int is_contig,
-: 232: int is_homogeneous)
|
22654: 233:{
-: 234: MPI_Status status;
-: 235: int rank, comm_size, src, dst;
-: 236: int relative_rank, mask;
22654: 237: int mpi_errno = MPI_SUCCESS;
22654: 238: int scatter_size, curr_size, recv_size = 0, send_size;
-: 239: MPI_Comm comm;
-: 240:
22654: 241: comm = comm_ptr->handle;
22654: 242: comm_size = comm_ptr->local_size;
22654: 243: rank = comm_ptr->rank;
22654: 244: relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
-: 245:
-: 246: /* use long message algorithm: binomial tree scatter followed by an allgather */
-: 247:
-: 248: /* The scatter algorithm divides the buffer into nprocs pieces and
-: 249: scatters them among the processes. Root gets the first piece,
-: 250: root+1 gets the second piece, and so forth. Uses the same binomial
-: 251: tree algorithm as above. Ceiling division
-: 252: is used to compute the size of each piece. This means some
-: 253: processes may not get any data. For example if bufsize = 97 and
-: 254: nprocs = 16, ranks 15 and 16 will get 0 data. On each process, the
-: 255: scattered data is stored at the same offset in the buffer as it is
-: 256: on the root process. */
-: 257:
22654: 258: scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
22654: 259: curr_size = (rank == root) ? nbytes : 0; /* root starts with all the
-: 260: data */
-: 261:
22654: 262: mask = 0x1;
70724: 263: while (mask < comm_size)
-: 264: {
45742: 265: if (relative_rank & mask)
-: 266: {
20326: 267: src = rank - mask;
20326: 268: if (src < 0) src += comm_size;
20326: 269: recv_size = nbytes - relative_rank*scatter_size;
-: 270: /* recv_size is larger than what might actually be sent by the
-: 271: sender. We don't need compute the exact value because MPI
-: 272: allows you to post a larger recv.*/
20326: 273: if (recv_size <= 0)
-: 274: {
|
#####: 275: curr_size = 0; /* this process doesn't receive any data
-: 276: because of uneven division */
-: 277: }
-: 278: else
-: 279: {
|
20326: 280: mpi_errno = MPIC_Recv(((char *)tmp_buf +
-: 281: relative_rank*scatter_size),
-: 282: recv_size, MPI_BYTE, src,
-: 283: MPIR_BCAST_TAG, comm, &status);
|
20326: 284: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 285:
-: 286: /* query actual size of data received */
|
20326: 287: NMPI_Get_count(&status, MPI_BYTE, &curr_size);
-: 288: }
-: 289: break;
-: 290: }
25416: 291: mask <<= 1;
-: 292: }
-: 293:
-: 294: /* This process is responsible for all processes that have bits
-: 295: set from the LSB upto (but not including) mask. Because of
-: 296: the "not including", we start by shifting mask back down
-: 297: one. */
-: 298:
22654: 299: mask >>= 1;
70724: 300: while (mask > 0)
-: 301: {
25416: 302: if (relative_rank + mask < comm_size)
-: 303: {
20326: 304: send_size = curr_size - scatter_size * mask;
-: 305: /* mask is also the size of this process's subtree */
-: 306:
20326: 307: if (send_size > 0)
-: 308: {
20326: 309: dst = rank + mask;
20326: 310: if (dst >= comm_size) dst -= comm_size;
20326: 311: mpi_errno = MPIC_Send (((char *)tmp_buf +
-: 312: scatter_size*(relative_rank+mask)),
-: 313: send_size, MPI_BYTE, dst,
-: 314: MPIR_BCAST_TAG, comm);
|
20326: 315: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 316:
|
20326: 317: curr_size -= send_size;
-: 318: }
-: 319: }
25416: 320: mask >>= 1;
-: 321: }
-: 322:
22654: 323:fn_exit:
22654: 324: return mpi_errno;
|
-: 325:fn_fail:
-: 326: goto fn_exit;
-: 327:}
-: 328:
-: 329:/*
-: 330: Broadcast based on a scatter followed by an allgather.
-: 331:
-: 332: We first scatter the buffer using a binomial tree algorithm. This costs
-: 333: lgp.alpha + n.((p-1)/p).beta
-: 334: If the datatype is contiguous and the communicator is homogeneous,
-: 335: we treat the data as bytes and divide (scatter) it among processes
-: 336: by using ceiling division. For the noncontiguous or heterogeneous
-: 337: cases, we first pack the data into a temporary buffer by using
-: 338: MPI_Pack, scatter it as bytes, and unpack it after the allgather.
-: 339:
-: 340: For the allgather, we use a recursive doubling algorithm for
-: 341: medium-size messages and power-of-two number of processes. This
-: 342: takes lgp steps. In each step pairs of processes exchange all the
-: 343: data they have (we take care of non-power-of-two situations). This
-: 344: costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
-: 345: because it may be slightly more in the non-power-of-two case, but
-: 346: it's still a logarithmic algorithm.) Therefore, for long messages
-: 347: Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
-: 348:*/
-: 349:#undef FUNCNAME
-: 350:#define FUNCNAME MPIR_Bcast_scatter_doubling_allgather
-: 351:#undef FCNAME
-: 352:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 353:static int MPIR_Bcast_scatter_doubling_allgather(
-: 354: void *buffer,
-: 355: int count,
-: 356: MPI_Datatype datatype,
-: 357: int root,
-: 358: MPID_Comm *comm_ptr)
|
424: 359:{
-: 360: MPI_Status status;
-: 361: int rank, comm_size, dst;
-: 362: int relative_rank, mask;
424: 363: int mpi_errno = MPI_SUCCESS;
424: 364: int scatter_size, nbytes=0, curr_size, recv_size = 0;
-: 365: int type_size, j, k, i, tmp_mask, is_contig, is_homogeneous;
-: 366: int relative_dst, dst_tree_root, my_tree_root, send_offset;
-: 367: int recv_offset, tree_root, nprocs_completed, offset, position;
424: 368: MPIU_CHKLMEM_DECL(1);
-: 369: MPI_Comm comm;
-: 370: MPID_Datatype *dtp;
-: 371: MPI_Aint true_extent, true_lb;
-: 372: void *tmp_buf;
-: 373:
424: 374: comm = comm_ptr->handle;
424: 375: comm_size = comm_ptr->local_size;
424: 376: rank = comm_ptr->rank;
424: 377: relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
-: 378:
-: 379: /* If there is only one process, return */
424: 380: if (comm_size == 1) goto fn_exit;
-: 381:
|
#####: 382: if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
#####: 383: is_contig = 1;
-: 384: else {
#####: 385: MPID_Datatype_get_ptr(datatype, dtp);
#####: 386: is_contig = dtp->is_contig;
-: 387: }
-: 388:
#####: 389: is_homogeneous = 1;
-: 390:#ifdef MPID_HAS_HETERO
-: 391: if (comm_ptr->is_hetero)
-: 392: is_homogeneous = 0;
-: 393:#endif
-: 394:
-: 395: /* MPI_Type_size() might not give the accurate size of the packed
-: 396: * datatype for heterogeneous systems (because of padding, encoding,
-: 397: * etc). On the other hand, MPI_Pack_size() can become very
-: 398: * expensive, depending on the implementation, especially for
-: 399: * heterogeneous systems. We want to use MPI_Type_size() wherever
-: 400: * possible, and MPI_Pack_size() in other places.
-: 401: */
#####: 402: if (is_homogeneous) {
#####: 403: MPID_Datatype_get_size_macro(datatype, type_size);
-: 404: }
-: 405: else {
#####: 406: mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
|
#####: 407: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 408: }
|
#####: 409: nbytes = type_size * count;
-: 410:
#####: 411: if (is_contig && is_homogeneous)
-: 412: {
-: 413: /* contiguous and homogeneous. no need to pack. */
#####: 414: mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb,
-: 415: &true_extent);
|
#####: 416: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 417:
|
#####: 418: tmp_buf = (char *) buffer + true_lb;
-: 419: }
-: 420: else
-: 421: {
#####: 422: MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
-: 423:
-: 424: /* TODO: Pipeline the packing and communication */
#####: 425: position = 0;
#####: 426: if (rank == root)
#####: 427: NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
-: 428: &position, comm);
-: 429: }
-: 430:
-: 431:
#####: 432: scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
#####: 433: curr_size = (rank == root) ? nbytes : 0; /* root starts with all the
-: 434: data */
-: 435:
-: 436:
#####: 437: mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
-: 438: nbytes, tmp_buf, is_contig, is_homogeneous);
|
#####: 439: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 440:
-: 441: /* medium size allgather and pof2 comm_size. use recurive doubling. */
-: 442:
|
#####: 443: mask = 0x1;
#####: 444: i = 0;
#####: 445: while (mask < comm_size)
-: 446: {
#####: 447: relative_dst = relative_rank ^ mask;
-: 448:
#####: 449: dst = (relative_dst + root) % comm_size;
-: 450:
-: 451: /* find offset into send and recv buffers.
-: 452: zero out the least significant "i" bits of relative_rank and
-: 453: relative_dst to find root of src and dst
-: 454: subtrees. Use ranks of roots as index to send from
-: 455: and recv into buffer */
-: 456:
#####: 457: dst_tree_root = relative_dst >> i;
#####: 458: dst_tree_root <<= i;
-: 459:
#####: 460: my_tree_root = relative_rank >> i;
#####: 461: my_tree_root <<= i;
-: 462:
#####: 463: send_offset = my_tree_root * scatter_size;
#####: 464: recv_offset = dst_tree_root * scatter_size;
-: 465:
#####: 466: if (relative_dst < comm_size)
-: 467: {
#####: 468: mpi_errno = MPIC_Sendrecv(((char *)tmp_buf + send_offset),
-: 469: curr_size, MPI_BYTE, dst, MPIR_BCAST_TAG,
-: 470: ((char *)tmp_buf + recv_offset),
-: 471: (nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset),
-: 472: MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status);
|
#####: 473: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 474:
|
#####: 475: NMPI_Get_count(&status, MPI_BYTE, &recv_size);
#####: 476: curr_size += recv_size;
-: 477: }
-: 478:
-: 479: /* if some processes in this process's subtree in this step
-: 480: did not have any destination process to communicate with
-: 481: because of non-power-of-two, we need to send them the
-: 482: data that they would normally have received from those
-: 483: processes. That is, the haves in this subtree must send to
-: 484: the havenots. We use a logarithmic recursive-halfing algorithm
-: 485: for this. */
-: 486:
-: 487: /* This part of the code will not currently be
-: 488: executed because we are not using recursive
-: 489: doubling for non power of two. Mark it as experimental
-: 490: so that it doesn't show up as red in the coverage tests. */
-: 491:
|
-: 492: /* --BEGIN EXPERIMENTAL-- */
#####: 493: if (dst_tree_root + mask > comm_size)
-: 494: {
#####: 495: nprocs_completed = comm_size - my_tree_root - mask;
-: 496: /* nprocs_completed is the number of processes in this
-: 497: subtree that have all the data. Send data to others
-: 498: in a tree fashion. First find root of current tree
-: 499: that is being divided into two. k is the number of
-: 500: least-significant bits in this process's rank that
-: 501: must be zeroed out to find the rank of the root */
#####: 502: j = mask;
#####: 503: k = 0;
#####: 504: while (j)
-: 505: {
#####: 506: j >>= 1;
#####: 507: k++;
-: 508: }
#####: 509: k--;
-: 510:
#####: 511: offset = scatter_size * (my_tree_root + mask);
#####: 512: tmp_mask = mask >> 1;
-: 513:
#####: 514: while (tmp_mask)
-: 515: {
#####: 516: relative_dst = relative_rank ^ tmp_mask;
#####: 517: dst = (relative_dst + root) % comm_size;
-: 518:
#####: 519: tree_root = relative_rank >> k;
#####: 520: tree_root <<= k;
-: 521:
-: 522: /* send only if this proc has data and destination
-: 523: doesn't have data. */
-: 524:
-: 525: /* if (rank == 3) {
-: 526: printf("rank %d, dst %d, root %d, nprocs_completed %d\n", relative_rank, relative_dst, tree_root, nprocs_completed);
-: 527: fflush(stdout);
-: 528: }*/
-: 529:
#####: 530: if ((relative_dst > relative_rank) &&
-: 531: (relative_rank < tree_root + nprocs_completed)
-: 532: && (relative_dst >= tree_root + nprocs_completed))
-: 533: {
-: 534:
-: 535: /* printf("Rank %d, send to %d, offset %d, size %d\n", rank, dst, offset, recv_size);
-: 536: fflush(stdout); */
#####: 537: mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
-: 538: recv_size, MPI_BYTE, dst,
-: 539: MPIR_BCAST_TAG, comm);
-: 540: /* recv_size was set in the previous
-: 541: receive. that's the amount of data to be
-: 542: sent now. */
#####: 543: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 544: }
-: 545: /* recv only if this proc. doesn't have data and sender
-: 546: has data */
#####: 547: else if ((relative_dst < relative_rank) &&
-: 548: (relative_dst < tree_root + nprocs_completed) &&
-: 549: (relative_rank >= tree_root + nprocs_completed))
-: 550: {
-: 551: /* printf("Rank %d waiting to recv from rank %d\n",
-: 552: relative_rank, dst); */
#####: 553: mpi_errno = MPIC_Recv(((char *)tmp_buf + offset),
-: 554: nbytes - offset,
-: 555: MPI_BYTE, dst, MPIR_BCAST_TAG,
-: 556: comm, &status);
-: 557: /* nprocs_completed is also equal to the no. of processes
-: 558: whose data we don't have */
#####: 559: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 560:
#####: 561: NMPI_Get_count(&status, MPI_BYTE, &recv_size);
#####: 562: curr_size += recv_size;
-: 563: /* printf("Rank %d, recv from %d, offset %d, size %d\n", rank, dst, offset, recv_size);
-: 564: fflush(stdout);*/
-: 565: }
#####: 566: tmp_mask >>= 1;
#####: 567: k--;
-: 568: }
-: 569: }
-: 570: /* --END EXPERIMENTAL-- */
-: 571:
|
#####: 572: mask <<= 1;
#####: 573: i++;
-: 574: }
-: 575:
#####: 576: if (!is_contig || !is_homogeneous)
-: 577: {
#####: 578: if (rank != root)
-: 579: {
#####: 580: position = 0;
#####: 581: NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
-: 582: datatype, comm);
-: 583: }
-: 584: }
-: 585:
-: 586:fn_exit:
#####: 587: MPIU_CHKLMEM_FREEALL();
|
424: 588: return mpi_errno;
|
-: 589:fn_fail:
-: 590: goto fn_exit;
-: 591:}
-: 592:
-: 593:/*
-: 594: Broadcast based on a scatter followed by an allgather.
-: 595:
-: 596: We first scatter the buffer using a binomial tree algorithm. This costs
-: 597: lgp.alpha + n.((p-1)/p).beta
-: 598: If the datatype is contiguous and the communicator is homogeneous,
-: 599: we treat the data as bytes and divide (scatter) it among processes
-: 600: by using ceiling division. For the noncontiguous or heterogeneous
-: 601: cases, we first pack the data into a temporary buffer by using
-: 602: MPI_Pack, scatter it as bytes, and unpack it after the allgather.
-: 603:
-: 604: We use a ring algorithm for the allgather, which takes p-1 steps.
-: 605: This may perform better than recursive doubling for long messages and
-: 606: medium-sized non-power-of-two messages.
-: 607: Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
-: 608:*/
-: 609:#undef FUNCNAME
-: 610:#define FUNCNAME MPIR_Bcast_scatter_ring_allgather
-: 611:#undef FCNAME
-: 612:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 613:static int MPIR_Bcast_scatter_ring_allgather(
-: 614: void *buffer,
-: 615: int count,
-: 616: MPI_Datatype datatype,
-: 617: int root,
-: 618: MPID_Comm *comm_ptr)
|
22654: 619:{
-: 620: int rank, comm_size;
-: 621: int relative_rank;
22654: 622: int mpi_errno = MPI_SUCCESS;
-: 623: int scatter_size, nbytes;
-: 624: int type_size, j, i, is_contig, is_homogeneous;
-: 625: int position;
-: 626: int *recvcnts, *displs, left, right, jnext;
-: 627: void *tmp_buf;
-: 628: MPI_Comm comm;
-: 629: MPID_Datatype *dtp;
-: 630: MPI_Aint true_extent, true_lb;
22654: 631: MPIU_CHKLMEM_DECL(3);
-: 632:
22654: 633: comm = comm_ptr->handle;
22654: 634: comm_size = comm_ptr->local_size;
22654: 635: rank = comm_ptr->rank;
22654: 636: relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
-: 637:
-: 638: /* If there is only one process, return */
22654: 639: if (comm_size == 1) goto fn_exit;
-: 640:
22654: 641: if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
14374: 642: is_contig = 1;
-: 643: else {
8280: 644: MPID_Datatype_get_ptr(datatype, dtp);
8280: 645: is_contig = dtp->is_contig;
-: 646: }
-: 647:
22654: 648: is_homogeneous = 1;
-: 649:#ifdef MPID_HAS_HETERO
-: 650: if (comm_ptr->is_hetero)
-: 651: is_homogeneous = 0;
-: 652:#endif
-: 653:
-: 654: /* MPI_Type_size() might not give the accurate size of the packed
-: 655: * datatype for heterogeneous systems (because of padding, encoding,
-: 656: * etc). On the other hand, MPI_Pack_size() can become very
-: 657: * expensive, depending on the implementation, especially for
-: 658: * heterogeneous systems. We want to use MPI_Type_size() wherever
-: 659: * possible, and MPI_Pack_size() in other places.
-: 660: */
22654: 661: if (is_homogeneous) {
22654: 662: MPID_Datatype_get_size_macro(datatype, type_size);
-: 663: }
-: 664: else {
|
#####: 665: mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
|
#####: 666: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 667: }
|
22654: 668: nbytes = type_size * count;
-: 669:
22654: 670: if (is_contig && is_homogeneous)
-: 671: {
-: 672: /* contiguous and homogeneous. no need to pack. */
19010: 673: mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb,
-: 674: &true_extent);
|
19010: 675: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 676:
|
19010: 677: tmp_buf = (char *) buffer + true_lb;
-: 678: }
-: 679: else
-: 680: {
3644: 681: MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
-: 682:
-: 683: /* TODO: Pipeline the packing and communication */
3644: 684: position = 0;
3644: 685: if (rank == root)
796: 686: NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
-: 687: &position, comm);
-: 688: }
-: 689:
22654: 690: scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
-: 691:
22654: 692: mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
-: 693: nbytes, tmp_buf, is_contig, is_homogeneous);
|
22654: 694: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 695:
-: 696: /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
-: 697:
|
22654: 698: MPIU_CHKLMEM_MALLOC(recvcnts, int *, comm_size*sizeof(int), mpi_errno, "recvcnts");
22654: 699: MPIU_CHKLMEM_MALLOC(displs, int *, comm_size*sizeof(int), mpi_errno, "displs");
-: 700:
243656: 701: for (i=0; i<comm_size; i++)
-: 702: {
221002: 703: recvcnts[i] = nbytes - i*scatter_size;
221002: 704: if (recvcnts[i] > scatter_size)
198348: 705: recvcnts[i] = scatter_size;
221002: 706: if (recvcnts[i] < 0)
|
#####: 707: recvcnts[i] = 0;
-: 708: }
-: 709:
|
22654: 710: displs[0] = 0;
221002: 711: for (i=1; i<comm_size; i++)
198348: 712: displs[i] = displs[i-1] + recvcnts[i-1];
-: 713:
22654: 714: left = (comm_size + rank - 1) % comm_size;
22654: 715: right = (rank + 1) % comm_size;
-: 716:
22654: 717: j = rank;
22654: 718: jnext = left;
221002: 719: for (i=1; i<comm_size; i++)
-: 720: {
198348: 721: mpi_errno =
-: 722: MPIC_Sendrecv((char *)tmp_buf +
-: 723: displs[(j-root+comm_size)%comm_size],
-: 724: recvcnts[(j-root+comm_size)%comm_size],
-: 725: MPI_BYTE, right, MPIR_BCAST_TAG,
-: 726: (char *)tmp_buf +
-: 727: displs[(jnext-root+comm_size)%comm_size],
-: 728: recvcnts[(jnext-root+comm_size)%comm_size],
-: 729: MPI_BYTE, left,
-: 730: MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
|
198348: 731: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 732:
|
198348: 733: j = jnext;
198348: 734: jnext = (comm_size + jnext - 1) % comm_size;
-: 735: }
-: 736:
22654: 737: if (!is_contig || !is_homogeneous)
-: 738: {
3644: 739: if (rank != root)
-: 740: {
2848: 741: position = 0;
2848: 742: NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
-: 743: datatype, comm);
-: 744: }
-: 745: }
-: 746:
-: 747:fn_exit:
48952: 748: MPIU_CHKLMEM_FREEALL();
22654: 749: return mpi_errno;
|
-: 750:fn_fail:
-: 751: goto fn_exit;
-: 752:}
-: 753:
-: 754:/* A convenience function to de-clutter code and minimize copy-paste bugs.
-: 755: Invokes the coll_fns->Bcast override for the given communicator if non-null.
-: 756: Otherwise it invokes bcast_fn_ with the given args.
-: 757:
-: 758: NOTE: calls MPIU_ERR_POP on any failure, so a fn_fail label is needed. */
-: 759:#define MPIR_Bcast_fn_or_override(bcast_fn_,mpi_errno_,buffer_,count_,datatype_,root_,comm_ptr_) \
-: 760: do { \
-: 761: if (comm_ptr_->coll_fns != NULL && comm_ptr_->coll_fns->Bcast != NULL) \
-: 762: { \
-: 763: /* --BEGIN USEREXTENSION-- */ \
-: 764: mpi_errno_ = comm_ptr->coll_fns->Bcast(buffer_, count_, \
-: 765: datatype_, root_, comm_ptr_); \
-: 766: /* --END USEREXTENSION-- */ \
-: 767: } \
-: 768: else \
-: 769: { \
-: 770: mpi_errno_ = bcast_fn_(buffer_, count_, datatype_, root_, comm_ptr_); \
-: 771: } \
-: 772: if (mpi_errno_) MPIU_ERR_POP(mpi_errno_); \
-: 773: } while (0)
-: 774:
-: 775:/* FIXME This function uses some heuristsics based off of some testing on a
-: 776: * cluster at Argonne. We need a better system for detrmining and controlling
-: 777: * the cutoff points for these algorithms. If I've done this right, you should
-: 778: * be able to make changes along these lines almost exclusively in this function
-: 779: * and some new functions. [goodell@ 2008/01/07] */
-: 780:/* begin:nested */
-: 781:static int MPIR_SMP_Bcast(
-: 782: void *buffer,
-: 783: int count,
-: 784: MPI_Datatype datatype,
-: 785: int root,
-: 786: MPID_Comm *comm_ptr)
|
128232: 787:{
128232: 788: int mpi_errno = MPI_SUCCESS;
-: 789: int type_size, is_homogeneous;
128232: 790: int nbytes=0;
-: 791:
-: 792:#if !defined(USE_SMP_COLLECTIVES)
-: 793: MPIU_Assert(0);
-: 794:#endif
128232: 795: MPIU_Assert(MPIR_Comm_is_node_aware(comm_ptr));
-: 796:
128232: 797: is_homogeneous = 1;
-: 798:#ifdef MPID_HAS_HETERO
-: 799: if (comm_ptr->is_hetero)
-: 800: is_homogeneous = 0;
-: 801:#endif
-: 802:
-: 803: /* MPI_Type_size() might not give the accurate size of the packed
-: 804: * datatype for heterogeneous systems (because of padding, encoding,
-: 805: * etc). On the other hand, MPI_Pack_size() can become very
-: 806: * expensive, depending on the implementation, especially for
-: 807: * heterogeneous systems. We want to use MPI_Type_size() wherever
-: 808: * possible, and MPI_Pack_size() in other places.
-: 809: */
128232: 810: if (is_homogeneous) {
128232: 811: MPID_Datatype_get_size_macro(datatype, type_size);
-: 812: }
-: 813: else {
|
#####: 814: mpi_errno = NMPI_Pack_size(1, datatype, comm_ptr->handle, &type_size);
|
#####: 815: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 816: }
|
128232: 817: nbytes = type_size * count;
-: 818:
128232: 819: if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_ptr->local_size < MPIR_BCAST_MIN_PROCS))
-: 820: {
-: 821: /* send to intranode-rank 0 on the root's node */
102186: 822: if (comm_ptr->node_comm != NULL &&
-: 823: MPIU_Get_intranode_rank(comm_ptr, root) > 0) /* is not the node root (0) */
-: 824: { /* and is on our node (!-1) */
57112: 825: if (root == comm_ptr->rank) {
8066: 826: mpi_errno = MPIC_Send(buffer,count,datatype,0,
-: 827: MPIR_BCAST_TAG,comm_ptr->node_comm->handle);
-: 828: }
49046: 829: else if (0 == comm_ptr->node_comm->rank) {
8066: 830: mpi_errno = MPIC_Recv(buffer,count,datatype,MPIU_Get_intranode_rank(comm_ptr, root),
-: 831: MPIR_BCAST_TAG,comm_ptr->node_comm->handle,MPI_STATUS_IGNORE);
-: 832: }
|
57112: 833: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 834: }
-: 835:
-: 836: /* perform the internode broadcast */
|
102186: 837: if (comm_ptr->node_roots_comm != NULL)
-: 838: {
20786: 839: MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
-: 840: buffer, count, datatype,
-: 841: MPIU_Get_internode_rank(comm_ptr, root),
-: 842: comm_ptr->node_roots_comm);
-: 843: }
-: 844:
-: 845: /* perform the intranode broadcast on all except for the root's node */
102186: 846: if (comm_ptr->node_comm != NULL)
-: 847: {
102186: 848: MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
-: 849: buffer, count, datatype, 0, comm_ptr->node_comm);
-: 850: }
-: 851: }
-: 852: else /* (nbytes > MPIR_BCAST_SHORT_MSG) && (comm_ptr->size >= MPIR_BCAST_MIN_PROCS) */
-: 853: {
-: 854: /* supposedly...
-: 855: smp+doubling good for pof2
-: 856: reg+ring better for non-pof2 */
49372: 857: if (nbytes < MPIR_BCAST_LONG_MSG && MPIU_is_pof2(comm_ptr->local_size, NULL))
-: 858: {
-: 859: /* medium-sized msg and pof2 np */
-: 860:
-: 861: /* perform the intranode broadcast on the root's node */
3392: 862: if (comm_ptr->node_comm != NULL &&
-: 863: MPIU_Get_intranode_rank(comm_ptr, root) > 0) /* is not the node root (0) */
-: 864: { /* and is on our node (!-1) */
-: 865: /* FIXME binomial may not be the best algorithm for on-node
-: 866: bcast. We need a more comprehensive system for selecting the
-: 867: right algorithms here. */
2912: 868: MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
-: 869: buffer, count, datatype,
-: 870: MPIU_Get_intranode_rank(comm_ptr, root),
-: 871: comm_ptr->node_comm);
-: 872: }
-: 873:
-: 874: /* perform the internode broadcast */
3392: 875: if (comm_ptr->node_roots_comm != NULL)
-: 876: {
848: 877: if (MPIU_is_pof2(comm_ptr->node_roots_comm->local_size, NULL))
-: 878: {
424: 879: MPIR_Bcast_fn_or_override(MPIR_Bcast_scatter_doubling_allgather, mpi_errno,
-: 880: buffer, count, datatype,
-: 881: MPIU_Get_internode_rank(comm_ptr, root),
-: 882: comm_ptr->node_roots_comm);
-: 883: }
-: 884: else
-: 885: {
|
#####: 886: MPIR_Bcast_fn_or_override(MPIR_Bcast_scatter_ring_allgather, mpi_errno,
-: 887: buffer, count, datatype,
-: 888: MPIU_Get_internode_rank(comm_ptr, root),
-: 889: comm_ptr->node_roots_comm);
-: 890: }
-: 891: }
-: 892:
-: 893: /* perform the intranode broadcast on all except for the root's node */
|
3392: 894: if (comm_ptr->node_comm != NULL &&
-: 895: MPIU_Get_intranode_rank(comm_ptr, root) <= 0) /* 0 if root was local root too */
-: 896: { /* -1 if different node than root */
-: 897: /* FIXME binomial may not be the best algorithm for on-node
-: 898: bcast. We need a more comprehensive system for selecting the
-: 899: right algorithms here. */
480: 900: MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
-: 901: buffer, count, datatype, 0, comm_ptr->node_comm);
-: 902: }
-: 903: }
-: 904: else /* large msg or non-pof2 */
-: 905: {
-: 906: /* FIXME It would be good to have an SMP-aware version of this
-: 907: algorithm that (at least approximately) minimized internode
-: 908: communication. */
22654: 909: mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
|
22654: 910: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 911: }
-: 912: }
-: 913:
|
128232: 914:fn_exit:
128232: 915: return mpi_errno;
|
-: 916:fn_fail:
-: 917: goto fn_exit;
-: 918:}
-: 919:/* end:nested */
-: 920:
-: 921:/* This is the default implementation of broadcast. The algorithm is:
-: 922:
-: 923: Algorithm: MPI_Bcast
-: 924:
-: 925: For short messages, we use a binomial tree algorithm.
-: 926: Cost = lgp.alpha + n.lgp.beta
-: 927:
-: 928: For long messages, we do a scatter followed by an allgather.
-: 929: We first scatter the buffer using a binomial tree algorithm. This costs
-: 930: lgp.alpha + n.((p-1)/p).beta
-: 931: If the datatype is contiguous and the communicator is homogeneous,
-: 932: we treat the data as bytes and divide (scatter) it among processes
-: 933: by using ceiling division. For the noncontiguous or heterogeneous
-: 934: cases, we first pack the data into a temporary buffer by using
-: 935: MPI_Pack, scatter it as bytes, and unpack it after the allgather.
-: 936:
-: 937: For the allgather, we use a recursive doubling algorithm for
-: 938: medium-size messages and power-of-two number of processes. This
-: 939: takes lgp steps. In each step pairs of processes exchange all the
-: 940: data they have (we take care of non-power-of-two situations). This
-: 941: costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
-: 942: because it may be slightly more in the non-power-of-two case, but
-: 943: it's still a logarithmic algorithm.) Therefore, for long messages
-: 944: Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
-: 945:
-: 946: Note that this algorithm has twice the latency as the tree algorithm
-: 947: we use for short messages, but requires lower bandwidth: 2.n.beta
-: 948: versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
-: 949: this algorithm will perform better.
-: 950:
-: 951: For long messages and for medium-size messages and non-power-of-two
-: 952: processes, we use a ring algorithm for the allgather, which
-: 953: takes p-1 steps, because it performs better than recursive doubling.
-: 954: Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
-: 955:
-: 956: Possible improvements:
-: 957: For clusters of SMPs, we may want to do something differently to
-: 958: take advantage of shared memory on each node.
-: 959:
-: 960: End Algorithm: MPI_Bcast
-: 961:*/
-: 962:
-: 963:#undef FUNCNAME
-: 964:#define FUNCNAME MPIR_Bcast
-: 965:#undef FCNAME
-: 966:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 967:/* not declared static because it is called in intercomm. allgatherv */
-: 968:int MPIR_Bcast (
-: 969: void *buffer,
-: 970: int count,
-: 971: MPI_Datatype datatype,
-: 972: int root,
-: 973: MPID_Comm *comm_ptr )
|
2054590: 974:{
2054590: 975: int mpi_errno = MPI_SUCCESS;
-: 976: int comm_size;
2054590: 977: int nbytes=0;
-: 978: int type_size, is_homogeneous;
-: 979: MPI_Comm comm;
2054590: 980: MPIU_THREADPRIV_DECL;
-: 981: MPID_MPI_STATE_DECL(MPID_STATE_MPIR_BCAST);
-: 982:
-: 983: MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_BCAST);
-: 984:
-: 985: /* The various MPIR_Bcast_* impls use NMPI functions, so we bump the nest
-: 986: count here to avoid repeatedly calling incr/decr. */
2054590: 987: MPIU_THREADPRIV_GET;
2054590: 988: MPIR_Nest_incr();
-: 989:
-: 990: /* check if multiple threads are calling this collective function */
-: 991: MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
-: 992:
2054590: 993: if (count == 0) goto fn_exit;
-: 994:
-: 995:#if defined(USE_SMP_COLLECTIVES)
2054254: 996: if (MPIR_Comm_is_node_aware(comm_ptr)) {
128232: 997: mpi_errno = MPIR_SMP_Bcast(buffer, count, datatype, root, comm_ptr);
|
128232: 998: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 999: goto fn_exit;
-: 1000: }
-: 1001:#endif
-: 1002:
|
1926022: 1003: comm = comm_ptr->handle;
1926022: 1004: comm_size = comm_ptr->local_size;
-: 1005:
1926022: 1006: is_homogeneous = 1;
-: 1007:#ifdef MPID_HAS_HETERO
-: 1008: if (comm_ptr->is_hetero)
-: 1009: is_homogeneous = 0;
-: 1010:#endif
-: 1011:
-: 1012: /* MPI_Type_size() might not give the accurate size of the packed
-: 1013: * datatype for heterogeneous systems (because of padding, encoding,
-: 1014: * etc). On the other hand, MPI_Pack_size() can become very
-: 1015: * expensive, depending on the implementation, especially for
-: 1016: * heterogeneous systems. We want to use MPI_Type_size() wherever
-: 1017: * possible, and MPI_Pack_size() in other places.
-: 1018: */
1926022: 1019: if (is_homogeneous) {
1926022: 1020: MPID_Datatype_get_size_macro(datatype, type_size);
-: 1021: }
-: 1022: else {
|
#####: 1023: mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
|
#####: 1024: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1025: }
|
1926022: 1026: nbytes = type_size * count;
-: 1027:
1926022: 1028: if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_size < MPIR_BCAST_MIN_PROCS))
-: 1029: {
1926022: 1030: mpi_errno = MPIR_Bcast_binomial(buffer, count, datatype, root, comm_ptr);
|
1926022: 1031: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1032: }
-: 1033: else /* (nbytes >= MPIR_BCAST_SHORT_MSG) && (comm_size >= MPIR_BCAST_MIN_PROCS) */
-: 1034: {
|
#####: 1035: if ((nbytes < MPIR_BCAST_LONG_MSG) && (MPIU_is_pof2(comm_size, NULL)))
-: 1036: {
#####: 1037: mpi_errno = MPIR_Bcast_scatter_doubling_allgather(buffer, count, datatype, root, comm_ptr);
|
#####: 1038: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1039: }
-: 1040: else /* (nbytes >= MPIR_BCAST_LONG_MSG) || !(comm_size_is_pof2) */
-: 1041: {
-: 1042: /* We want the ring algorithm whether or not we have a
-: 1043: topologically aware communicator. Doing inter/intra-node
-: 1044: communication phases breaks the pipelining of the algorithm. */
|
#####: 1045: mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
|
#####: 1046: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1047: }
-: 1048: }
-: 1049:
|
2054590: 1050:fn_exit:
2054590: 1051: MPIR_Nest_decr();
-: 1052:
-: 1053: /* check if multiple threads are calling this collective function */
-: 1054: MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
-: 1055:
|
-: 1056: MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST);
-: 1057:
|
2054590: 1058: return mpi_errno;
|
-: 1059:fn_fail:
-: 1060: goto fn_exit;
-: 1061:}
-: 1062:
-: 1063:/* A simple utility function to that calls the comm_ptr->coll_fns->Bcast
-: 1064: override if it exists or else it calls MPIR_Bcast with the same arguments.
-: 1065: This function just makes the high-level broadcast logic easier to read while
-: 1066: still accomodating coll_fns-style overrides. It also reduces future errors
-: 1067: by eliminating the duplication of Bcast arguments.
-: 1068:
-: 1069: This routine is used in other files as well (barrier.c, allreduce.c)
-: 1070:
-: 1071: TODO This function should be deprecated in favor of a direct call to
-: 1072: MPIR_Bcast now that we handle SMP-awareness inside of MPIR_Bcast instead
-: 1073: of MPI_Bcast.
-: 1074:*/
-: 1075:#undef FUNCNAME
-: 1076:#define FUNCNAME MPIR_Bcast_or_coll_fn
-: 1077:#undef FCNAME
-: 1078:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 1079:int MPIR_Bcast_or_coll_fn(void *buffer,
-: 1080: int count,
-: 1081: MPI_Datatype datatype,
-: 1082: int root,
-: 1083: MPID_Comm *comm_ptr)
|
1912590: 1084:{
1912590: 1085: int mpi_errno = MPI_SUCCESS;
-: 1086:
1912590: 1087: if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL)
-: 1088: {
|
-: 1089: /* --BEGIN USEREXTENSION-- */
#####: 1090: mpi_errno = comm_ptr->node_roots_comm->coll_fns->Bcast(buffer, count,
-: 1091: datatype, root, comm_ptr);
-: 1092: /* --END USEREXTENSION-- */
-: 1093: }
-: 1094: else {
|
1912590: 1095: mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr);
-: 1096: }
-: 1097:
1912590: 1098: return mpi_errno;
-: 1099:}
-: 1100:
-: 1101:#undef FUNCNAME
-: 1102:#define FUNCNAME MPIR_Bcast_inter
-: 1103:#undef FCNAME
-: 1104:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 1105:/* begin:nested */
-: 1106:/* Not PMPI_LOCAL because it is called in intercomm allgather */
-: 1107:int MPIR_Bcast_inter (
-: 1108: void *buffer,
-: 1109: int count,
-: 1110: MPI_Datatype datatype,
-: 1111: int root,
-: 1112: MPID_Comm *comm_ptr )
7182: 1113:{
-: 1114:/* Intercommunicator broadcast.
-: 1115: Root sends to rank 0 in remote group. Remote group does local
-: 1116: intracommunicator broadcast.
-: 1117:*/
-: 1118: int rank, mpi_errno;
-: 1119: MPI_Status status;
7182: 1120: MPID_Comm *newcomm_ptr = NULL;
-: 1121: MPI_Comm comm;
-: 1122: MPID_MPI_STATE_DECL(MPID_STATE_MPIR_BCAST_INTER);
-: 1123:
-: 1124: MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_BCAST_INTER);
-: 1125:
7182: 1126: comm = comm_ptr->handle;
-: 1127:
7182: 1128: if (root == MPI_PROC_NULL)
-: 1129: {
-: 1130: /* local processes other than root do nothing */
2343: 1131: mpi_errno = MPI_SUCCESS;
-: 1132: }
4839: 1133: else if (root == MPI_ROOT)
-: 1134: {
-: 1135: /* root sends to rank 0 on remote group and returns */
-: 1136: MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
1196: 1137: mpi_errno = MPIC_Send(buffer, count, datatype, 0,
-: 1138: MPIR_BCAST_TAG, comm);
|
1196: 1139: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1140: MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
-: 1141: }
-: 1142: else
-: 1143: {
-: 1144: /* remote group. rank 0 on remote group receives from root */
-: 1145:
|
3643: 1146: rank = comm_ptr->rank;
-: 1147:
3643: 1148: if (rank == 0)
-: 1149: {
1196: 1150: mpi_errno = MPIC_Recv(buffer, count, datatype, root,
-: 1151: MPIR_BCAST_TAG, comm, &status);
|
1196: 1152: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1153: }
-: 1154:
-: 1155: /* Get the local intracommunicator */
|
3643: 1156: if (!comm_ptr->local_comm)
97: 1157: MPIR_Setup_intercomm_localcomm( comm_ptr );
-: 1158:
3643: 1159: newcomm_ptr = comm_ptr->local_comm;
-: 1160:
-: 1161: /* now do the usual broadcast on this intracommunicator
-: 1162: with rank 0 as root. */
3643: 1163: mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, newcomm_ptr);
|
3643: 1164: if (mpi_errno) MPIU_ERR_POP(mpi_errno);
-: 1165: }
-: 1166:
7182: 1167:fn_fail:
-: 1168: MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST_INTER);
|
7182: 1169: return mpi_errno;
-: 1170:}
-: 1171:/* end:nested */
-: 1172:#endif /* MPICH_MPI_FROM_PMPI */
-: 1173:
-: 1174:#undef FUNCNAME
-: 1175:#define FUNCNAME MPI_Bcast
-: 1176:#undef FCNAME
-: 1177:#define FCNAME MPIU_QUOTE(FUNCNAME)
-: 1178:
-: 1179:/*@
-: 1180:MPI_Bcast - Broadcasts a message from the process with rank "root" to
-: 1181: all other processes of the communicator
-: 1182:
-: 1183:Input/Output Parameter:
-: 1184:. buffer - starting address of buffer (choice)
-: 1185:
-: 1186:Input Parameters:
-: 1187:+ count - number of entries in buffer (integer)
-: 1188:. datatype - data type of buffer (handle)
-: 1189:. root - rank of broadcast root (integer)
-: 1190:- comm - communicator (handle)
-: 1191:
-: 1192:.N ThreadSafe
-: 1193:
-: 1194:.N Fortran
-: 1195:
-: 1196:.N Errors
-: 1197:.N MPI_SUCCESS
-: 1198:.N MPI_ERR_COMM
-: 1199:.N MPI_ERR_COUNT
-: 1200:.N MPI_ERR_TYPE
-: 1201:.N MPI_ERR_BUFFER
-: 1202:.N MPI_ERR_ROOT
-: 1203:@*/
-: 1204:int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root,
-: 1205: MPI_Comm comm )
123941: 1206:{
123941: 1207: int mpi_errno = MPI_SUCCESS;
123941: 1208: MPID_Comm *comm_ptr = NULL;
123941: 1209: MPIU_THREADPRIV_DECL;
-: 1210: MPID_MPI_STATE_DECL(MPID_STATE_MPI_BCAST);
-: 1211:
123941: 1212: MPIR_ERRTEST_INITIALIZED_ORDIE();
-: 1213:
123941: 1214: MPIU_THREAD_CS_ENTER(ALLFUNC,);
-: 1215: MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_BCAST);
-: 1216:
-: 1217: /* Validate parameters, especially handles needing to be converted */
|
-: 1218:# ifdef HAVE_ERROR_CHECKING
-: 1219: {
-: 1220: MPID_BEGIN_ERROR_CHECKS;
-: 1221: {
123941: 1222: MPIR_ERRTEST_COMM(comm, mpi_errno);
123941: 1223: if (mpi_errno != MPI_SUCCESS) goto fn_fail;
-: 1224: }
-: 1225: MPID_END_ERROR_CHECKS;
-: 1226: }
-: 1227:# endif /* HAVE_ERROR_CHECKING */
-: 1228:
-: 1229: /* Convert MPI object handles to object pointers */
|
123939: 1230: MPID_Comm_get_ptr( comm, comm_ptr );
-: 1231:
-: 1232: /* Validate parameters and objects (post conversion) */
|
-: 1233:# ifdef HAVE_ERROR_CHECKING
-: 1234: {
-: 1235: MPID_BEGIN_ERROR_CHECKS;
-: 1236: {
123939: 1237: MPID_Datatype *datatype_ptr = NULL;
-: 1238:
123939: 1239: MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
123939: 1240: if (mpi_errno != MPI_SUCCESS) goto fn_fail;
123937: 1241: MPIR_ERRTEST_COUNT(count, mpi_errno);
123937: 1242: MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
123937: 1243: if (comm_ptr->comm_kind == MPID_INTRACOMM) {
121833: 1244: MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno);
-: 1245: }
-: 1246: else {
2104: 1247: MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno);
-: 1248: }
-: 1249:
123937: 1250: if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
33240: 1251: MPID_Datatype_get_ptr(datatype, datatype_ptr);
33240: 1252: MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
33240: 1253: MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
-: 1254: }
-: 1255:
123937: 1256: MPIR_ERRTEST_BUF_INPLACE(buffer, count, mpi_errno);
123937: 1257: MPIR_ERRTEST_USERBUFFER(buffer,count,datatype,mpi_errno);
-: 1258:
123937: 1259: if (mpi_errno != MPI_SUCCESS) goto fn_fail;
-: 1260: }
-: 1261: MPID_END_ERROR_CHECKS;
-: 1262: }
-: 1263:# endif /* HAVE_ERROR_CHECKING */
-: 1264:
-: 1265: /* ... body of routine ... */
-: 1266:
|
123929: 1267: if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL)
-: 1268: {
|
-: 1269: /* --BEGIN USEREXTENSION-- */
#####: 1270: mpi_errno = comm_ptr->coll_fns->Bcast(buffer, count,
-: 1271: datatype, root, comm_ptr);
-: 1272: /* --END USEREXTENSION-- */
-: 1273: }
-: 1274: else
-: 1275: {
|
123929: 1276: if (comm_ptr->comm_kind == MPID_INTRACOMM)
-: 1277: {
-: 1278: /* intracommunicator */
121825: 1279: mpi_errno = MPIR_Bcast( buffer, count, datatype, root, comm_ptr );
-: 1280: }
-: 1281: else
-: 1282: {
-: 1283: /* intercommunicator */
2104: 1284: mpi_errno = MPIR_Bcast_inter( buffer, count, datatype, root, comm_ptr );
-: 1285: }
-: 1286: }
-: 1287:
|
123929: 1288: if (mpi_errno != MPI_SUCCESS) goto fn_fail;
-: 1289:
-: 1290: /* ... end of body of routine ... */
-: 1291:
|
123941: 1292: fn_exit:
|
-: 1293: MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_BCAST);
|
123941: 1294: MPIU_THREAD_CS_EXIT(ALLFUNC,);
123941: 1295: return mpi_errno;
-: 1296:
|
12: 1297: fn_fail:
-: 1298: /* --BEGIN ERROR HANDLING-- */
-: 1299:# ifdef HAVE_ERROR_CHECKING
-: 1300: {
12: 1301: mpi_errno = MPIR_Err_create_code(
-: 1302: mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
-: 1303: "**mpi_bcast",
-: 1304: "**mpi_bcast %p %d %D %d %C", buffer, count, datatype, root, comm);
-: 1305: }
-: 1306:# endif
12: 1307: mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
12: 1308: goto fn_exit;
-: 1309: /* --END ERROR HANDLING-- */
-: 1310:}
|