-:    0:Source:/home/MPI/testing/mpich2/mpich2/src/mpi/coll/bcast.c
        -:    0:Graph:bcast.gcno
        -:    0:Data:bcast.gcda
        -:    0:Runs:4383
        -:    0:Programs:1376
        -:    1:/* -*- Mode: C; c-basic-offset:4 ; -*- */
        -:    2:/*
        -:    3: *
        -:    4: *  (C) 2001 by Argonne National Laboratory.
        -:    5: *      See COPYRIGHT in top-level directory.
        -:    6: */
        -:    7:
        -:    8:#include "mpiimpl.h"
        -:    9:
        -:   10:/* -- Begin Profiling Symbol Block for routine MPI_Bcast */
        -:   11:#if defined(HAVE_PRAGMA_WEAK)
        -:   12:#pragma weak MPI_Bcast = PMPI_Bcast
        -:   13:#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
        -:   14:#pragma _HP_SECONDARY_DEF PMPI_Bcast  MPI_Bcast
        -:   15:#elif defined(HAVE_PRAGMA_CRI_DUP)
        -:   16:#pragma _CRI duplicate MPI_Bcast as PMPI_Bcast
        -:   17:#endif
        -:   18:/* -- End Profiling Symbol Block */
        -:   19:
        -:   20:/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
        -:   21:   the MPI routines */
        -:   22:#ifndef MPICH_MPI_FROM_PMPI
        -:   23:#undef MPI_Bcast
        -:   24:#define MPI_Bcast PMPI_Bcast
        -:   25:
        -:   26:/* FIXME move to somewhere else */
        -:   27:/* Returns non-zero if val is a power of two.  If ceil_pof2 is non-NULL, it sets
        -:   28:   *ceil_pof2 to the power of two that is just larger than or equal to val.
        -:   29:   That is, it rounds up to the nearest power of two. */
        -:   30:static inline int MPIU_is_pof2(int val, int *ceil_pof2)
        -:   31:{
    23750:   32:    int pof2 = 1;
        -:   33:
   113662:   34:    while (pof2 < val)
    89912:   35:        pof2 *= 2;
    23750:   36:    if (ceil_pof2)
    #####:   37:        *ceil_pof2 = pof2;
        -:   38:
    23750:   39:    if (pof2 == val)
     3816:   40:        return 1;
        -:   41:    else
    19934:   42:        return 0;
        -:   43:}
        -:   44:
        -:   45:
        -:   46:/* A binomial tree broadcast algorithm.  Good for short messages, 
        -:   47:   Cost = lgp.alpha + n.lgp.beta */
        -:   48:#undef FUNCNAME
        -:   49:#define FUNCNAME MPIR_Bcast_binomial
        -:   50:#undef FCNAME
        -:   51:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:   52:static int MPIR_Bcast_binomial(
        -:   53:    void *buffer, 
        -:   54:    int count, 
        -:   55:    MPI_Datatype datatype, 
        -:   56:    int root, 
        -:   57:    MPID_Comm *comm_ptr)
  2052386:   58:{
        -:   59:    int        rank, comm_size, src, dst;
        -:   60:    int        relative_rank, mask;
  2052386:   61:    int        mpi_errno = MPI_SUCCESS;
  2052386:   62:    int nbytes=0;
        -:   63:    int type_size, is_contig, is_homogeneous;
        -:   64:    int position;
  2052386:   65:    void *tmp_buf=NULL;
        -:   66:    MPI_Comm comm;
        -:   67:    MPID_Datatype *dtp;
  2052386:   68:    MPIU_CHKLMEM_DECL(1);
        -:   69:
  2052386:   70:    comm = comm_ptr->handle;
  2052386:   71:    comm_size = comm_ptr->local_size;
  2052386:   72:    rank = comm_ptr->rank;
        -:   73:
        -:   74:    /* If there is only one process, return */
  2052386:   75:    if (comm_size == 1) goto fn_exit;
        -:   76:
  2022389:   77:    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
  1993755:   78:        is_contig = 1;
        -:   79:    else {
    28634:   80:        MPID_Datatype_get_ptr(datatype, dtp); 
    28634:   81:        is_contig = dtp->is_contig;
        -:   82:    }
        -:   83:
  2022389:   84:    is_homogeneous = 1;
        -:   85:#ifdef MPID_HAS_HETERO
        -:   86:    if (comm_ptr->is_hetero)
        -:   87:        is_homogeneous = 0;
        -:   88:#endif
        -:   89:
        -:   90:    /* MPI_Type_size() might not give the accurate size of the packed
        -:   91:     * datatype for heterogeneous systems (because of padding, encoding,
        -:   92:     * etc). On the other hand, MPI_Pack_size() can become very
        -:   93:     * expensive, depending on the implementation, especially for
        -:   94:     * heterogeneous systems. We want to use MPI_Type_size() wherever
        -:   95:     * possible, and MPI_Pack_size() in other places.
        -:   96:     */
  2022389:   97:    if (is_homogeneous) {
  2022389:   98:        MPID_Datatype_get_size_macro(datatype, type_size);
        -:   99:    }
        -:  100:    else {
    #####:  101:        mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
    #####:  102:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  103:    }
  2022389:  104:    nbytes = type_size * count;
        -:  105:
  2022389:  106:    if (!is_contig || !is_homogeneous)
        -:  107:    {
     8826:  108:        MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
        -:  109:
        -:  110:        /* TODO: Pipeline the packing and communication */
     8826:  111:        position = 0;
     8826:  112:        if (rank == root)
     1553:  113:            NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
        -:  114:                      &position, comm);
        -:  115:    }
        -:  116:
  2022389:  117:    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
        -:  118:
        -:  119:    /* Use short message algorithm, namely, binomial tree */
        -:  120:
        -:  121:    /* Algorithm:
        -:  122:       This uses a fairly basic recursive subdivision algorithm.
        -:  123:       The root sends to the process comm_size/2 away; the receiver becomes
        -:  124:       a root for a subtree and applies the same process. 
        -:  125:
        -:  126:       So that the new root can easily identify the size of its
        -:  127:       subtree, the (subtree) roots are all powers of two (relative
        -:  128:       to the root) If m = the first power of 2 such that 2^m >= the
        -:  129:       size of the communicator, then the subtree at root at 2^(m-k)
        -:  130:       has size 2^k (with special handling for subtrees that aren't
        -:  131:       a power of two in size).
        -:  132:
        -:  133:       Do subdivision.  There are two phases:
        -:  134:       1. Wait for arrival of data.  Because of the power of two nature
        -:  135:       of the subtree roots, the source of this message is alwyas the
        -:  136:       process whose relative rank has the least significant 1 bit CLEARED.
        -:  137:       That is, process 4 (100) receives from process 0, process 7 (111) 
        -:  138:       from process 6 (110), etc.   
        -:  139:       2. Forward to my subtree
        -:  140:
        -:  141:       Note that the process that is the tree root is handled automatically
        -:  142:       by this code, since it has no bits set.  */
        -:  143:
  2022389:  144:    mask = 0x1;
  5547079:  145:    while (mask < comm_size)
        -:  146:    {
  2836102:  147:        if (relative_rank & mask)
        -:  148:        {
  1333801:  149:            src = rank - mask; 
  1333801:  150:            if (src < 0) src += comm_size;
  1333801:  151:            if (!is_contig || !is_homogeneous)
     7273:  152:                mpi_errno = MPIC_Recv(tmp_buf,nbytes,MPI_BYTE,src,
        -:  153:                                      MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
        -:  154:            else
  1326528:  155:                mpi_errno = MPIC_Recv(buffer,count,datatype,src,
        -:  156:                                      MPIR_BCAST_TAG,comm,MPI_STATUS_IGNORE);
  1333801:  157:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  158:            break;
        -:  159:        }
  1502301:  160:        mask <<= 1;
        -:  161:    }
        -:  162:
        -:  163:    /* This process is responsible for all processes that have bits
        -:  164:       set from the LSB upto (but not including) mask.  Because of
        -:  165:       the "not including", we start by shifting mask back down one.
        -:  166:
        -:  167:       We can easily change to a different algorithm at any power of two
        -:  168:       by changing the test (mask > 1) to (mask > block_size) 
        -:  169:
        -:  170:       One such version would use non-blocking operations for the last 2-4
        -:  171:       steps (this also bounds the number of MPI_Requests that would
        -:  172:       be needed).  */
        -:  173:
  2022388:  174:    mask >>= 1;
  5547077:  175:    while (mask > 0)
        -:  176:    {
  1502301:  177:        if (relative_rank + mask < comm_size)
        -:  178:        {
  1333800:  179:            dst = rank + mask;
  1333800:  180:            if (dst >= comm_size) dst -= comm_size;
  1333800:  181:            if (!is_contig || !is_homogeneous)
     7553:  182:                mpi_errno = MPIC_Send(tmp_buf,nbytes,MPI_BYTE,dst,
        -:  183:                                      MPIR_BCAST_TAG,comm);
        -:  184:            else
  1326247:  185:                mpi_errno = MPIC_Send(buffer,count,datatype,dst,
        -:  186:                                      MPIR_BCAST_TAG,comm); 
  1333800:  187:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  188:        }
  1502301:  189:        mask >>= 1;
        -:  190:    }
        -:  191:
  2022388:  192:    if (!is_contig || !is_homogeneous)
        -:  193:    {
     8826:  194:        if (rank != root)
        -:  195:        {
     7273:  196:            position = 0;
     7273:  197:            NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
        -:  198:                        datatype, comm);
        -:  199:        }
        -:  200:    }
        -:  201:
        -:  202:fn_exit:
     8826:  203:    MPIU_CHKLMEM_FREEALL();
  2052386:  204:    return mpi_errno;
        -:  205:fn_fail:
        -:  206:    goto fn_exit;
        -:  207:}
        -:  208:
        -:  209:/* FIXME it would be nice if we could refactor things to minimize
        -:  210:   duplication between this and MPIR_Scatter and friends.  We can't use
        -:  211:   MPIR_Scatter as is without inducing an extra copy in the noncontig case. */
        -:  212:/* There are additional arguments included here that are unused because we
        -:  213:   always assume that the noncontig case has been packed into a contig case by
        -:  214:   the caller for now.  Once we start handling noncontig data at the upper level
        -:  215:   we can start handling it here.
        -:  216:   
        -:  217:   At the moment this function always scatters a buffer of nbytes starting at
        -:  218:   tmp_buf address. */
        -:  219:#undef FUNCNAME
        -:  220:#define FUNCNAME scatter_for_bcast
        -:  221:#undef FCNAME
        -:  222:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  223:static int scatter_for_bcast(
        -:  224:    void *buffer ATTRIBUTE((unused)),
        -:  225:    int count ATTRIBUTE((unused)), 
        -:  226:    MPI_Datatype datatype ATTRIBUTE((unused)),
        -:  227:    int root,
        -:  228:    MPID_Comm *comm_ptr,
        -:  229:    int nbytes,
        -:  230:    void *tmp_buf,
        -:  231:    int is_contig,
        -:  232:    int is_homogeneous)
    22654:  233:{
        -:  234:    MPI_Status status;
        -:  235:    int        rank, comm_size, src, dst;
        -:  236:    int        relative_rank, mask;
    22654:  237:    int        mpi_errno = MPI_SUCCESS;
    22654:  238:    int scatter_size, curr_size, recv_size = 0, send_size;
        -:  239:    MPI_Comm comm;
        -:  240:
    22654:  241:    comm = comm_ptr->handle;
    22654:  242:    comm_size = comm_ptr->local_size;
    22654:  243:    rank = comm_ptr->rank;
    22654:  244:    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
        -:  245:
        -:  246:    /* use long message algorithm: binomial tree scatter followed by an allgather */
        -:  247:
        -:  248:    /* The scatter algorithm divides the buffer into nprocs pieces and
        -:  249:       scatters them among the processes. Root gets the first piece,
        -:  250:       root+1 gets the second piece, and so forth. Uses the same binomial
        -:  251:       tree algorithm as above. Ceiling division
        -:  252:       is used to compute the size of each piece. This means some
        -:  253:       processes may not get any data. For example if bufsize = 97 and
        -:  254:       nprocs = 16, ranks 15 and 16 will get 0 data. On each process, the
        -:  255:       scattered data is stored at the same offset in the buffer as it is
        -:  256:       on the root process. */ 
        -:  257:
    22654:  258:    scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
    22654:  259:    curr_size = (rank == root) ? nbytes : 0; /* root starts with all the
        -:  260:                                                data */
        -:  261:
    22654:  262:    mask = 0x1;
    70724:  263:    while (mask < comm_size)
        -:  264:    {
    45742:  265:        if (relative_rank & mask)
        -:  266:        {
    20326:  267:            src = rank - mask; 
    20326:  268:            if (src < 0) src += comm_size;
    20326:  269:            recv_size = nbytes - relative_rank*scatter_size;
        -:  270:            /* recv_size is larger than what might actually be sent by the
        -:  271:               sender. We don't need compute the exact value because MPI
        -:  272:               allows you to post a larger recv.*/ 
    20326:  273:            if (recv_size <= 0)
        -:  274:            {
    #####:  275:                curr_size = 0; /* this process doesn't receive any data
        -:  276:                                  because of uneven division */
        -:  277:            }
        -:  278:            else
        -:  279:            {
    20326:  280:                mpi_errno = MPIC_Recv(((char *)tmp_buf +
        -:  281:                                       relative_rank*scatter_size),
        -:  282:                                      recv_size, MPI_BYTE, src,
        -:  283:                                      MPIR_BCAST_TAG, comm, &status);
    20326:  284:                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  285:
        -:  286:                /* query actual size of data received */
    20326:  287:                NMPI_Get_count(&status, MPI_BYTE, &curr_size);
        -:  288:            }
        -:  289:            break;
        -:  290:        }
    25416:  291:        mask <<= 1;
        -:  292:    }
        -:  293:
        -:  294:    /* This process is responsible for all processes that have bits
        -:  295:       set from the LSB upto (but not including) mask.  Because of
        -:  296:       the "not including", we start by shifting mask back down
        -:  297:       one. */
        -:  298:
    22654:  299:    mask >>= 1;
    70724:  300:    while (mask > 0)
        -:  301:    {
    25416:  302:        if (relative_rank + mask < comm_size)
        -:  303:        {
    20326:  304:            send_size = curr_size - scatter_size * mask; 
        -:  305:            /* mask is also the size of this process's subtree */
        -:  306:
    20326:  307:            if (send_size > 0)
        -:  308:            {
    20326:  309:                dst = rank + mask;
    20326:  310:                if (dst >= comm_size) dst -= comm_size;
    20326:  311:                mpi_errno = MPIC_Send (((char *)tmp_buf +
        -:  312:                                        scatter_size*(relative_rank+mask)),
        -:  313:                                       send_size, MPI_BYTE, dst,
        -:  314:                                       MPIR_BCAST_TAG, comm);
    20326:  315:                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  316:
    20326:  317:                curr_size -= send_size;
        -:  318:            }
        -:  319:        }
    25416:  320:        mask >>= 1;
        -:  321:    }
        -:  322:
    22654:  323:fn_exit:
    22654:  324:    return mpi_errno;
        -:  325:fn_fail:
        -:  326:    goto fn_exit;
        -:  327:}
        -:  328:
        -:  329:/*
        -:  330:   Broadcast based on a scatter followed by an allgather.
        -:  331:
        -:  332:   We first scatter the buffer using a binomial tree algorithm. This costs
        -:  333:   lgp.alpha + n.((p-1)/p).beta
        -:  334:   If the datatype is contiguous and the communicator is homogeneous,
        -:  335:   we treat the data as bytes and divide (scatter) it among processes
        -:  336:   by using ceiling division. For the noncontiguous or heterogeneous
        -:  337:   cases, we first pack the data into a temporary buffer by using
        -:  338:   MPI_Pack, scatter it as bytes, and unpack it after the allgather.
        -:  339:
        -:  340:   For the allgather, we use a recursive doubling algorithm for 
        -:  341:   medium-size messages and power-of-two number of processes. This
        -:  342:   takes lgp steps. In each step pairs of processes exchange all the
        -:  343:   data they have (we take care of non-power-of-two situations). This
        -:  344:   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
        -:  345:   because it may be slightly more in the non-power-of-two case, but
        -:  346:   it's still a logarithmic algorithm.) Therefore, for long messages
        -:  347:   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
        -:  348:*/
        -:  349:#undef FUNCNAME
        -:  350:#define FUNCNAME MPIR_Bcast_scatter_doubling_allgather
        -:  351:#undef FCNAME
        -:  352:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  353:static int MPIR_Bcast_scatter_doubling_allgather(
        -:  354:    void *buffer, 
        -:  355:    int count, 
        -:  356:    MPI_Datatype datatype, 
        -:  357:    int root, 
        -:  358:    MPID_Comm *comm_ptr)
      424:  359:{
        -:  360:    MPI_Status status;
        -:  361:    int rank, comm_size, dst;
        -:  362:    int relative_rank, mask;
      424:  363:    int mpi_errno = MPI_SUCCESS;
      424:  364:    int scatter_size, nbytes=0, curr_size, recv_size = 0;
        -:  365:    int type_size, j, k, i, tmp_mask, is_contig, is_homogeneous;
        -:  366:    int relative_dst, dst_tree_root, my_tree_root, send_offset;
        -:  367:    int recv_offset, tree_root, nprocs_completed, offset, position;
      424:  368:    MPIU_CHKLMEM_DECL(1);
        -:  369:    MPI_Comm comm;
        -:  370:    MPID_Datatype *dtp;
        -:  371:    MPI_Aint true_extent, true_lb;
        -:  372:    void *tmp_buf;
        -:  373:
      424:  374:    comm = comm_ptr->handle;
      424:  375:    comm_size = comm_ptr->local_size;
      424:  376:    rank = comm_ptr->rank;
      424:  377:    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
        -:  378:
        -:  379:    /* If there is only one process, return */
      424:  380:    if (comm_size == 1) goto fn_exit;
        -:  381:
    #####:  382:    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
    #####:  383:        is_contig = 1;
        -:  384:    else {
    #####:  385:        MPID_Datatype_get_ptr(datatype, dtp); 
    #####:  386:        is_contig = dtp->is_contig;
        -:  387:    }
        -:  388:
    #####:  389:    is_homogeneous = 1;
        -:  390:#ifdef MPID_HAS_HETERO
        -:  391:    if (comm_ptr->is_hetero)
        -:  392:        is_homogeneous = 0;
        -:  393:#endif
        -:  394:
        -:  395:    /* MPI_Type_size() might not give the accurate size of the packed
        -:  396:     * datatype for heterogeneous systems (because of padding, encoding,
        -:  397:     * etc). On the other hand, MPI_Pack_size() can become very
        -:  398:     * expensive, depending on the implementation, especially for
        -:  399:     * heterogeneous systems. We want to use MPI_Type_size() wherever
        -:  400:     * possible, and MPI_Pack_size() in other places.
        -:  401:     */
    #####:  402:    if (is_homogeneous) {
    #####:  403:        MPID_Datatype_get_size_macro(datatype, type_size);
        -:  404:    }
        -:  405:    else {
    #####:  406:        mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
    #####:  407:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  408:    }
    #####:  409:    nbytes = type_size * count;
        -:  410:
    #####:  411:    if (is_contig && is_homogeneous)
        -:  412:    {
        -:  413:        /* contiguous and homogeneous. no need to pack. */
    #####:  414:        mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb,
        -:  415:                                              &true_extent);
    #####:  416:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  417:
    #####:  418:        tmp_buf = (char *) buffer + true_lb;
        -:  419:    }
        -:  420:    else
        -:  421:    {
    #####:  422:        MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
        -:  423:
        -:  424:        /* TODO: Pipeline the packing and communication */
    #####:  425:        position = 0;
    #####:  426:        if (rank == root)
    #####:  427:            NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
        -:  428:                      &position, comm);
        -:  429:    }
        -:  430:
        -:  431:
    #####:  432:    scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
    #####:  433:    curr_size = (rank == root) ? nbytes : 0; /* root starts with all the
        -:  434:                                                data */
        -:  435:
        -:  436:
    #####:  437:    mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
        -:  438:                                  nbytes, tmp_buf, is_contig, is_homogeneous);
    #####:  439:    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  440:
        -:  441:    /* medium size allgather and pof2 comm_size. use recurive doubling. */
        -:  442:
    #####:  443:    mask = 0x1;
    #####:  444:    i = 0;
    #####:  445:    while (mask < comm_size)
        -:  446:    {
    #####:  447:        relative_dst = relative_rank ^ mask;
        -:  448:
    #####:  449:        dst = (relative_dst + root) % comm_size; 
        -:  450:
        -:  451:        /* find offset into send and recv buffers.
        -:  452:           zero out the least significant "i" bits of relative_rank and
        -:  453:           relative_dst to find root of src and dst
        -:  454:           subtrees. Use ranks of roots as index to send from
        -:  455:           and recv into  buffer */ 
        -:  456:
    #####:  457:        dst_tree_root = relative_dst >> i;
    #####:  458:        dst_tree_root <<= i;
        -:  459:
    #####:  460:        my_tree_root = relative_rank >> i;
    #####:  461:        my_tree_root <<= i;
        -:  462:
    #####:  463:        send_offset = my_tree_root * scatter_size;
    #####:  464:        recv_offset = dst_tree_root * scatter_size;
        -:  465:
    #####:  466:        if (relative_dst < comm_size)
        -:  467:        {
    #####:  468:            mpi_errno = MPIC_Sendrecv(((char *)tmp_buf + send_offset),
        -:  469:                                      curr_size, MPI_BYTE, dst, MPIR_BCAST_TAG, 
        -:  470:                                      ((char *)tmp_buf + recv_offset),
        -:  471:                                      (nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset), 
        -:  472:                                      MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status);
    #####:  473:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  474:
    #####:  475:            NMPI_Get_count(&status, MPI_BYTE, &recv_size);
    #####:  476:            curr_size += recv_size;
        -:  477:        }
        -:  478:
        -:  479:        /* if some processes in this process's subtree in this step
        -:  480:           did not have any destination process to communicate with
        -:  481:           because of non-power-of-two, we need to send them the
        -:  482:           data that they would normally have received from those
        -:  483:           processes. That is, the haves in this subtree must send to
        -:  484:           the havenots. We use a logarithmic recursive-halfing algorithm
        -:  485:           for this. */
        -:  486:
        -:  487:        /* This part of the code will not currently be
        -:  488:           executed because we are not using recursive
        -:  489:           doubling for non power of two. Mark it as experimental
        -:  490:           so that it doesn't show up as red in the coverage tests. */  
        -:  491:
        -:  492:        /* --BEGIN EXPERIMENTAL-- */
    #####:  493:        if (dst_tree_root + mask > comm_size)
        -:  494:        {
    #####:  495:            nprocs_completed = comm_size - my_tree_root - mask;
        -:  496:            /* nprocs_completed is the number of processes in this
        -:  497:               subtree that have all the data. Send data to others
        -:  498:               in a tree fashion. First find root of current tree
        -:  499:               that is being divided into two. k is the number of
        -:  500:               least-significant bits in this process's rank that
        -:  501:               must be zeroed out to find the rank of the root */ 
    #####:  502:            j = mask;
    #####:  503:            k = 0;
    #####:  504:            while (j)
        -:  505:            {
    #####:  506:                j >>= 1;
    #####:  507:                k++;
        -:  508:            }
    #####:  509:            k--;
        -:  510:
    #####:  511:            offset = scatter_size * (my_tree_root + mask);
    #####:  512:            tmp_mask = mask >> 1;
        -:  513:
    #####:  514:            while (tmp_mask)
        -:  515:            {
    #####:  516:                relative_dst = relative_rank ^ tmp_mask;
    #####:  517:                dst = (relative_dst + root) % comm_size; 
        -:  518:
    #####:  519:                tree_root = relative_rank >> k;
    #####:  520:                tree_root <<= k;
        -:  521:
        -:  522:                /* send only if this proc has data and destination
        -:  523:                   doesn't have data. */
        -:  524:
        -:  525:                /* if (rank == 3) { 
        -:  526:                   printf("rank %d, dst %d, root %d, nprocs_completed %d\n", relative_rank, relative_dst, tree_root, nprocs_completed);
        -:  527:                   fflush(stdout);
        -:  528:                   }*/
        -:  529:
    #####:  530:                if ((relative_dst > relative_rank) && 
        -:  531:                    (relative_rank < tree_root + nprocs_completed)
        -:  532:                    && (relative_dst >= tree_root + nprocs_completed))
        -:  533:                {
        -:  534:
        -:  535:                    /* printf("Rank %d, send to %d, offset %d, size %d\n", rank, dst, offset, recv_size);
        -:  536:                       fflush(stdout); */
    #####:  537:                    mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
        -:  538:                                          recv_size, MPI_BYTE, dst,
        -:  539:                                          MPIR_BCAST_TAG, comm); 
        -:  540:                    /* recv_size was set in the previous
        -:  541:                       receive. that's the amount of data to be
        -:  542:                       sent now. */
    #####:  543:                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  544:                }
        -:  545:                /* recv only if this proc. doesn't have data and sender
        -:  546:                   has data */
    #####:  547:                else if ((relative_dst < relative_rank) && 
        -:  548:                         (relative_dst < tree_root + nprocs_completed) &&
        -:  549:                         (relative_rank >= tree_root + nprocs_completed))
        -:  550:                {
        -:  551:                    /* printf("Rank %d waiting to recv from rank %d\n",
        -:  552:                       relative_rank, dst); */
    #####:  553:                    mpi_errno = MPIC_Recv(((char *)tmp_buf + offset),
        -:  554:                                          nbytes - offset, 
        -:  555:                                          MPI_BYTE, dst, MPIR_BCAST_TAG,
        -:  556:                                          comm, &status); 
        -:  557:                    /* nprocs_completed is also equal to the no. of processes
        -:  558:                       whose data we don't have */
    #####:  559:                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  560:
    #####:  561:                    NMPI_Get_count(&status, MPI_BYTE, &recv_size);
    #####:  562:                    curr_size += recv_size;
        -:  563:                    /* printf("Rank %d, recv from %d, offset %d, size %d\n", rank, dst, offset, recv_size);
        -:  564:                       fflush(stdout);*/
        -:  565:                }
    #####:  566:                tmp_mask >>= 1;
    #####:  567:                k--;
        -:  568:            }
        -:  569:        }
        -:  570:        /* --END EXPERIMENTAL-- */
        -:  571:
    #####:  572:        mask <<= 1;
    #####:  573:        i++;
        -:  574:    }
        -:  575:
    #####:  576:    if (!is_contig || !is_homogeneous)
        -:  577:    {
    #####:  578:        if (rank != root)
        -:  579:        {
    #####:  580:            position = 0;
    #####:  581:            NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
        -:  582:                        datatype, comm);
        -:  583:        }
        -:  584:    }
        -:  585:
        -:  586:fn_exit:
    #####:  587:    MPIU_CHKLMEM_FREEALL();
      424:  588:    return mpi_errno;
        -:  589:fn_fail:
        -:  590:    goto fn_exit;
        -:  591:}
        -:  592:
        -:  593:/*
        -:  594:   Broadcast based on a scatter followed by an allgather.
        -:  595:
        -:  596:   We first scatter the buffer using a binomial tree algorithm. This costs
        -:  597:   lgp.alpha + n.((p-1)/p).beta
        -:  598:   If the datatype is contiguous and the communicator is homogeneous,
        -:  599:   we treat the data as bytes and divide (scatter) it among processes
        -:  600:   by using ceiling division. For the noncontiguous or heterogeneous
        -:  601:   cases, we first pack the data into a temporary buffer by using
        -:  602:   MPI_Pack, scatter it as bytes, and unpack it after the allgather.
        -:  603:
        -:  604:   We use a ring algorithm for the allgather, which takes p-1 steps.
        -:  605:   This may perform better than recursive doubling for long messages and
        -:  606:   medium-sized non-power-of-two messages.
        -:  607:   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
        -:  608:*/
        -:  609:#undef FUNCNAME
        -:  610:#define FUNCNAME MPIR_Bcast_scatter_ring_allgather
        -:  611:#undef FCNAME
        -:  612:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  613:static int MPIR_Bcast_scatter_ring_allgather(
        -:  614:    void *buffer, 
        -:  615:    int count, 
        -:  616:    MPI_Datatype datatype, 
        -:  617:    int root, 
        -:  618:    MPID_Comm *comm_ptr)
    22654:  619:{
        -:  620:    int rank, comm_size;
        -:  621:    int relative_rank;
    22654:  622:    int mpi_errno = MPI_SUCCESS;
        -:  623:    int scatter_size, nbytes;
        -:  624:    int type_size, j, i, is_contig, is_homogeneous;
        -:  625:    int position;
        -:  626:    int *recvcnts, *displs, left, right, jnext;
        -:  627:    void *tmp_buf;
        -:  628:    MPI_Comm comm;
        -:  629:    MPID_Datatype *dtp;
        -:  630:    MPI_Aint true_extent, true_lb;
    22654:  631:    MPIU_CHKLMEM_DECL(3);
        -:  632:
    22654:  633:    comm = comm_ptr->handle;
    22654:  634:    comm_size = comm_ptr->local_size;
    22654:  635:    rank = comm_ptr->rank;
    22654:  636:    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
        -:  637:
        -:  638:    /* If there is only one process, return */
    22654:  639:    if (comm_size == 1) goto fn_exit;
        -:  640:
    22654:  641:    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)
    14374:  642:        is_contig = 1;
        -:  643:    else {
     8280:  644:        MPID_Datatype_get_ptr(datatype, dtp); 
     8280:  645:        is_contig = dtp->is_contig;
        -:  646:    }
        -:  647:
    22654:  648:    is_homogeneous = 1;
        -:  649:#ifdef MPID_HAS_HETERO
        -:  650:    if (comm_ptr->is_hetero)
        -:  651:        is_homogeneous = 0;
        -:  652:#endif
        -:  653:
        -:  654:    /* MPI_Type_size() might not give the accurate size of the packed
        -:  655:     * datatype for heterogeneous systems (because of padding, encoding,
        -:  656:     * etc). On the other hand, MPI_Pack_size() can become very
        -:  657:     * expensive, depending on the implementation, especially for
        -:  658:     * heterogeneous systems. We want to use MPI_Type_size() wherever
        -:  659:     * possible, and MPI_Pack_size() in other places.
        -:  660:     */
    22654:  661:    if (is_homogeneous) {
    22654:  662:        MPID_Datatype_get_size_macro(datatype, type_size);
        -:  663:    }
        -:  664:    else {
    #####:  665:        mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
    #####:  666:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  667:    }
    22654:  668:    nbytes = type_size * count;
        -:  669:
    22654:  670:    if (is_contig && is_homogeneous)
        -:  671:    {
        -:  672:        /* contiguous and homogeneous. no need to pack. */
    19010:  673:        mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb,
        -:  674:                                              &true_extent);
    19010:  675:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  676:
    19010:  677:        tmp_buf = (char *) buffer + true_lb;
        -:  678:    }
        -:  679:    else
        -:  680:    {
     3644:  681:        MPIU_CHKLMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf");
        -:  682:
        -:  683:        /* TODO: Pipeline the packing and communication */
     3644:  684:        position = 0;
     3644:  685:        if (rank == root)
      796:  686:            NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes,
        -:  687:                      &position, comm);
        -:  688:    }
        -:  689:
    22654:  690:    scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */
        -:  691:
    22654:  692:    mpi_errno = scatter_for_bcast(buffer, count, datatype, root, comm_ptr,
        -:  693:                                  nbytes, tmp_buf, is_contig, is_homogeneous);
    22654:  694:    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  695:
        -:  696:    /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */ 
        -:  697:
    22654:  698:    MPIU_CHKLMEM_MALLOC(recvcnts, int *, comm_size*sizeof(int), mpi_errno, "recvcnts");
    22654:  699:    MPIU_CHKLMEM_MALLOC(displs,   int *, comm_size*sizeof(int), mpi_errno, "displs");
        -:  700:
   243656:  701:    for (i=0; i<comm_size; i++)
        -:  702:    {
   221002:  703:        recvcnts[i] = nbytes - i*scatter_size;
   221002:  704:        if (recvcnts[i] > scatter_size)
   198348:  705:            recvcnts[i] = scatter_size;
   221002:  706:        if (recvcnts[i] < 0)
    #####:  707:            recvcnts[i] = 0;
        -:  708:    }
        -:  709:
    22654:  710:    displs[0] = 0;
   221002:  711:    for (i=1; i<comm_size; i++)
   198348:  712:        displs[i] = displs[i-1] + recvcnts[i-1];
        -:  713:
    22654:  714:    left  = (comm_size + rank - 1) % comm_size;
    22654:  715:    right = (rank + 1) % comm_size;
        -:  716:
    22654:  717:    j     = rank;
    22654:  718:    jnext = left;
   221002:  719:    for (i=1; i<comm_size; i++)
        -:  720:    {
   198348:  721:        mpi_errno = 
        -:  722:            MPIC_Sendrecv((char *)tmp_buf +
        -:  723:                          displs[(j-root+comm_size)%comm_size],  
        -:  724:                          recvcnts[(j-root+comm_size)%comm_size],
        -:  725:                          MPI_BYTE, right, MPIR_BCAST_TAG, 
        -:  726:                          (char *)tmp_buf +
        -:  727:                          displs[(jnext-root+comm_size)%comm_size], 
        -:  728:                          recvcnts[(jnext-root+comm_size)%comm_size],  
        -:  729:                          MPI_BYTE, left,   
        -:  730:                          MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
   198348:  731:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  732:
   198348:  733:        j     = jnext;
   198348:  734:        jnext = (comm_size + jnext - 1) % comm_size;
        -:  735:    }
        -:  736:
    22654:  737:    if (!is_contig || !is_homogeneous)
        -:  738:    {
     3644:  739:        if (rank != root)
        -:  740:        {
     2848:  741:            position = 0;
     2848:  742:            NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count,
        -:  743:                        datatype, comm);
        -:  744:        }
        -:  745:    }
        -:  746:
        -:  747:fn_exit:
    48952:  748:    MPIU_CHKLMEM_FREEALL();
    22654:  749:    return mpi_errno;
        -:  750:fn_fail:
        -:  751:    goto fn_exit;
        -:  752:}
        -:  753:
        -:  754:/* A convenience function to de-clutter code and minimize copy-paste bugs.
        -:  755:   Invokes the coll_fns->Bcast override for the given communicator if non-null.
        -:  756:   Otherwise it invokes bcast_fn_ with the given args.
        -:  757:   
        -:  758:   NOTE: calls MPIU_ERR_POP on any failure, so a fn_fail label is needed. */
        -:  759:#define MPIR_Bcast_fn_or_override(bcast_fn_,mpi_errno_,buffer_,count_,datatype_,root_,comm_ptr_) \
        -:  760:    do {                                                                                         \
        -:  761:        if (comm_ptr_->coll_fns != NULL && comm_ptr_->coll_fns->Bcast != NULL)                   \
        -:  762:        {                                                                                        \
        -:  763:            /* --BEGIN USEREXTENSION-- */                                                        \
        -:  764:            mpi_errno_ = comm_ptr->coll_fns->Bcast(buffer_, count_,                              \
        -:  765:                                                   datatype_, root_, comm_ptr_);                 \
        -:  766:            /* --END USEREXTENSION-- */                                                          \
        -:  767:        }                                                                                        \
        -:  768:        else                                                                                     \
        -:  769:        {                                                                                        \
        -:  770:            mpi_errno_ = bcast_fn_(buffer_, count_, datatype_, root_, comm_ptr_);                \
        -:  771:        }                                                                                        \
        -:  772:        if (mpi_errno_) MPIU_ERR_POP(mpi_errno_);                                                \
        -:  773:    } while (0)
        -:  774:
        -:  775:/* FIXME This function uses some heuristsics based off of some testing on a
        -:  776: * cluster at Argonne.  We need a better system for detrmining and controlling
        -:  777: * the cutoff points for these algorithms.  If I've done this right, you should
        -:  778: * be able to make changes along these lines almost exclusively in this function
        -:  779: * and some new functions. [goodell@ 2008/01/07] */
        -:  780:/* begin:nested */
        -:  781:static int MPIR_SMP_Bcast(
        -:  782:        void *buffer, 
        -:  783:        int count, 
        -:  784:        MPI_Datatype datatype, 
        -:  785:        int root, 
        -:  786:        MPID_Comm *comm_ptr)
   128232:  787:{
   128232:  788:    int mpi_errno = MPI_SUCCESS;
        -:  789:    int type_size, is_homogeneous;
   128232:  790:    int nbytes=0;
        -:  791:
        -:  792:#if !defined(USE_SMP_COLLECTIVES)
        -:  793:    MPIU_Assert(0);
        -:  794:#endif
   128232:  795:    MPIU_Assert(MPIR_Comm_is_node_aware(comm_ptr));
        -:  796:
   128232:  797:    is_homogeneous = 1;
        -:  798:#ifdef MPID_HAS_HETERO
        -:  799:    if (comm_ptr->is_hetero)
        -:  800:        is_homogeneous = 0;
        -:  801:#endif
        -:  802:
        -:  803:    /* MPI_Type_size() might not give the accurate size of the packed
        -:  804:     * datatype for heterogeneous systems (because of padding, encoding,
        -:  805:     * etc). On the other hand, MPI_Pack_size() can become very
        -:  806:     * expensive, depending on the implementation, especially for
        -:  807:     * heterogeneous systems. We want to use MPI_Type_size() wherever
        -:  808:     * possible, and MPI_Pack_size() in other places.
        -:  809:     */
   128232:  810:    if (is_homogeneous) {
   128232:  811:        MPID_Datatype_get_size_macro(datatype, type_size);
        -:  812:    }
        -:  813:    else {
    #####:  814:        mpi_errno = NMPI_Pack_size(1, datatype, comm_ptr->handle, &type_size);
    #####:  815:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  816:    }
   128232:  817:    nbytes = type_size * count;
        -:  818:
   128232:  819:    if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_ptr->local_size < MPIR_BCAST_MIN_PROCS))
        -:  820:    {
        -:  821:        /* send to intranode-rank 0 on the root's node */
   102186:  822:        if (comm_ptr->node_comm != NULL &&
        -:  823:            MPIU_Get_intranode_rank(comm_ptr, root) > 0) /* is not the node root (0) */ 
        -:  824:        {                                                /* and is on our node (!-1) */
    57112:  825:            if (root == comm_ptr->rank) {
     8066:  826:                mpi_errno = MPIC_Send(buffer,count,datatype,0,
        -:  827:                                      MPIR_BCAST_TAG,comm_ptr->node_comm->handle); 
        -:  828:            }
    49046:  829:            else if (0 == comm_ptr->node_comm->rank) {
     8066:  830:                mpi_errno = MPIC_Recv(buffer,count,datatype,MPIU_Get_intranode_rank(comm_ptr, root),
        -:  831:                                      MPIR_BCAST_TAG,comm_ptr->node_comm->handle,MPI_STATUS_IGNORE);
        -:  832:            }
    57112:  833:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  834:        }
        -:  835:
        -:  836:        /* perform the internode broadcast */
   102186:  837:        if (comm_ptr->node_roots_comm != NULL)
        -:  838:        {
    20786:  839:            MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
        -:  840:                                      buffer, count, datatype,
        -:  841:                                      MPIU_Get_internode_rank(comm_ptr, root),
        -:  842:                                      comm_ptr->node_roots_comm);
        -:  843:        }
        -:  844:
        -:  845:        /* perform the intranode broadcast on all except for the root's node */
   102186:  846:        if (comm_ptr->node_comm != NULL)
        -:  847:        {
   102186:  848:            MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
        -:  849:                                      buffer, count, datatype, 0, comm_ptr->node_comm);
        -:  850:        }
        -:  851:    }
        -:  852:    else /* (nbytes > MPIR_BCAST_SHORT_MSG) && (comm_ptr->size >= MPIR_BCAST_MIN_PROCS) */
        -:  853:    {
        -:  854:        /* supposedly...
        -:  855:           smp+doubling good for pof2
        -:  856:           reg+ring better for non-pof2 */
    49372:  857:        if (nbytes < MPIR_BCAST_LONG_MSG && MPIU_is_pof2(comm_ptr->local_size, NULL))
        -:  858:        {
        -:  859:            /* medium-sized msg and pof2 np */
        -:  860:
        -:  861:            /* perform the intranode broadcast on the root's node */
     3392:  862:            if (comm_ptr->node_comm != NULL &&
        -:  863:                MPIU_Get_intranode_rank(comm_ptr, root) > 0) /* is not the node root (0) */ 
        -:  864:            {                                                /* and is on our node (!-1) */
        -:  865:                /* FIXME binomial may not be the best algorithm for on-node
        -:  866:                   bcast.  We need a more comprehensive system for selecting the
        -:  867:                   right algorithms here. */
     2912:  868:                MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
        -:  869:                                          buffer, count, datatype,
        -:  870:                                          MPIU_Get_intranode_rank(comm_ptr, root),
        -:  871:                                          comm_ptr->node_comm);
        -:  872:            }
        -:  873:
        -:  874:            /* perform the internode broadcast */
     3392:  875:            if (comm_ptr->node_roots_comm != NULL)
        -:  876:            {
      848:  877:                if (MPIU_is_pof2(comm_ptr->node_roots_comm->local_size, NULL))
        -:  878:                {
      424:  879:                    MPIR_Bcast_fn_or_override(MPIR_Bcast_scatter_doubling_allgather, mpi_errno,
        -:  880:                                              buffer, count, datatype,
        -:  881:                                              MPIU_Get_internode_rank(comm_ptr, root),
        -:  882:                                              comm_ptr->node_roots_comm);
        -:  883:                }
        -:  884:                else
        -:  885:                {
    #####:  886:                    MPIR_Bcast_fn_or_override(MPIR_Bcast_scatter_ring_allgather, mpi_errno,
        -:  887:                                              buffer, count, datatype,
        -:  888:                                              MPIU_Get_internode_rank(comm_ptr, root),
        -:  889:                                              comm_ptr->node_roots_comm);
        -:  890:                }
        -:  891:            }
        -:  892:
        -:  893:            /* perform the intranode broadcast on all except for the root's node */
     3392:  894:            if (comm_ptr->node_comm != NULL &&
        -:  895:                MPIU_Get_intranode_rank(comm_ptr, root) <= 0) /* 0 if root was local root too */
        -:  896:            {                                                 /* -1 if different node than root */
        -:  897:                /* FIXME binomial may not be the best algorithm for on-node
        -:  898:                   bcast.  We need a more comprehensive system for selecting the
        -:  899:                   right algorithms here. */
      480:  900:                MPIR_Bcast_fn_or_override(MPIR_Bcast_binomial, mpi_errno,
        -:  901:                                          buffer, count, datatype, 0, comm_ptr->node_comm);
        -:  902:            }
        -:  903:        }
        -:  904:        else /* large msg or non-pof2 */
        -:  905:        {
        -:  906:            /* FIXME It would be good to have an SMP-aware version of this
        -:  907:               algorithm that (at least approximately) minimized internode
        -:  908:               communication. */
    22654:  909:            mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
    22654:  910:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  911:        }
        -:  912:    }
        -:  913:
   128232:  914:fn_exit:
   128232:  915:    return mpi_errno;
        -:  916:fn_fail:
        -:  917:    goto fn_exit;
        -:  918:}
        -:  919:/* end:nested */
        -:  920:
        -:  921:/* This is the default implementation of broadcast. The algorithm is:
        -:  922:   
        -:  923:   Algorithm: MPI_Bcast
        -:  924:
        -:  925:   For short messages, we use a binomial tree algorithm. 
        -:  926:   Cost = lgp.alpha + n.lgp.beta
        -:  927:
        -:  928:   For long messages, we do a scatter followed by an allgather. 
        -:  929:   We first scatter the buffer using a binomial tree algorithm. This costs
        -:  930:   lgp.alpha + n.((p-1)/p).beta
        -:  931:   If the datatype is contiguous and the communicator is homogeneous,
        -:  932:   we treat the data as bytes and divide (scatter) it among processes
        -:  933:   by using ceiling division. For the noncontiguous or heterogeneous
        -:  934:   cases, we first pack the data into a temporary buffer by using
        -:  935:   MPI_Pack, scatter it as bytes, and unpack it after the allgather.
        -:  936:
        -:  937:   For the allgather, we use a recursive doubling algorithm for 
        -:  938:   medium-size messages and power-of-two number of processes. This
        -:  939:   takes lgp steps. In each step pairs of processes exchange all the
        -:  940:   data they have (we take care of non-power-of-two situations). This
        -:  941:   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
        -:  942:   because it may be slightly more in the non-power-of-two case, but
        -:  943:   it's still a logarithmic algorithm.) Therefore, for long messages
        -:  944:   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
        -:  945:
        -:  946:   Note that this algorithm has twice the latency as the tree algorithm
        -:  947:   we use for short messages, but requires lower bandwidth: 2.n.beta
        -:  948:   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
        -:  949:   this algorithm will perform better.
        -:  950:
        -:  951:   For long messages and for medium-size messages and non-power-of-two 
        -:  952:   processes, we use a ring algorithm for the allgather, which 
        -:  953:   takes p-1 steps, because it performs better than recursive doubling.
        -:  954:   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
        -:  955:
        -:  956:   Possible improvements: 
        -:  957:   For clusters of SMPs, we may want to do something differently to
        -:  958:   take advantage of shared memory on each node.
        -:  959:
        -:  960:   End Algorithm: MPI_Bcast
        -:  961:*/
        -:  962:
        -:  963:#undef FUNCNAME
        -:  964:#define FUNCNAME MPIR_Bcast
        -:  965:#undef FCNAME
        -:  966:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  967:/* not declared static because it is called in intercomm. allgatherv */
        -:  968:int MPIR_Bcast ( 
        -:  969:        void *buffer, 
        -:  970:        int count, 
        -:  971:        MPI_Datatype datatype, 
        -:  972:        int root, 
        -:  973:        MPID_Comm *comm_ptr )
  2054590:  974:{
  2054590:  975:    int mpi_errno = MPI_SUCCESS;
        -:  976:    int comm_size;
  2054590:  977:    int nbytes=0;
        -:  978:    int type_size, is_homogeneous;
        -:  979:    MPI_Comm comm;
  2054590:  980:    MPIU_THREADPRIV_DECL;
        -:  981:    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_BCAST);
        -:  982:
        -:  983:    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_BCAST);
        -:  984:
        -:  985:    /* The various MPIR_Bcast_* impls use NMPI functions, so we bump the nest
        -:  986:       count here to avoid repeatedly calling incr/decr. */
  2054590:  987:    MPIU_THREADPRIV_GET;
  2054590:  988:    MPIR_Nest_incr(); 
        -:  989:
        -:  990:    /* check if multiple threads are calling this collective function */
        -:  991:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
        -:  992:
  2054590:  993:    if (count == 0) goto fn_exit;
        -:  994:
        -:  995:#if defined(USE_SMP_COLLECTIVES)
  2054254:  996:    if (MPIR_Comm_is_node_aware(comm_ptr)) {
   128232:  997:        mpi_errno = MPIR_SMP_Bcast(buffer, count, datatype, root, comm_ptr);
   128232:  998:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  999:        goto fn_exit;
        -: 1000:    }
        -: 1001:#endif
        -: 1002:
  1926022: 1003:    comm = comm_ptr->handle;
  1926022: 1004:    comm_size = comm_ptr->local_size;
        -: 1005:
  1926022: 1006:    is_homogeneous = 1;
        -: 1007:#ifdef MPID_HAS_HETERO
        -: 1008:    if (comm_ptr->is_hetero)
        -: 1009:        is_homogeneous = 0;
        -: 1010:#endif
        -: 1011:
        -: 1012:    /* MPI_Type_size() might not give the accurate size of the packed
        -: 1013:     * datatype for heterogeneous systems (because of padding, encoding,
        -: 1014:     * etc). On the other hand, MPI_Pack_size() can become very
        -: 1015:     * expensive, depending on the implementation, especially for
        -: 1016:     * heterogeneous systems. We want to use MPI_Type_size() wherever
        -: 1017:     * possible, and MPI_Pack_size() in other places.
        -: 1018:     */
  1926022: 1019:    if (is_homogeneous) {
  1926022: 1020:        MPID_Datatype_get_size_macro(datatype, type_size);
        -: 1021:    }
        -: 1022:    else {
    #####: 1023:        mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size);
    #####: 1024:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1025:    }
  1926022: 1026:    nbytes = type_size * count;
        -: 1027:
  1926022: 1028:    if ((nbytes < MPIR_BCAST_SHORT_MSG) || (comm_size < MPIR_BCAST_MIN_PROCS))
        -: 1029:    {
  1926022: 1030:        mpi_errno = MPIR_Bcast_binomial(buffer, count, datatype, root, comm_ptr);
  1926022: 1031:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1032:    }
        -: 1033:    else /* (nbytes >= MPIR_BCAST_SHORT_MSG) && (comm_size >= MPIR_BCAST_MIN_PROCS) */
        -: 1034:    {
    #####: 1035:        if ((nbytes < MPIR_BCAST_LONG_MSG) && (MPIU_is_pof2(comm_size, NULL)))
        -: 1036:        {
    #####: 1037:            mpi_errno = MPIR_Bcast_scatter_doubling_allgather(buffer, count, datatype, root, comm_ptr);
    #####: 1038:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1039:        }
        -: 1040:        else /* (nbytes >= MPIR_BCAST_LONG_MSG) || !(comm_size_is_pof2) */
        -: 1041:        {
        -: 1042:            /* We want the ring algorithm whether or not we have a
        -: 1043:               topologically aware communicator.  Doing inter/intra-node
        -: 1044:               communication phases breaks the pipelining of the algorithm.  */
    #####: 1045:            mpi_errno = MPIR_Bcast_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr);
    #####: 1046:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1047:        }
        -: 1048:    }
        -: 1049:
  2054590: 1050:fn_exit:
  2054590: 1051:    MPIR_Nest_decr();
        -: 1052:
        -: 1053:    /* check if multiple threads are calling this collective function */
        -: 1054:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
        -: 1055:
        -: 1056:    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST);
        -: 1057:
  2054590: 1058:    return mpi_errno;
        -: 1059:fn_fail:
        -: 1060:    goto fn_exit;
        -: 1061:}
        -: 1062:
        -: 1063:/* A simple utility function to that calls the comm_ptr->coll_fns->Bcast
        -: 1064:   override if it exists or else it calls MPIR_Bcast with the same arguments.
        -: 1065:   This function just makes the high-level broadcast logic easier to read while
        -: 1066:   still accomodating coll_fns-style overrides.  It also reduces future errors
        -: 1067:   by eliminating the duplication of Bcast arguments. 
        -: 1068:
        -: 1069:   This routine is used in other files as well (barrier.c, allreduce.c)
        -: 1070:
        -: 1071:   TODO This function should be deprecated in favor of a direct call to
        -: 1072:   MPIR_Bcast now that we handle SMP-awareness inside of MPIR_Bcast instead
        -: 1073:   of MPI_Bcast.
        -: 1074:*/
        -: 1075:#undef FUNCNAME
        -: 1076:#define FUNCNAME MPIR_Bcast_or_coll_fn
        -: 1077:#undef FCNAME
        -: 1078:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -: 1079:int MPIR_Bcast_or_coll_fn(void *buffer, 
        -: 1080:			  int count, 
        -: 1081:			  MPI_Datatype datatype, 
        -: 1082:			  int root, 
        -: 1083:			  MPID_Comm *comm_ptr)
  1912590: 1084:{
  1912590: 1085:    int mpi_errno = MPI_SUCCESS;
        -: 1086:
  1912590: 1087:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL)
        -: 1088:    {
        -: 1089:        /* --BEGIN USEREXTENSION-- */
    #####: 1090:        mpi_errno = comm_ptr->node_roots_comm->coll_fns->Bcast(buffer, count,
        -: 1091:                                                               datatype, root, comm_ptr);
        -: 1092:        /* --END USEREXTENSION-- */
        -: 1093:    }
        -: 1094:    else {
  1912590: 1095:        mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr);
        -: 1096:    }
        -: 1097:
  1912590: 1098:    return mpi_errno;
        -: 1099:}
        -: 1100:
        -: 1101:#undef FUNCNAME
        -: 1102:#define FUNCNAME MPIR_Bcast_inter
        -: 1103:#undef FCNAME
        -: 1104:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -: 1105:/* begin:nested */
        -: 1106:/* Not PMPI_LOCAL because it is called in intercomm allgather */
        -: 1107:int MPIR_Bcast_inter ( 
        -: 1108:    void *buffer, 
        -: 1109:    int count, 
        -: 1110:    MPI_Datatype datatype, 
        -: 1111:    int root, 
        -: 1112:    MPID_Comm *comm_ptr )
     7182: 1113:{
        -: 1114:/*  Intercommunicator broadcast.
        -: 1115:    Root sends to rank 0 in remote group. Remote group does local
        -: 1116:    intracommunicator broadcast.
        -: 1117:*/
        -: 1118:    int rank, mpi_errno;
        -: 1119:    MPI_Status status;
     7182: 1120:    MPID_Comm *newcomm_ptr = NULL;
        -: 1121:    MPI_Comm comm;
        -: 1122:    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_BCAST_INTER);
        -: 1123:
        -: 1124:    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_BCAST_INTER);
        -: 1125:
     7182: 1126:    comm = comm_ptr->handle;
        -: 1127:
     7182: 1128:    if (root == MPI_PROC_NULL)
        -: 1129:    {
        -: 1130:        /* local processes other than root do nothing */
     2343: 1131:        mpi_errno = MPI_SUCCESS;
        -: 1132:    }
     4839: 1133:    else if (root == MPI_ROOT)
        -: 1134:    {
        -: 1135:        /* root sends to rank 0 on remote group and returns */
        -: 1136:        MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
     1196: 1137:        mpi_errno =  MPIC_Send(buffer, count, datatype, 0,
        -: 1138:                               MPIR_BCAST_TAG, comm); 
     1196: 1139:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1140:        MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
        -: 1141:    }
        -: 1142:    else
        -: 1143:    {
        -: 1144:        /* remote group. rank 0 on remote group receives from root */
        -: 1145:        
     3643: 1146:        rank = comm_ptr->rank;
        -: 1147:        
     3643: 1148:        if (rank == 0)
        -: 1149:        {
     1196: 1150:            mpi_errno = MPIC_Recv(buffer, count, datatype, root,
        -: 1151:                                  MPIR_BCAST_TAG, comm, &status);
     1196: 1152:            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1153:        }
        -: 1154:        
        -: 1155:        /* Get the local intracommunicator */
     3643: 1156:        if (!comm_ptr->local_comm)
       97: 1157:            MPIR_Setup_intercomm_localcomm( comm_ptr );
        -: 1158:
     3643: 1159:        newcomm_ptr = comm_ptr->local_comm;
        -: 1160:
        -: 1161:        /* now do the usual broadcast on this intracommunicator
        -: 1162:           with rank 0 as root. */
     3643: 1163:        mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, newcomm_ptr);
     3643: 1164:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -: 1165:    }
        -: 1166:
     7182: 1167:fn_fail:
        -: 1168:    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_BCAST_INTER);
     7182: 1169:    return mpi_errno;
        -: 1170:}
        -: 1171:/* end:nested */
        -: 1172:#endif /* MPICH_MPI_FROM_PMPI */
        -: 1173:
        -: 1174:#undef FUNCNAME
        -: 1175:#define FUNCNAME MPI_Bcast
        -: 1176:#undef FCNAME
        -: 1177:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -: 1178:
        -: 1179:/*@
        -: 1180:MPI_Bcast - Broadcasts a message from the process with rank "root" to
        -: 1181:            all other processes of the communicator
        -: 1182:
        -: 1183:Input/Output Parameter:
        -: 1184:. buffer - starting address of buffer (choice) 
        -: 1185:
        -: 1186:Input Parameters:
        -: 1187:+ count - number of entries in buffer (integer) 
        -: 1188:. datatype - data type of buffer (handle) 
        -: 1189:. root - rank of broadcast root (integer) 
        -: 1190:- comm - communicator (handle) 
        -: 1191:
        -: 1192:.N ThreadSafe
        -: 1193:
        -: 1194:.N Fortran
        -: 1195:
        -: 1196:.N Errors
        -: 1197:.N MPI_SUCCESS
        -: 1198:.N MPI_ERR_COMM
        -: 1199:.N MPI_ERR_COUNT
        -: 1200:.N MPI_ERR_TYPE
        -: 1201:.N MPI_ERR_BUFFER
        -: 1202:.N MPI_ERR_ROOT
        -: 1203:@*/
        -: 1204:int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root, 
        -: 1205:               MPI_Comm comm )
   123941: 1206:{
   123941: 1207:    int mpi_errno = MPI_SUCCESS;
   123941: 1208:    MPID_Comm *comm_ptr = NULL;
   123941: 1209:    MPIU_THREADPRIV_DECL;
        -: 1210:    MPID_MPI_STATE_DECL(MPID_STATE_MPI_BCAST);
        -: 1211:
   123941: 1212:    MPIR_ERRTEST_INITIALIZED_ORDIE();
        -: 1213:    
   123941: 1214:    MPIU_THREAD_CS_ENTER(ALLFUNC,);
        -: 1215:    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_BCAST);
        -: 1216:
        -: 1217:    /* Validate parameters, especially handles needing to be converted */
        -: 1218:#   ifdef HAVE_ERROR_CHECKING
        -: 1219:    {
        -: 1220:        MPID_BEGIN_ERROR_CHECKS;
        -: 1221:        {
   123941: 1222:	    MPIR_ERRTEST_COMM(comm, mpi_errno);
   123941: 1223:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -: 1224:	}
        -: 1225:        MPID_END_ERROR_CHECKS;
        -: 1226:    }
        -: 1227:#   endif /* HAVE_ERROR_CHECKING */
        -: 1228:
        -: 1229:    /* Convert MPI object handles to object pointers */
   123939: 1230:    MPID_Comm_get_ptr( comm, comm_ptr );
        -: 1231:
        -: 1232:    /* Validate parameters and objects (post conversion) */
        -: 1233:#   ifdef HAVE_ERROR_CHECKING
        -: 1234:    {
        -: 1235:        MPID_BEGIN_ERROR_CHECKS;
        -: 1236:        {
   123939: 1237:            MPID_Datatype *datatype_ptr = NULL;
        -: 1238:	    
   123939: 1239:            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
   123939: 1240:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
   123937: 1241:	    MPIR_ERRTEST_COUNT(count, mpi_errno);
   123937: 1242:	    MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
   123937: 1243:	    if (comm_ptr->comm_kind == MPID_INTRACOMM) {
   121833: 1244:		MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno);
        -: 1245:	    }
        -: 1246:	    else {
     2104: 1247:		MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno);
        -: 1248:	    }
        -: 1249:	    
   123937: 1250:            if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
    33240: 1251:                MPID_Datatype_get_ptr(datatype, datatype_ptr);
    33240: 1252:                MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
    33240: 1253:                MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
        -: 1254:            }
        -: 1255:
   123937: 1256:            MPIR_ERRTEST_BUF_INPLACE(buffer, count, mpi_errno);
   123937: 1257:            MPIR_ERRTEST_USERBUFFER(buffer,count,datatype,mpi_errno);
        -: 1258:            
   123937: 1259:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -: 1260:        }
        -: 1261:        MPID_END_ERROR_CHECKS;
        -: 1262:    }
        -: 1263:#   endif /* HAVE_ERROR_CHECKING */
        -: 1264:
        -: 1265:    /* ... body of routine ...  */
        -: 1266:
   123929: 1267:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL)
        -: 1268:    {
        -: 1269:	/* --BEGIN USEREXTENSION-- */
    #####: 1270:	mpi_errno = comm_ptr->coll_fns->Bcast(buffer, count,
        -: 1271:                                              datatype, root, comm_ptr);
        -: 1272:	/* --END USEREXTENSION-- */
        -: 1273:    }
        -: 1274:    else
        -: 1275:    {
   123929: 1276:        if (comm_ptr->comm_kind == MPID_INTRACOMM)
        -: 1277:	{
        -: 1278:            /* intracommunicator */
   121825: 1279:            mpi_errno = MPIR_Bcast( buffer, count, datatype, root, comm_ptr );
        -: 1280:	}
        -: 1281:        else
        -: 1282:	{
        -: 1283:            /* intercommunicator */
     2104: 1284:            mpi_errno = MPIR_Bcast_inter( buffer, count, datatype, root, comm_ptr );
        -: 1285:        }
        -: 1286:    }
        -: 1287:
   123929: 1288:    if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -: 1289:
        -: 1290:    /* ... end of body of routine ... */
        -: 1291:    
   123941: 1292:  fn_exit:
        -: 1293:    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_BCAST);
   123941: 1294:    MPIU_THREAD_CS_EXIT(ALLFUNC,);
   123941: 1295:    return mpi_errno;
        -: 1296:
       12: 1297:  fn_fail:
        -: 1298:    /* --BEGIN ERROR HANDLING-- */
        -: 1299:#   ifdef HAVE_ERROR_CHECKING
        -: 1300:    {
       12: 1301:	mpi_errno = MPIR_Err_create_code(
        -: 1302:	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, 
        -: 1303:	    "**mpi_bcast",
        -: 1304:	    "**mpi_bcast %p %d %D %d %C", buffer, count, datatype, root, comm);
        -: 1305:    }
        -: 1306:#   endif
       12: 1307:    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
       12: 1308:    goto fn_exit;
        -: 1309:    /* --END ERROR HANDLING-- */
        -: 1310:}