-:    0:Source:/home/MPI/testing/mpich2/mpich2/src/mpi/coll/allgather.c
        -:    0:Graph:allgather.gcno
        -:    0:Data:allgather.gcda
        -:    0:Runs:4000
        -:    0:Programs:1232
        -:    1:/* -*- Mode: C; c-basic-offset:4 ; -*- */
        -:    2:/*
        -:    3: *
        -:    4: *  (C) 2001 by Argonne National Laboratory.
        -:    5: *      See COPYRIGHT in top-level directory.
        -:    6: */
        -:    7:
        -:    8:#include "mpiimpl.h"
        -:    9:
        -:   10:/* -- Begin Profiling Symbol Block for routine MPI_Allgather */
        -:   11:#if defined(HAVE_PRAGMA_WEAK)
        -:   12:#pragma weak MPI_Allgather = PMPI_Allgather
        -:   13:#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
        -:   14:#pragma _HP_SECONDARY_DEF PMPI_Allgather  MPI_Allgather
        -:   15:#elif defined(HAVE_PRAGMA_CRI_DUP)
        -:   16:#pragma _CRI duplicate MPI_Allgather as PMPI_Allgather
        -:   17:#endif
        -:   18:/* -- End Profiling Symbol Block */
        -:   19:
        -:   20:/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
        -:   21:   the MPI routines */
        -:   22:#ifndef MPICH_MPI_FROM_PMPI
        -:   23:#undef MPI_Allgather
        -:   24:#define MPI_Allgather PMPI_Allgather
        -:   25:
        -:   26:/* This is the default implementation of allgather. The algorithm is:
        -:   27:   
        -:   28:   Algorithm: MPI_Allgather
        -:   29:
        -:   30:   For short messages and non-power-of-two no. of processes, we use
        -:   31:   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
        -:   32:   paper. It is a variant of the dissemination algorithm for
        -:   33:   barrier. It takes ceiling(lg p) steps.
        -:   34:
        -:   35:   Cost = lgp.alpha + n.((p-1)/p).beta
        -:   36:   where n is total size of data gathered on each process.
        -:   37:
        -:   38:   For short or medium-size messages and power-of-two no. of
        -:   39:   processes, we use the recursive doubling algorithm.
        -:   40:
        -:   41:   Cost = lgp.alpha + n.((p-1)/p).beta
        -:   42:
        -:   43:   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
        -:   44:   algorithm in all cases because of the pairwise-exchange property of
        -:   45:   recursive doubling (see Benson et al paper in Euro PVM/MPI
        -:   46:   2003).
        -:   47:
        -:   48:   It is interesting to note that either of the above algorithms for
        -:   49:   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
        -:   50:
        -:   51:   For long messages, or for medium-size messages with a non-power-of-two
        -:   52:   no. of processes, we use a ring algorithm. In the first step, each
        -:   53:   process i sends its contribution to process i+1 and receives
        -:   54:   the contribution from process i-1 (with wrap-around). From the
        -:   55:   second step onwards, each process i forwards to process i+1 the
        -:   56:   data it received from process i-1 in the previous step. This takes
        -:   57:   a total of p-1 steps.
        -:   58:
        -:   59:   Cost = (p-1).alpha + n.((p-1)/p).beta
        -:   60:
        -:   61:   We use this algorithm instead of recursive doubling for long
        -:   62:   messages because we find that this communication pattern (nearest
        -:   63:   neighbor) performs twice as fast as recursive doubling for long
        -:   64:   messages (on Myrinet and IBM SP).
        -:   65:
        -:   66:   Possible improvements: 
        -:   67:
        -:   68:   End Algorithm: MPI_Allgather
        -:   69:*/
        -:   70:/* begin:nested */
        -:   71:/* not declared static because a machine-specific function may call this 
        -:   72:   one in some cases */
        -:   73:int MPIR_Allgather ( 
        -:   74:    void *sendbuf, 
        -:   75:    int sendcount, 
        -:   76:    MPI_Datatype sendtype,
        -:   77:    void *recvbuf, 
        -:   78:    int recvcount, 
        -:   79:    MPI_Datatype recvtype, 
        -:   80:    MPID_Comm *comm_ptr )
  2571022:   81:{
        -:   82:    int        comm_size, rank;
  2571022:   83:    int        mpi_errno = MPI_SUCCESS;
        -:   84:    MPI_Aint   recvtype_extent, tot_bytes;
        -:   85:    MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
        -:   86:    int        j, i, pof2, src, rem;
        -:   87:    static const char FCNAME[] = "MPIR_Allgather";
  2571022:   88:    void *tmp_buf = NULL;
        -:   89:    int curr_cnt, dst, type_size, left, right, jnext, comm_size_is_pof2;
        -:   90:    MPI_Comm comm;
        -:   91:    MPI_Status status;
        -:   92:    int mask, dst_tree_root, my_tree_root, is_homogeneous,  
  2571022:   93:        send_offset, recv_offset, last_recv_cnt = 0, nprocs_completed, k,
        -:   94:        offset, tmp_mask, tree_root;
        -:   95:#ifdef MPID_HAS_HETERO
        -:   96:    int position, tmp_buf_size, nbytes;
        -:   97:#endif
        -:   98:
  2571022:   99:    MPIU_CHKLMEM_DECL(1);
        -:  100:
  2571022:  101:    if (((sendcount == 0) && (sendbuf != MPI_IN_PLACE)) || (recvcount == 0))
      220:  102:        return MPI_SUCCESS;
        -:  103:    
  2570802:  104:    comm = comm_ptr->handle;
  2570802:  105:    comm_size = comm_ptr->local_size;
  2570802:  106:    rank = comm_ptr->rank;
        -:  107:
  2570802:  108:    MPID_Datatype_get_extent_macro( recvtype, recvtype_extent );
  2570802:  109:    MPID_Datatype_get_size_macro( recvtype, type_size );
        -:  110:
        -:  111:    /* This is the largest offset we add to recvbuf */
        -:  112:    MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
        -:  113:				     (comm_size * recvcount * recvtype_extent));
        -:  114:
        -:  115:    /* check if comm_size is a power of two */
  2570802:  116:    pof2 = 1;
 10280982:  117:    while (pof2 < comm_size)
  5139378:  118:        pof2 *= 2;
  2570802:  119:    if (pof2 == comm_size) 
  1981625:  120:        comm_size_is_pof2 = 1;
        -:  121:    else
   589177:  122:        comm_size_is_pof2 = 0;
        -:  123:
        -:  124:    /* check if multiple threads are calling this collective function */
        -:  125:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
        -:  126:
  2570802:  127:    tot_bytes = (MPI_Aint)recvcount * comm_size * type_size;
  2570802:  128:    if ((tot_bytes < MPIR_ALLGATHER_LONG_MSG) && (comm_size_is_pof2 == 1)) {
        -:  129:
        -:  130:        /* Short or medium size message and power-of-two no. of processes. Use
        -:  131:         * recursive doubling algorithm */   
        -:  132:
  1981609:  133:    is_homogeneous = 1;
        -:  134:#ifdef MPID_HAS_HETERO
        -:  135:    if (comm_ptr->is_hetero)
        -:  136:        is_homogeneous = 0;
        -:  137:#endif
        -:  138:    
  1981609:  139:        if (is_homogeneous) {
        -:  140:            /* homogeneous. no need to pack into tmp_buf on each node. copy
        -:  141:               local data into recvbuf */ 
  1981609:  142:            if (sendbuf != MPI_IN_PLACE) {
  1553348:  143:                mpi_errno = MPIR_Localcopy (sendbuf, sendcount, sendtype,
        -:  144:                                            ((char *)recvbuf +
        -:  145:                                             rank*recvcount*recvtype_extent), 
        -:  146:                                            recvcount, recvtype);
  1553348:  147:		if (mpi_errno) { 
    #####:  148:		    MPIU_ERR_POP(mpi_errno);
        -:  149:		}
        -:  150:            }
        -:  151:            
  1981609:  152:            curr_cnt = recvcount;
        -:  153:            
  1981609:  154:            mask = 0x1;
  1981609:  155:            i = 0;
  7915852:  156:            while (mask < comm_size) {
  3952634:  157:                dst = rank ^ mask;
        -:  158:                
        -:  159:                /* find offset into send and recv buffers. zero out 
        -:  160:                   the least significant "i" bits of rank and dst to 
        -:  161:                   find root of src and dst subtrees. Use ranks of 
        -:  162:                   roots as index to send from and recv into buffer */ 
        -:  163:                
  3952634:  164:                dst_tree_root = dst >> i;
  3952634:  165:                dst_tree_root <<= i;
        -:  166:                
  3952634:  167:                my_tree_root = rank >> i;
  3952634:  168:                my_tree_root <<= i;
        -:  169:
        -:  170:		/* FIXME: saving an MPI_Aint into an int */
  3952634:  171:                send_offset = my_tree_root * recvcount * recvtype_extent;
  3952634:  172:                recv_offset = dst_tree_root * recvcount * recvtype_extent;
        -:  173:                
  3952634:  174:                if (dst < comm_size) {
  3952634:  175:                    mpi_errno = MPIC_Sendrecv(((char *)recvbuf + send_offset),
        -:  176:                                              curr_cnt, recvtype, dst,
        -:  177:                                              MPIR_ALLGATHER_TAG,  
        -:  178:                                              ((char *)recvbuf + recv_offset),
        -:  179:					      (comm_size-dst_tree_root)*recvcount,
        -:  180:                                              recvtype, dst,
        -:  181:                                              MPIR_ALLGATHER_TAG, comm, &status);
  3952634:  182:		    if (mpi_errno) { 
    #####:  183:			MPIU_ERR_POP(mpi_errno);
        -:  184:		    }
        -:  185:                    
  3952634:  186:                    NMPI_Get_count(&status, recvtype, &last_recv_cnt);
  3952634:  187:                    curr_cnt += last_recv_cnt;
        -:  188:                }
        -:  189:                
        -:  190:                /* if some processes in this process's subtree in this step
        -:  191:                   did not have any destination process to communicate with
        -:  192:                   because of non-power-of-two, we need to send them the
        -:  193:                   data that they would normally have received from those
        -:  194:                   processes. That is, the haves in this subtree must send to
        -:  195:                   the havenots. We use a logarithmic recursive-halving algorithm
        -:  196:                   for this. */
        -:  197:                
        -:  198:                /* This part of the code will not currently be
        -:  199:                 executed because we are not using recursive
        -:  200:                 doubling for non power of two. Mark it as experimental
        -:  201:                 so that it doesn't show up as red in the coverage
        -:  202:                 tests. */  
        -:  203:
        -:  204:		/* --BEGIN EXPERIMENTAL-- */
  3952634:  205:                if (dst_tree_root + mask > comm_size) {
    #####:  206:                    nprocs_completed = comm_size - my_tree_root - mask;
        -:  207:                    /* nprocs_completed is the number of processes in this
        -:  208:                       subtree that have all the data. Send data to others
        -:  209:                       in a tree fashion. First find root of current tree
        -:  210:                       that is being divided into two. k is the number of
        -:  211:                       least-significant bits in this process's rank that
        -:  212:                       must be zeroed out to find the rank of the root */ 
    #####:  213:                    j = mask;
    #####:  214:                    k = 0;
    #####:  215:                    while (j) {
    #####:  216:                        j >>= 1;
    #####:  217:                        k++;
        -:  218:                    }
    #####:  219:                    k--;
        -:  220:
        -:  221:		    /* FIXME: saving an MPI_Aint into an int */
    #####:  222:                    offset = recvcount * (my_tree_root + mask) * recvtype_extent;
    #####:  223:                    tmp_mask = mask >> 1;
        -:  224:                    
    #####:  225:                    while (tmp_mask) {
    #####:  226:                        dst = rank ^ tmp_mask;
        -:  227:                        
    #####:  228:                        tree_root = rank >> k;
    #####:  229:                        tree_root <<= k;
        -:  230:                        
        -:  231:                        /* send only if this proc has data and destination
        -:  232:                           doesn't have data. at any step, multiple processes
        -:  233:                           can send if they have the data */
    #####:  234:                        if ((dst > rank) && 
        -:  235:                            (rank < tree_root + nprocs_completed)
        -:  236:                            && (dst >= tree_root + nprocs_completed)) {
    #####:  237:                            mpi_errno = MPIC_Send(((char *)recvbuf + offset),
        -:  238:                                                  last_recv_cnt,
        -:  239:                                                  recvtype, dst,
        -:  240:                                                  MPIR_ALLGATHER_TAG, comm); 
        -:  241:                            /* last_recv_cnt was set in the previous
        -:  242:                               receive. that's the amount of data to be
        -:  243:                               sent now. */
    #####:  244:			    if (mpi_errno) { 
    #####:  245:				MPIU_ERR_POP(mpi_errno);
        -:  246:			    }
        -:  247:                        }
        -:  248:                        /* recv only if this proc. doesn't have data and sender
        -:  249:                           has data */
    #####:  250:                        else if ((dst < rank) && 
        -:  251:                                 (dst < tree_root + nprocs_completed) &&
        -:  252:                                 (rank >= tree_root + nprocs_completed)) {
    #####:  253:                            mpi_errno = MPIC_Recv(((char *)recvbuf + offset),  
        -:  254:						  (comm_size - (my_tree_root + mask))*recvcount,
        -:  255:                                                  recvtype, dst,
        -:  256:                                                  MPIR_ALLGATHER_TAG,
        -:  257:                                                  comm, &status); 
        -:  258:                            /* nprocs_completed is also equal to the
        -:  259:                               no. of processes whose data we don't have */
    #####:  260:			    if (mpi_errno) { 
    #####:  261:				MPIU_ERR_POP(mpi_errno);
        -:  262:			    }
    #####:  263:                            NMPI_Get_count(&status, recvtype, &last_recv_cnt);
    #####:  264:                            curr_cnt += last_recv_cnt;
        -:  265:                        }
    #####:  266:                        tmp_mask >>= 1;
    #####:  267:                        k--;
        -:  268:                    }
        -:  269:                }
        -:  270:                /* --END EXPERIMENTAL-- */
        -:  271:                
  3952634:  272:                mask <<= 1;
  3952634:  273:                i++;
        -:  274:            }
        -:  275:        }
        -:  276:        
        -:  277:#ifdef MPID_HAS_HETERO
        -:  278:        else { 
        -:  279:            /* heterogeneous. need to use temp. buffer. */
        -:  280:            
        -:  281:            NMPI_Pack_size(recvcount*comm_size, recvtype, comm, &tmp_buf_size);
        -:  282:            
        -:  283:            MPIU_CHKLMEM_MALLOC(tmp_buf, void*, tmp_buf_size, mpi_errno, "tmp_buf");
        -:  284:            
        -:  285:            /* calculate the value of nbytes, the number of bytes in packed
        -:  286:               representation that each process contributes. We can't simply divide
        -:  287:               tmp_buf_size by comm_size because tmp_buf_size is an upper
        -:  288:               bound on the amount of memory required. (For example, for
        -:  289:               a single integer, MPICH-1 returns pack_size=12.) Therefore, we
        -:  290:               actually pack some data into tmp_buf and see by how much
        -:  291:               'position' is incremented. */
        -:  292:            
        -:  293:            position = 0;
        -:  294:            NMPI_Pack(recvbuf, 1, recvtype, tmp_buf, tmp_buf_size,
        -:  295:                      &position, comm);
        -:  296:            nbytes = position*recvcount;
        -:  297:            
        -:  298:            /* pack local data into right location in tmp_buf */
        -:  299:            position = rank * nbytes;
        -:  300:            if (sendbuf != MPI_IN_PLACE) {
        -:  301:                NMPI_Pack(sendbuf, sendcount, sendtype, tmp_buf, tmp_buf_size,
        -:  302:                          &position, comm);
        -:  303:            }
        -:  304:            else {
        -:  305:                /* if in_place specified, local data is found in recvbuf */
        -:  306:                NMPI_Pack(((char *)recvbuf + recvtype_extent*rank), recvcount,
        -:  307:                          recvtype, tmp_buf, tmp_buf_size, 
        -:  308:                          &position, comm);
        -:  309:            }
        -:  310:            
        -:  311:            curr_cnt = nbytes;
        -:  312:            
        -:  313:            mask = 0x1;
        -:  314:            i = 0;
        -:  315:            while (mask < comm_size) {
        -:  316:                dst = rank ^ mask;
        -:  317:                
        -:  318:                /* find offset into send and recv buffers. zero out 
        -:  319:                   the least significant "i" bits of rank and dst to 
        -:  320:                   find root of src and dst subtrees. Use ranks of 
        -:  321:                   roots as index to send from and recv into buffer. */ 
        -:  322:                
        -:  323:                dst_tree_root = dst >> i;
        -:  324:                dst_tree_root <<= i;
        -:  325:                
        -:  326:                my_tree_root = rank >> i;
        -:  327:                my_tree_root <<= i;
        -:  328:                
        -:  329:                send_offset = my_tree_root * nbytes;
        -:  330:                recv_offset = dst_tree_root * nbytes;
        -:  331:                
        -:  332:                if (dst < comm_size) {
        -:  333:                    mpi_errno = MPIC_Sendrecv(((char *)tmp_buf + send_offset),
        -:  334:                                              curr_cnt, MPI_BYTE, dst,
        -:  335:                                              MPIR_ALLGATHER_TAG,  
        -:  336:                                              ((char *)tmp_buf + recv_offset),
        -:  337:					      tmp_buf_size - recv_offset,
        -:  338:                                              MPI_BYTE, dst,
        -:  339:                                              MPIR_ALLGATHER_TAG, comm, &status);
        -:  340:		    if (mpi_errno) { 
        -:  341:			MPIU_ERR_POP(mpi_errno);
        -:  342:		    }
        -:  343:                    
        -:  344:                    NMPI_Get_count(&status, MPI_BYTE, &last_recv_cnt);
        -:  345:                    curr_cnt += last_recv_cnt;
        -:  346:                }
        -:  347:                
        -:  348:                /* if some processes in this process's subtree in this step
        -:  349:                   did not have any destination process to communicate with
        -:  350:                   because of non-power-of-two, we need to send them the
        -:  351:                   data that they would normally have received from those
        -:  352:                   processes. That is, the haves in this subtree must send to
        -:  353:                   the havenots. We use a logarithmic recursive-halving 
        -:  354:		   algorithm for this. */
        -:  355:                
        -:  356:                if (dst_tree_root + mask > comm_size) {
        -:  357:                    nprocs_completed = comm_size - my_tree_root - mask;
        -:  358:                    /* nprocs_completed is the number of processes in this
        -:  359:                       subtree that have all the data. Send data to others
        -:  360:                       in a tree fashion. First find root of current tree
        -:  361:                       that is being divided into two. k is the number of
        -:  362:                       least-significant bits in this process's rank that
        -:  363:                       must be zeroed out to find the rank of the root */ 
        -:  364:                    j = mask;
        -:  365:                    k = 0;
        -:  366:                    while (j) {
        -:  367:                        j >>= 1;
        -:  368:                        k++;
        -:  369:                    }
        -:  370:                    k--;
        -:  371:                    
        -:  372:                    offset = nbytes * (my_tree_root + mask);
        -:  373:                    tmp_mask = mask >> 1;
        -:  374:                    
        -:  375:                    while (tmp_mask) {
        -:  376:                        dst = rank ^ tmp_mask;
        -:  377:                        
        -:  378:                        tree_root = rank >> k;
        -:  379:                        tree_root <<= k;
        -:  380:                        
        -:  381:                        /* send only if this proc has data and destination
        -:  382:                           doesn't have data. at any step, multiple processes
        -:  383:                           can send if they have the data */
        -:  384:                        if ((dst > rank) && 
        -:  385:                            (rank < tree_root + nprocs_completed)
        -:  386:                            && (dst >= tree_root + nprocs_completed)) {
        -:  387:                            
        -:  388:                            mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
        -:  389:                                                  last_recv_cnt, MPI_BYTE,
        -:  390:                                                  dst, MPIR_ALLGATHER_TAG,
        -:  391:                                                  comm);  
        -:  392:                            /* last_recv_cnt was set in the previous
        -:  393:                               receive. that's the amount of data to be
        -:  394:                               sent now. */
        -:  395:			    if (mpi_errno) { 
        -:  396:				MPIU_ERR_POP(mpi_errno);
        -:  397:			    }
        -:  398:                        }
        -:  399:                        /* recv only if this proc. doesn't have data and sender
        -:  400:                           has data */
        -:  401:                        else if ((dst < rank) && 
        -:  402:                                 (dst < tree_root + nprocs_completed) &&
        -:  403:                                 (rank >= tree_root + nprocs_completed)) {
        -:  404:                            mpi_errno = MPIC_Recv(((char *)tmp_buf + offset),
        -:  405:                                                  tmp_buf_size - offset,
        -:  406:                                                  MPI_BYTE, dst,
        -:  407:                                                  MPIR_ALLGATHER_TAG,
        -:  408:                                                  comm, &status); 
        -:  409:                            /* nprocs_completed is also equal to the
        -:  410:                               no. of processes whose data we don't have */
        -:  411:			    if (mpi_errno) { 
        -:  412:				MPIU_ERR_POP(mpi_errno);
        -:  413:			    }
        -:  414:                            NMPI_Get_count(&status, MPI_BYTE, &last_recv_cnt);
        -:  415:                            curr_cnt += last_recv_cnt;
        -:  416:                        }
        -:  417:                        tmp_mask >>= 1;
        -:  418:                        k--;
        -:  419:                    }
        -:  420:                }
        -:  421:                mask <<= 1;
        -:  422:                i++;
        -:  423:            }
        -:  424:            
        -:  425:            position = 0;
        -:  426:            NMPI_Unpack(tmp_buf, tmp_buf_size, &position, recvbuf,
        -:  427:                        recvcount*comm_size, recvtype, comm);            
        -:  428:        }
        -:  429:#endif /* MPID_HAS_HETERO */
        -:  430:    }
        -:  431:
   589193:  432:    else if (tot_bytes < MPIR_ALLGATHER_SHORT_MSG) {
        -:  433:        /* Short message and non-power-of-two no. of processes. Use
        -:  434:         * Bruck algorithm (see description above). */
        -:  435:
        -:  436:        /* allocate a temporary buffer of the same size as recvbuf. */
        -:  437:
        -:  438:        /* get true extent of recvtype */
   588775:  439:        mpi_errno = NMPI_Type_get_true_extent(recvtype, &recvtype_true_lb,
        -:  440:                                              &recvtype_true_extent);  
   588775:  441:	if (mpi_errno) { 
    #####:  442:	    MPIU_ERR_POP(mpi_errno);
        -:  443:	}
        -:  444:            
   588775:  445:        recvbuf_extent = recvcount * comm_size *
        -:  446:            (MPIR_MAX(recvtype_true_extent, recvtype_extent));
        -:  447:
   588775:  448:        MPIU_CHKLMEM_MALLOC(tmp_buf, void*, recvbuf_extent, mpi_errno, "tmp_buf");
        -:  449:            
        -:  450:        /* adjust for potential negative lower bound in datatype */
   588775:  451:        tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);
        -:  452:
        -:  453:        /* copy local data to the top of tmp_buf */ 
   588775:  454:        if (sendbuf != MPI_IN_PLACE) {
   582901:  455:            mpi_errno = MPIR_Localcopy (sendbuf, sendcount, sendtype,
        -:  456:                                        tmp_buf, recvcount, recvtype);
   582901:  457:	    if (mpi_errno) { 
    #####:  458:		MPIU_ERR_POP(mpi_errno);
        -:  459:	    }
        -:  460:        }
        -:  461:        else {
     5874:  462:            mpi_errno = MPIR_Localcopy (((char *)recvbuf +
        -:  463:                                         rank * recvcount * recvtype_extent), 
        -:  464:                                        recvcount, recvtype, tmp_buf, 
        -:  465:                                        recvcount, recvtype);
     5874:  466:	    if (mpi_errno) { 
    #####:  467:		MPIU_ERR_POP(mpi_errno);
        -:  468:	    }
        -:  469:        }
        -:  470:        
        -:  471:        /* do the first \floor(\lg p) steps */
        -:  472:
   588775:  473:        curr_cnt = recvcount;
   588775:  474:        pof2 = 1;
  1773971:  475:        while (pof2 <= comm_size/2) {
   596421:  476:            src = (rank + pof2) % comm_size;
   596421:  477:            dst = (rank - pof2 + comm_size) % comm_size;
        -:  478:            
   596421:  479:            mpi_errno = MPIC_Sendrecv(tmp_buf, curr_cnt, recvtype, dst,
        -:  480:                                      MPIR_ALLGATHER_TAG,
        -:  481:                                  ((char *)tmp_buf + curr_cnt*recvtype_extent),
        -:  482:                                      curr_cnt, recvtype,
        -:  483:                                      src, MPIR_ALLGATHER_TAG, comm,
        -:  484:                                      MPI_STATUS_IGNORE);
   596421:  485:	    if (mpi_errno) { 
    #####:  486:		MPIU_ERR_POP(mpi_errno);
        -:  487:	    }
        -:  488:
   596421:  489:            curr_cnt *= 2;
   596421:  490:            pof2 *= 2;
        -:  491:        }
        -:  492:
        -:  493:        /* if comm_size is not a power of two, one more step is needed */
        -:  494:
   588775:  495:        rem = comm_size - pof2;
   588775:  496:        if (rem) {
   588775:  497:            src = (rank + pof2) % comm_size;
   588775:  498:            dst = (rank - pof2 + comm_size) % comm_size;
        -:  499:            
   588775:  500:            mpi_errno = MPIC_Sendrecv(tmp_buf, rem * recvcount, recvtype,
        -:  501:                                      dst, MPIR_ALLGATHER_TAG,
        -:  502:                                  ((char *)tmp_buf + curr_cnt*recvtype_extent),
        -:  503:                                      rem * recvcount, recvtype,
        -:  504:                                      src, MPIR_ALLGATHER_TAG, comm,
        -:  505:                                      MPI_STATUS_IGNORE);
   588775:  506:	    if (mpi_errno) { 
    #####:  507:		MPIU_ERR_POP(mpi_errno);
        -:  508:	    }
        -:  509:        }
        -:  510:
        -:  511:        /* Rotate blocks in tmp_buf down by (rank) blocks and store
        -:  512:         * result in recvbuf. */
        -:  513:        
   588775:  514:        mpi_errno = MPIR_Localcopy(tmp_buf, (comm_size-rank)*recvcount,
        -:  515:                  recvtype, (char *) recvbuf + rank*recvcount*recvtype_extent, 
        -:  516:                                       (comm_size-rank)*recvcount, recvtype);
   588775:  517:	if (mpi_errno) { 
    #####:  518:	    MPIU_ERR_POP(mpi_errno);
        -:  519:	}
        -:  520:
   588775:  521:        if (rank) {
   393602:  522:            mpi_errno = MPIR_Localcopy((char *) tmp_buf + 
        -:  523:                                   (comm_size-rank)*recvcount*recvtype_extent, 
        -:  524:                                       rank*recvcount, recvtype, recvbuf,
        -:  525:                                       rank*recvcount, recvtype);
   393602:  526:	    if (mpi_errno) { 
    #####:  527:		MPIU_ERR_POP(mpi_errno);
        -:  528:	    }
        -:  529:        }
        -:  530:    }
        -:  531:
        -:  532:    else {  /* long message or medium-size message and non-power-of-two
        -:  533:             * no. of processes. use ring algorithm. */
        -:  534:      
        -:  535:        /* First, load the "local" version in the recvbuf. */
      418:  536:        if (sendbuf != MPI_IN_PLACE) {
      209:  537:            mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype, 
        -:  538:                                       ((char *)recvbuf +
        -:  539:                                        rank*recvcount*recvtype_extent),  
        -:  540:                                       recvcount, recvtype);
      209:  541:	    if (mpi_errno) { 
    #####:  542:		MPIU_ERR_POP(mpi_errno);
        -:  543:	    }
        -:  544:        }
        -:  545:        
        -:  546:        /* 
        -:  547:           Now, send left to right.  This fills in the receive area in 
        -:  548:           reverse order.
        -:  549:        */
      418:  550:        left  = (comm_size + rank - 1) % comm_size;
      418:  551:        right = (rank + 1) % comm_size;
        -:  552:        
      418:  553:        j     = rank;
      418:  554:        jnext = left;
     3674:  555:        for (i=1; i<comm_size; i++) {
     3256:  556:            mpi_errno = MPIC_Sendrecv(((char *)recvbuf +
        -:  557:                                       j*recvcount*recvtype_extent), 
        -:  558:                                      recvcount, recvtype, right,
        -:  559:                                      MPIR_ALLGATHER_TAG, 
        -:  560:                                      ((char *)recvbuf +
        -:  561:                                       jnext*recvcount*recvtype_extent), 
        -:  562:                                      recvcount, recvtype, left, 
        -:  563:                                      MPIR_ALLGATHER_TAG, comm,
        -:  564:                                      MPI_STATUS_IGNORE);
     3256:  565:	    if (mpi_errno) { 
    #####:  566:		MPIU_ERR_POP(mpi_errno);
        -:  567:	    }
     3256:  568:            j	    = jnext;
     3256:  569:            jnext = (comm_size + jnext - 1) % comm_size;
        -:  570:        }
        -:  571:    }
        -:  572:
        -:  573:    /* check if multiple threads are calling this collective function */
        -:  574: fn_exit:
   588775:  575:    MPIU_CHKLMEM_FREEALL();
        -:  576:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );    
  2570802:  577:    return (mpi_errno);
        -:  578:
        -:  579: fn_fail:
        -:  580:    goto fn_exit;
        -:  581:}
        -:  582:/* end:nested */
        -:  583:
        -:  584:#undef FUNCNAME
        -:  585:#define FUNCNAME MPIR_Allgather_inter
        -:  586:#undef FCNAME
        -:  587:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  588:/* begin:nested */
        -:  589:/* Intercommunicator allgather helper; not declared static because a machine-specific function may call this one 
        -:  590:   in some cases.  Returns mpi_errno, which starts as MPI_SUCCESS and is overwritten by the calls below. */
        -:  591:int MPIR_Allgather_inter ( 
        -:  592:    void *sendbuf, /* this process's contribution; handed to MPIR_Gather */
        -:  593:    int sendcount, /* element count for sendbuf; 0 skips the local gather and the outgoing broadcast */
        -:  594:    MPI_Datatype sendtype,
        -:  595:    void *recvbuf, /* filled by the incoming MPIR_Bcast_inter with recvcount*remote_size elements */
        -:  596:    int recvcount, /* 0 skips the incoming broadcast */
        -:  597:    MPI_Datatype recvtype, 
        -:  598:    MPID_Comm *comm_ptr ) /* fields read: local_size, remote_size, rank, local_comm, is_low_group */
      3145:  599:{
        -:  600:    /* Intercommunicator Allgather.
        -:  601:       Each group does a gather to local root with the local
        -:  602:       intracommunicator, and then does an intercommunicator broadcast
        -:  603:       in each direction (local rank 0 is the root of the outgoing one). */
        -:  604:
      3145:  605:    int rank, local_size, remote_size, mpi_errno = MPI_SUCCESS, root;
      3145:  606:    MPI_Aint true_extent, true_lb = 0, extent, send_extent;
      3145:  607:    void *tmp_buf=NULL; /* gather buffer; stays NULL except on local rank 0 with sendcount != 0 */
      3145:  608:    MPID_Comm *newcomm_ptr = NULL;
        -:  609:
      3145:  610:    MPIU_CHKLMEM_DECL(1); /* presumably reserves tracking for the one MPIU_CHKLMEM_MALLOC below -- TODO confirm */
        -:  611:
      3145:  612:    local_size = comm_ptr->local_size; 
      3145:  613:    remote_size = comm_ptr->remote_size;
      3145:  614:    rank = comm_ptr->rank;
        -:  615:
      3145:  616:    if ((rank == 0) && (sendcount != 0)) {
        -:  617:        /* In each group, rank 0 allocates temp. buffer for local
        -:  618:           gather */
       854:  619:        mpi_errno = NMPI_Type_get_true_extent(sendtype, &true_lb, &true_extent);
       854:  620:	if (mpi_errno) { 
    #####:  621:	    MPIU_ERR_POP(mpi_errno);
        -:  622:	}
       854:  623:        MPID_Datatype_get_extent_macro( sendtype, send_extent );
       854:  624:        extent = MPIR_MAX(send_extent, true_extent); /* per-element size: the larger of extent and true extent */
        -:  625:
        -:  626:	MPID_Ensure_Aint_fits_in_pointer(extent * sendcount * local_size);
       854:  627:        MPIU_CHKLMEM_MALLOC(tmp_buf, void*, extent*sendcount*local_size, mpi_errno, "tmp_buf");
        -:  628:
        -:  629:        /* adjust for potential negative lower bound in datatype */
       854:  630:        tmp_buf = (void *)((char*)tmp_buf - true_lb);
        -:  631:    }
        -:  632:
        -:  633:    /* Get the local intracommunicator, creating it on first use */
      3145:  634:    if (!comm_ptr->local_comm)
        90:  635:	MPIR_Setup_intercomm_localcomm( comm_ptr ); /* NOTE(review): any result of this call is discarded, unlike the other calls in this function -- confirm whether it can fail */
        -:  636:
      3145:  637:    newcomm_ptr = comm_ptr->local_comm;
        -:  638:
      3145:  639:    if (sendcount != 0) { /* gather to local rank 0; other ranks pass their NULL tmp_buf as the receive buffer */
      2649:  640:        mpi_errno = MPIR_Gather(sendbuf, sendcount, sendtype, tmp_buf, sendcount,
        -:  641:                                sendtype, 0, newcomm_ptr);
      2649:  642:	if (mpi_errno) { 
    #####:  643:	    MPIU_ERR_POP(mpi_errno);
        -:  644:	}
        -:  645:    }
        -:  646:
        -:  647:    /* first broadcast from left to right group, then from right to
        -:  648:       left group: the is_low_group side sends then receives, the other side receives then sends */
      3145:  649:    if (comm_ptr->is_low_group) {
        -:  650:        /* bcast to right*/
      1107:  651:        if (sendcount != 0) {
       611:  652:            root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; /* only local rank 0 acts as the sending root */
       611:  653:            mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
        -:  654:                                         sendtype, root, comm_ptr);
       611:  655:	    if (mpi_errno) { 
    #####:  656:		MPIU_ERR_POP(mpi_errno);
        -:  657:	    }
        -:  658:        }
        -:  659:
        -:  660:        /* receive bcast from right */
      1107:  661:        if (recvcount != 0) {
      1107:  662:            root = 0; /* presumably names rank 0 of the remote group, per the intercommunicator broadcast convention -- TODO confirm */
      1107:  663:            mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
        -:  664:                                         recvtype, root, comm_ptr);
      1107:  665:	    if (mpi_errno) { 
    #####:  666:		MPIU_ERR_POP(mpi_errno);
        -:  667:	    }
        -:  668:        }
        -:  669:    }
        -:  670:    else {
        -:  671:        /* receive bcast from left */
      2038:  672:        if (recvcount != 0) {
      1094:  673:            root = 0;
      1094:  674:            mpi_errno = MPIR_Bcast_inter(recvbuf, recvcount*remote_size,
        -:  675:                                         recvtype, root, comm_ptr);
      1094:  676:	    if (mpi_errno) { 
    #####:  677:		MPIU_ERR_POP(mpi_errno);
        -:  678:	    }
        -:  679:        }
        -:  680:
        -:  681:        /* bcast to left */
      2038:  682:        if (sendcount != 0) {
      2038:  683:            root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
      2038:  684:            mpi_errno = MPIR_Bcast_inter(tmp_buf, sendcount*local_size,
        -:  685:                                         sendtype, root, comm_ptr);
      2038:  686:	    if (mpi_errno) { 
    #####:  687:		MPIU_ERR_POP(mpi_errno);
        -:  688:	    }
        -:  689:        }
        -:  690:    }
        -:  691:
        -:  692:  fn_exit:    
       854:  693:    MPIU_CHKLMEM_FREEALL(); /* presumably releases tmp_buf when it was allocated -- TODO confirm against the macro */
      3145:  694:    return mpi_errno;
        -:  695:
        -:  696:  fn_fail:
        -:  697:    goto fn_exit; /* the error path shares the cleanup and return above */
        -:  698:}
        -:  699:/* end:nested */
        -:  700:#endif
        -:  701:
        -:  702:#undef FUNCNAME
        -:  703:#define FUNCNAME MPI_Allgather
        -:  704:#undef FCNAME
        -:  705:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  706:/*@
        -:  707:MPI_Allgather - Gathers data from all tasks and distributes the combined
        -:  708:    data to all tasks
        -:  709:
        -:  710:Input Parameters:
        -:  711:+ sendbuf - starting address of send buffer (choice) 
        -:  712:. sendcount - number of elements in send buffer (integer) 
        -:  713:. sendtype - data type of send buffer elements (handle) 
        -:  714:. recvcount - number of elements received from any process (integer) 
        -:  715:. recvtype - data type of receive buffer elements (handle) 
        -:  716:- comm - communicator (handle) 
        -:  717:
        -:  718:Output Parameter:
        -:  719:. recvbuf - address of receive buffer (choice) 
        -:  720:
        -:  721:Notes:
        -:  722: The MPI standard (1.0 and 1.1) says that 
        -:  723:.n
        -:  724:.n
        -:  725: The jth block of data sent from each process is received by every process 
        -:  726: and placed in the jth block of the buffer 'recvbuf'.  
        -:  727:.n
        -:  728:.n
        -:  729: This is misleading; a better description is
        -:  730:.n
        -:  731:.n
        -:  732: The block of data sent from the jth process is received by every
        -:  733: process and placed in the jth block of the buffer 'recvbuf'.
        -:  734:.n
        -:  735:.n
        -:  736: This text was suggested by Rajeev Thakur and has been adopted as a 
        -:  737: clarification by the MPI Forum.
        -:  738:
        -:  739:.N ThreadSafe
        -:  740:
        -:  741:.N Fortran
        -:  742:
        -:  743:.N Errors
        -:  744:.N MPI_ERR_COMM
        -:  745:.N MPI_ERR_COUNT
        -:  746:.N MPI_ERR_TYPE
        -:  747:.N MPI_ERR_BUFFER
        -:  748:@*/
        -:  749:int MPI_Allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype, 
        -:  750:                  void *recvbuf, int recvcount, MPI_Datatype recvtype, 
        -:  751:                  MPI_Comm comm) /* public entry point: validates arguments, then dispatches to a communicator-supplied or built-in allgather */
   2574179:  752:{
   2574179:  753:    int mpi_errno = MPI_SUCCESS;
   2574179:  754:    MPID_Comm *comm_ptr = NULL;
   2574179:  755:    MPIU_THREADPRIV_DECL;
        -:  756:    MPID_MPI_STATE_DECL(MPID_STATE_MPI_ALLGATHER);
        -:  757:
   2574179:  758:    MPIR_ERRTEST_INITIALIZED_ORDIE();
        -:  759:    
   2574179:  760:    MPIU_THREAD_CS_ENTER(ALLFUNC,);
        -:  761:    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_ALLGATHER);
        -:  762:
        -:  763:    /* Validate parameters, especially handles needing to be converted */
        -:  764:#   ifdef HAVE_ERROR_CHECKING
        -:  765:    {
        -:  766:        MPID_BEGIN_ERROR_CHECKS;
        -:  767:        {
   2574179:  768:	    MPIR_ERRTEST_COMM(comm, mpi_errno);
   2574179:  769:            if (mpi_errno != MPI_SUCCESS) goto fn_fail; /* comm_ptr is still NULL on this path */
        -:  770:	}
        -:  771:        MPID_END_ERROR_CHECKS;
        -:  772:    }
        -:  773:#   endif /* HAVE_ERROR_CHECKING */
        -:  774:
        -:  775:    /* Convert MPI object handles to object pointers */
   2574177:  776:    MPID_Comm_get_ptr( comm, comm_ptr );
        -:  777:
        -:  778:    /* Validate parameters and objects (post conversion) */
        -:  779:#   ifdef HAVE_ERROR_CHECKING
        -:  780:    {
        -:  781:        MPID_BEGIN_ERROR_CHECKS;
        -:  782:        {
   2574177:  783:            MPID_Datatype *recvtype_ptr=NULL, *sendtype_ptr=NULL;
        -:  784:
   2574177:  785:            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
   2574177:  786:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  787:
   2574171:  788:	    if (comm_ptr->comm_kind == MPID_INTERCOMM) /* the sendbuf in-place test is applied to intercommunicators only */
      3145:  789:                MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, sendcount, mpi_errno);
   2574171:  790:            if (sendbuf != MPI_IN_PLACE) /* send-side count, datatype and buffer are checked only when not in place */
        -:  791:	    {
   2139817:  792:                MPIR_ERRTEST_COUNT(sendcount, mpi_errno);
   2139817:  793:                MPIR_ERRTEST_DATATYPE(sendtype, "sendtype", mpi_errno);
   2139817:  794:                if (HANDLE_GET_KIND(sendtype) != HANDLE_KIND_BUILTIN) /* non-builtin types get pointer-validity and committed checks */
        -:  795:		{
       362:  796:                    MPID_Datatype_get_ptr(sendtype, sendtype_ptr);
       362:  797:                    MPID_Datatype_valid_ptr( sendtype_ptr, mpi_errno );
       362:  798:                    MPID_Datatype_committed_ptr( sendtype_ptr, mpi_errno );
        -:  799:                }
   2139817:  800:                MPIR_ERRTEST_USERBUFFER(sendbuf,sendcount,sendtype,mpi_errno);
        -:  801:            }
        -:  802:
   2574171:  803:            MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, recvcount, mpi_errno);
   2574171:  804:	    MPIR_ERRTEST_COUNT(recvcount, mpi_errno);
   2574171:  805:	    MPIR_ERRTEST_DATATYPE(recvtype, "recvtype", mpi_errno);
   2574171:  806:            if (HANDLE_GET_KIND(recvtype) != HANDLE_KIND_BUILTIN)
        -:  807:	    {
       362:  808:                MPID_Datatype_get_ptr(recvtype, recvtype_ptr);
       362:  809:                MPID_Datatype_valid_ptr( recvtype_ptr, mpi_errno );
       362:  810:                MPID_Datatype_committed_ptr( recvtype_ptr, mpi_errno );
        -:  811:            }
   2574171:  812:	    MPIR_ERRTEST_USERBUFFER(recvbuf,recvcount,recvtype,mpi_errno);
        -:  813:
   2574171:  814:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  815:        }
        -:  816:        MPID_END_ERROR_CHECKS;
        -:  817:    }
        -:  818:#   endif /* HAVE_ERROR_CHECKING */
        -:  819:
        -:  820:    /* ... body of routine ...  */
        -:  821:
   2574167:  822:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Allgather != NULL) /* a communicator-supplied Allgather takes precedence over the built-in routines */
        -:  823:    {
    #####:  824:	mpi_errno = comm_ptr->coll_fns->Allgather(sendbuf, sendcount,
        -:  825:                                                  sendtype, recvbuf, recvcount,
        -:  826:                                                  recvtype, comm_ptr);
        -:  827:    }
        -:  828:    else
        -:  829:    {
   2574167:  830:	MPIU_THREADPRIV_GET;
        -:  831:
   2574167:  832:	MPIR_Nest_incr(); /* paired with MPIR_Nest_decr() after the internal collective returns */
   2574167:  833:        if (comm_ptr->comm_kind == MPID_INTRACOMM) 
        -:  834:            /* intracommunicator */
   2571022:  835:            mpi_errno = MPIR_Allgather(sendbuf, sendcount, sendtype,
        -:  836:                                       recvbuf, recvcount, recvtype,
        -:  837:                                       comm_ptr);
        -:  838:        else {
        -:  839:            /* intercommunicator */
      3145:  840:            mpi_errno = MPIR_Allgather_inter(sendbuf, sendcount, sendtype,
        -:  841:                                             recvbuf, recvcount, recvtype,
        -:  842:                                             comm_ptr);            
        -:  843:        }
   2574167:  844:	MPIR_Nest_decr();
        -:  845:    }
        -:  846:
   2574167:  847:    if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  848:
        -:  849:    /* ... end of body of routine ... */
        -:  850:    
   2574179:  851:  fn_exit:
        -:  852:    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_ALLGATHER);
   2574179:  853:    MPIU_THREAD_CS_EXIT(ALLFUNC,); /* pairs with the MPIU_THREAD_CS_ENTER at the top; reached on success and failure alike */
   2574179:  854:    return mpi_errno;
        -:  855:
        -:  856:  fn_fail:
        -:  857:    /* --BEGIN ERROR HANDLING-- */
        -:  858:#   ifdef HAVE_ERROR_CHECKING
        -:  859:    {
        12:  860:	mpi_errno = MPIR_Err_create_code(
        -:  861:	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_allgather",
        -:  862:	    "**mpi_allgather %p %d %D %p %d %D %C", sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm);
        -:  863:    }
        -:  864:#   endif
        12:  865:    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno ); /* comm_ptr is NULL here when the initial handle check failed */
        12:  866:    goto fn_exit; /* leave through the common exit path */
        -:  867:    /* --END ERROR HANDLING-- */
        -:  868:}