-:    0:Source:/home/MPI/testing/mpich2/mpich2/src/mpi/coll/alltoall.c
        -:    0:Graph:alltoall.gcno
        -:    0:Data:alltoall.gcda
        -:    0:Runs:681
        -:    0:Programs:186
        -:    1:/* -*- Mode: C; c-basic-offset:4 ; -*- */
        -:    2:/*
        -:    3: *
        -:    4: *  (C) 2001 by Argonne National Laboratory.
        -:    5: *      See COPYRIGHT in top-level directory.
        -:    6: */
        -:    7:
        -:    8:#include "mpiimpl.h"
        -:    9:
        -:   10:/* -- Begin Profiling Symbol Block for routine MPI_Alltoall */
        -:   11:#if defined(HAVE_PRAGMA_WEAK)
        -:   12:#pragma weak MPI_Alltoall = PMPI_Alltoall
        -:   13:#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
        -:   14:#pragma _HP_SECONDARY_DEF PMPI_Alltoall  MPI_Alltoall
        -:   15:#elif defined(HAVE_PRAGMA_CRI_DUP)
        -:   16:#pragma _CRI duplicate MPI_Alltoall as PMPI_Alltoall
        -:   17:#endif
        -:   18:/* -- End Profiling Symbol Block */
        -:   19:
        -:   20:/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
        -:   21:   the MPI routines */
        -:   22:#ifndef MPICH_MPI_FROM_PMPI
        -:   23:#undef MPI_Alltoall
        -:   24:#define MPI_Alltoall PMPI_Alltoall
        -:   25:
        -:   26:/* This is the default implementation of alltoall. The algorithm is:
        -:   27:   
        -:   28:   Algorithm: MPI_Alltoall
        -:   29:
        -:   30:   We use four algorithms for alltoall. For short messages and
        -:   31:   (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
        -:   32:   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
        -:   33:   takes lgp steps. Because of the extra communication, the bandwidth
        -:   34:   requirement is (n/2).lgp.beta.
        -:   35:
        -:   36:   Cost = lgp.alpha + (n/2).lgp.beta
        -:   37:
        -:   38:   where n is the total amount of data a process needs to send to all
        -:   39:   other processes.
        -:   40:
        -:   41:   For medium size messages and (short messages for comm_size < 8), we
        -:   42:   use an algorithm that posts all irecvs and isends and then does a
        -:   43:   waitall. We scatter the order of sources and destinations among the
        -:   44:   processes, so that all processes don't try to send/recv to/from the
        -:   45:   same process at the same time.
        -:   46:
        -:   47:   *** Modification: We post only a small number of isends and irecvs 
        -:   48:   at a time and wait on them as suggested by Tony Ladd. ***
        -:   49:
        -:   50:   For long messages and power-of-two number of processes, we use a
        -:   51:   pairwise exchange algorithm, which takes p-1 steps. We
        -:   52:   calculate the pairs by using an exclusive-or algorithm:
        -:   53:           for (i=1; i<comm_size; i++)
        -:   54:               dest = rank ^ i;
        -:   55:   This algorithm doesn't work if the number of processes is not a power of
        -:   56:   two. For a non-power-of-two number of processes, we use an
        -:   57:   algorithm in which, in step i, each process  receives from (rank-i)
        -:   58:   and sends to (rank+i). 
        -:   59:
        -:   60:   Cost = (p-1).alpha + n.beta
        -:   61:
        -:   62:   where n is the total amount of data a process needs to send to all
        -:   63:   other processes.
        -:   64:
        -:   65:   Possible improvements: 
        -:   66:
        -:   67:   End Algorithm: MPI_Alltoall
        -:   68:*/
        -:   69:
        -:   70:/* begin:nested */
        -:   71:/* not declared static because a machine-specific function may call this one in some cases */
        -:   72:int MPIR_Alltoall( 
        -:   73:    void *sendbuf, 
        -:   74:    int sendcount, 
        -:   75:    MPI_Datatype sendtype, 
        -:   76:    void *recvbuf, 
        -:   77:    int recvcount, 
        -:   78:    MPI_Datatype recvtype, 
        -:   79:    MPID_Comm *comm_ptr )
     4460:   80:{
        -:   81:    static const char FCNAME[] = "MPIR_Alltoall";
        -:   82:    int          comm_size, i, j, pof2;
        -:   83:    MPI_Aint     sendtype_extent, recvtype_extent;
        -:   84:    MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
     4460:   85:    int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes;
        -:   86:    MPI_Status status;
        -:   87:    int sendtype_size, pack_size, block, position, *displs, count;
        -:   88:    MPI_Datatype newtype;
        -:   89:    void *tmp_buf;
        -:   90:    MPI_Comm comm;
        -:   91:    MPI_Request *reqarray;
        -:   92:    MPI_Status *starray;
        -:   93:#ifdef MPIR_OLD_SHORT_ALLTOALL_ALG
        -:   94:    MPI_Aint sendtype_true_extent, sendbuf_extent, sendtype_true_lb;
        -:   95:    int k, p, curr_cnt, dst_tree_root, my_tree_root;
        -:   96:    int last_recv_cnt, mask, tmp_mask, tree_root, nprocs_completed;
        -:   97:#endif
        -:   98:
     4460:   99:    if (recvcount == 0) return MPI_SUCCESS;
        -:  100:
     4250:  101:    comm = comm_ptr->handle;
     4250:  102:    comm_size = comm_ptr->local_size;
     4250:  103:    rank = comm_ptr->rank;
        -:  104:
        -:  105:    /* Get extent of send and recv types */
     4250:  106:    MPID_Datatype_get_extent_macro(recvtype, recvtype_extent);
     4250:  107:    MPID_Datatype_get_extent_macro(sendtype, sendtype_extent);
        -:  108:
     4250:  109:    MPID_Datatype_get_size_macro(sendtype, sendtype_size);
     4250:  110:    nbytes = sendtype_size * sendcount;
        -:  111:
        -:  112:    /* check if multiple threads are calling this collective function */
        -:  113:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
        -:  114:
     4250:  115:    if (sendbuf == MPI_IN_PLACE) {
        -:  116:        /* We use pair-wise sendrecv_replace in order to conserve memory usage,
        -:  117:         * which is keeping with the spirit of the MPI-2.2 Standard.  But
        -:  118:         * because of this approach all processes must agree on the global
        -:  119:         * schedule of sendrecv_replace operations to avoid deadlock.
        -:  120:         *
        -:  121:         * Note that this is not an especially efficient algorithm in terms of
        -:  122:         * time and there will be multiple repeated malloc/free's rather than
        -:  123:         * maintaining a single buffer across the whole loop.  Something like
        -:  124:         * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */
     6144:  125:        for (i = 0; i < comm_size; ++i) {
        -:  126:            /* start inner loop at i to avoid re-exchanging data */
    26800:  127:            for (j = i; j < comm_size; ++j) {
    21456:  128:                if (rank == i) {
        -:  129:                    /* also covers the (rank == i && rank == j) case */
     3072:  130:                    mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + j*recvcount*recvtype_extent),
        -:  131:                                                      recvcount, recvtype,
        -:  132:                                                      j, MPIR_ALLTOALL_TAG,
        -:  133:                                                      j, MPIR_ALLTOALL_TAG,
        -:  134:                                                      comm, &status);
     3072:  135:                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  136:                }
    18384:  137:                else if (rank == j) {
        -:  138:                    /* same as above with i/j args reversed */
     2272:  139:                    mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + i*recvcount*recvtype_extent),
        -:  140:                                                      recvcount, recvtype,
        -:  141:                                                      i, MPIR_ALLTOALL_TAG,
        -:  142:                                                      i, MPIR_ALLTOALL_TAG,
        -:  143:                                                      comm, &status);
     2272:  144:                    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  145:                }
        -:  146:            }
        -:  147:        }
        -:  148:    }
     3450:  149:    else if ((nbytes <= MPIR_ALLTOALL_SHORT_MSG) && (comm_size >= 8)) {
        -:  150:
        -:  151:        /* use the indexing algorithm by Jehoshua Bruck et al,
        -:  152:         * IEEE TPDS, Nov. 97 */ 
        -:  153:
        -:  154:        /* allocate temporary buffer */
      168:  155:        NMPI_Pack_size(recvcount*comm_size, recvtype, comm, &pack_size);
      168:  156:        tmp_buf = MPIU_Malloc(pack_size);
        -:  157:	/* --BEGIN ERROR HANDLING-- */
      168:  158:        if (!tmp_buf) {
    #####:  159:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
    #####:  160:            return mpi_errno;
        -:  161:        }
        -:  162:	/* --END ERROR HANDLING-- */
        -:  163:
        -:  164:        /* Do Phase 1 of the algorithim. Shift the data blocks on process i
        -:  165:         * upwards by a distance of i blocks. Store the result in recvbuf. */
      168:  166:        mpi_errno = MPIR_Localcopy((char *) sendbuf + 
        -:  167:			   rank*sendcount*sendtype_extent, 
        -:  168:                           (comm_size - rank)*sendcount, sendtype, recvbuf, 
        -:  169:                           (comm_size - rank)*recvcount, recvtype);
      168:  170:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
      168:  171:        mpi_errno = MPIR_Localcopy(sendbuf, rank*sendcount, sendtype, 
        -:  172:                        (char *) recvbuf + 
        -:  173:				   (comm_size-rank)*recvcount*recvtype_extent, 
        -:  174:                                   rank*recvcount, recvtype);
      168:  175:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  176:        /* Input data is now stored in recvbuf with datatype recvtype */
        -:  177:
        -:  178:        /* Now do Phase 2, the communication phase. It takes
        -:  179:           ceiling(lg p) steps. In each step i, each process sends to rank+2^i
        -:  180:           and receives from rank-2^i, and exchanges all data blocks
        -:  181:           whose ith bit is 1. */
        -:  182:
        -:  183:        /* allocate displacements array for indexed datatype used in
        -:  184:           communication */
        -:  185:
      168:  186:        displs = MPIU_Malloc(comm_size * sizeof(int));
        -:  187:	/* --BEGIN ERROR HANDLING-- */
      168:  188:        if (!displs) {
    #####:  189:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
    #####:  190:            return mpi_errno;
        -:  191:        }
        -:  192:	/* --END ERROR HANDLING-- */
        -:  193:
      168:  194:        pof2 = 1;
      840:  195:        while (pof2 < comm_size) {
      504:  196:            dst = (rank + pof2) % comm_size;
      504:  197:            src = (rank - pof2 + comm_size) % comm_size;
        -:  198:
        -:  199:            /* Exchange all data blocks whose ith bit is 1 */
        -:  200:            /* Create an indexed datatype for the purpose */
        -:  201:
      504:  202:            count = 0;
     4032:  203:            for (block=1; block<comm_size; block++) {
     3528:  204:                if (block & pof2) {
     2016:  205:                    displs[count] = block * recvcount;
     2016:  206:                    count++;
        -:  207:                }
        -:  208:            }
        -:  209:
      504:  210:            mpi_errno = NMPI_Type_create_indexed_block(count, recvcount, 
        -:  211:                                               displs, recvtype, &newtype);
      504:  212:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  213:
      504:  214:            mpi_errno = NMPI_Type_commit(&newtype);
      504:  215:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  216:
      504:  217:            position = 0;
      504:  218:            mpi_errno = NMPI_Pack(recvbuf, 1, newtype, tmp_buf, pack_size, 
        -:  219:                                  &position, comm);
        -:  220:
      504:  221:            mpi_errno = MPIC_Sendrecv(tmp_buf, position, MPI_PACKED, dst,
        -:  222:                                      MPIR_ALLTOALL_TAG, recvbuf, 1, newtype,
        -:  223:                                      src, MPIR_ALLTOALL_TAG, comm,
        -:  224:                                      MPI_STATUS_IGNORE);
      504:  225:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  226:
      504:  227:            mpi_errno = NMPI_Type_free(&newtype);
      504:  228:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  229:
      504:  230:            pof2 *= 2;
        -:  231:        }
        -:  232:
      168:  233:        MPIU_Free(displs);
      168:  234:        MPIU_Free(tmp_buf);
        -:  235:
        -:  236:        /* Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need
        -:  237:         * a temporary buffer of the same size as recvbuf. */
        -:  238:        
        -:  239:        /* get true extent of recvtype */
      168:  240:        mpi_errno = NMPI_Type_get_true_extent(recvtype, &recvtype_true_lb,
        -:  241:                                              &recvtype_true_extent);  
      168:  242:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  243:
      168:  244:        recvbuf_extent = recvcount * comm_size *
        -:  245:            (MPIR_MAX(recvtype_true_extent, recvtype_extent));
      168:  246:        tmp_buf = MPIU_Malloc(recvbuf_extent);
        -:  247:	/* --BEGIN ERROR HANDLING-- */
      168:  248:        if (!tmp_buf) {
    #####:  249:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
    #####:  250:            return mpi_errno;
        -:  251:        }
        -:  252:	/* --END ERROR HANDLING-- */
        -:  253:        /* adjust for potential negative lower bound in datatype */
      168:  254:        tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);
        -:  255:
      168:  256:        mpi_errno = MPIR_Localcopy((char *) recvbuf + (rank+1)*recvcount*recvtype_extent, 
        -:  257:                       (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, 
        -:  258:                       (comm_size - rank - 1)*recvcount, recvtype);
      168:  259:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
      168:  260:        mpi_errno = MPIR_Localcopy(recvbuf, (rank+1)*recvcount, recvtype, 
        -:  261:                       (char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent, 
        -:  262:                       (rank+1)*recvcount, recvtype);
      168:  263:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  264:
        -:  265:        /* Blocks are in the reverse order now (comm_size-1 to 0). 
        -:  266:         * Reorder them to (0 to comm_size-1) and store them in recvbuf. */
        -:  267:
     1512:  268:        for (i=0; i<comm_size; i++) 
     1344:  269:            MPIR_Localcopy((char *) tmp_buf + i*recvcount*recvtype_extent,
        -:  270:                           recvcount, recvtype, 
        -:  271:                           (char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent, 
        -:  272:                           recvcount, recvtype); 
        -:  273:
      168:  274:        MPIU_Free((char*)tmp_buf + recvtype_true_lb);
        -:  275:
        -:  276:
        -:  277:
        -:  278:#ifdef MPIR_OLD_SHORT_ALLTOALL_ALG
        -:  279:        /* Short message. Use recursive doubling. Each process sends all
        -:  280:           its data at each step along with all data it received in
        -:  281:           previous steps. */
        -:  282:        
        -:  283:        /* need to allocate temporary buffer of size
        -:  284:           sendbuf_extent*comm_size */
        -:  285:        
        -:  286:        /* get true extent of sendtype */
        -:  287:        mpi_errno = NMPI_Type_get_true_extent(sendtype, &sendtype_true_lb,
        -:  288:                                              &sendtype_true_extent);  
        -:  289:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  290:
        -:  291:        sendbuf_extent = sendcount * comm_size *
        -:  292:            (MPIR_MAX(sendtype_true_extent, sendtype_extent));
        -:  293:        tmp_buf = MPIU_Malloc(sendbuf_extent*comm_size);
        -:  294:	/* --BEGIN ERROR HANDLING-- */
        -:  295:        if (!tmp_buf) {
        -:  296:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
        -:  297:            return mpi_errno;
        -:  298:        }
        -:  299:	/* --END ERROR HANDLING-- */
        -:  300:        
        -:  301:        /* adjust for potential negative lower bound in datatype */
        -:  302:        tmp_buf = (void *)((char*)tmp_buf - sendtype_true_lb);
        -:  303:        
        -:  304:        /* copy local sendbuf into tmp_buf at location indexed by rank */
        -:  305:        curr_cnt = sendcount*comm_size;
        -:  306:        mpi_errno = MPIR_Localcopy(sendbuf, curr_cnt, sendtype,
        -:  307:                                   ((char *)tmp_buf + rank*sendbuf_extent),
        -:  308:                                   curr_cnt, sendtype);
        -:  309:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno);}
        -:  310:        
        -:  311:        mask = 0x1;
        -:  312:        i = 0;
        -:  313:        while (mask < comm_size) {
        -:  314:            dst = rank ^ mask;
        -:  315:            
        -:  316:            dst_tree_root = dst >> i;
        -:  317:            dst_tree_root <<= i;
        -:  318:            
        -:  319:            my_tree_root = rank >> i;
        -:  320:            my_tree_root <<= i;
        -:  321:            
        -:  322:            if (dst < comm_size) {
        -:  323:                mpi_errno = MPIC_Sendrecv(((char *)tmp_buf +
        -:  324:                                           my_tree_root*sendbuf_extent),
        -:  325:                                          curr_cnt, sendtype,
        -:  326:                                          dst, MPIR_ALLTOALL_TAG, 
        -:  327:                                          ((char *)tmp_buf +
        -:  328:                                           dst_tree_root*sendbuf_extent),
        -:  329:					  sendbuf_extent*(comm_size-dst_tree_root),
        -:  330:                                          sendtype, dst, MPIR_ALLTOALL_TAG, 
        -:  331:                                          comm, &status);
        -:  332:		if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  333:                
        -:  334:                /* in case of non-power-of-two nodes, less data may be
        -:  335:                   received than specified */
        -:  336:                NMPI_Get_count(&status, sendtype, &last_recv_cnt);
        -:  337:                curr_cnt += last_recv_cnt;
        -:  338:            }
        -:  339:            
        -:  340:            /* if some processes in this process's subtree in this step
        -:  341:               did not have any destination process to communicate with
        -:  342:               because of non-power-of-two, we need to send them the
        -:  343:               result. We use a logarithmic recursive-halfing algorithm
        -:  344:               for this. */
        -:  345:            
        -:  346:            if (dst_tree_root + mask > comm_size) {
        -:  347:                nprocs_completed = comm_size - my_tree_root - mask;
        -:  348:                /* nprocs_completed is the number of processes in this
        -:  349:                   subtree that have all the data. Send data to others
        -:  350:                   in a tree fashion. First find root of current tree
        -:  351:                   that is being divided into two. k is the number of
        -:  352:                   least-significant bits in this process's rank that
        -:  353:                   must be zeroed out to find the rank of the root */ 
        -:  354:                j = mask;
        -:  355:                k = 0;
        -:  356:                while (j) {
        -:  357:                    j >>= 1;
        -:  358:                    k++;
        -:  359:                }
        -:  360:                k--;
        -:  361:                
        -:  362:                tmp_mask = mask >> 1;
        -:  363:                while (tmp_mask) {
        -:  364:                    dst = rank ^ tmp_mask;
        -:  365:                    
        -:  366:                    tree_root = rank >> k;
        -:  367:                    tree_root <<= k;
        -:  368:                    
        -:  369:                    /* send only if this proc has data and destination
        -:  370:                       doesn't have data. at any step, multiple processes
        -:  371:                       can send if they have the data */
        -:  372:                    if ((dst > rank) && 
        -:  373:                        (rank < tree_root + nprocs_completed)
        -:  374:                        && (dst >= tree_root + nprocs_completed)) {
        -:  375:                        /* send the data received in this step above */
        -:  376:                        mpi_errno = MPIC_Send(((char *)tmp_buf +
        -:  377:                                               dst_tree_root*sendbuf_extent),
        -:  378:                                              last_recv_cnt, sendtype,
        -:  379:                                              dst, MPIR_ALLTOALL_TAG,
        -:  380:                                              comm);  
        -:  381:			if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  382:                    }
        -:  383:                    /* recv only if this proc. doesn't have data and sender
        -:  384:                       has data */
        -:  385:                    else if ((dst < rank) && 
        -:  386:                             (dst < tree_root + nprocs_completed) &&
        -:  387:                             (rank >= tree_root + nprocs_completed)) {
        -:  388:                        mpi_errno = MPIC_Recv(((char *)tmp_buf +
        -:  389:                                               dst_tree_root*sendbuf_extent),
        -:  390:					      sendbuf_extent*(comm_size-dst_tree_root),
        -:  391:                                              sendtype,   
        -:  392:                                              dst, MPIR_ALLTOALL_TAG,
        -:  393:                                              comm, &status); 
        -:  394:			if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  395:                        NMPI_Get_count(&status, sendtype, &last_recv_cnt);
        -:  396:                        curr_cnt += last_recv_cnt;
        -:  397:                    }
        -:  398:                    tmp_mask >>= 1;
        -:  399:                    k--;
        -:  400:                }
        -:  401:            }
        -:  402:            
        -:  403:            mask <<= 1;
        -:  404:            i++;
        -:  405:        }
        -:  406:        
        -:  407:        /* now copy everyone's contribution from tmp_buf to recvbuf */
        -:  408:        for (p=0; p<comm_size; p++) {
        -:  409:            mpi_errno = MPIR_Localcopy(((char *)tmp_buf +
        -:  410:                                        p*sendbuf_extent +
        -:  411:                                        rank*sendcount*sendtype_extent),
        -:  412:                                        sendcount, sendtype, 
        -:  413:                                        ((char*)recvbuf +
        -:  414:                                         p*recvcount*recvtype_extent), 
        -:  415:                                        recvcount, recvtype);
        -:  416:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  417:        }
        -:  418:        
        -:  419:        MPIU_Free((char *)tmp_buf+sendtype_true_lb); 
        -:  420:#endif
        -:  421:
        -:  422:    }
        -:  423:
     3282:  424:    else if (nbytes <= MPIR_ALLTOALL_MEDIUM_MSG) {  
        -:  425:        /* Medium-size message. Use isend/irecv with scattered
        -:  426:           destinations. Use Tony Ladd's modification to post only
        -:  427:           a small number of isends/irecvs at a time. */
        -:  428:        int ii, ss, bblock;
        -:  429:
     3182:  430:        bblock = MPIR_ALLTOALL_THROTTLE;
     3182:  431:        if (bblock == 0) bblock = comm_size;
        -:  432:
     3182:  433:        reqarray = (MPI_Request *) MPIU_Malloc(2*bblock*sizeof(MPI_Request));
        -:  434:        /* --BEGIN ERROR HANDLING-- */
     3182:  435:        if (!reqarray) {
    #####:  436:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
    #####:  437:            return mpi_errno;
        -:  438:        }
        -:  439:        /* --END ERROR HANDLING-- */
        -:  440:
     3182:  441:        starray = (MPI_Status *) MPIU_Malloc(2*bblock*sizeof(MPI_Status));
        -:  442:        /* --BEGIN ERROR HANDLING-- */
     3182:  443:        if (!starray) {
    #####:  444:            mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
    #####:  445:            return mpi_errno;
        -:  446:        }
        -:  447:        /* --END ERROR HANDLING-- */
        -:  448:
     6790:  449:        for (ii=0; ii<comm_size; ii+=bblock) {
     3608:  450:            ss = comm_size-ii < bblock ? comm_size-ii : bblock;
        -:  451:            /* do the communication -- post ss sends and receives: */
    11948:  452:            for ( i=0; i<ss; i++ ) { 
     8340:  453:                dst = (rank+i+ii) % comm_size;
     8340:  454:                mpi_errno = MPIC_Irecv((char *)recvbuf +
        -:  455:                                       dst*recvcount*recvtype_extent, 
        -:  456:                                       recvcount, recvtype, dst,
        -:  457:                                       MPIR_ALLTOALL_TAG, comm,
        -:  458:                                       &reqarray[i]);
     8340:  459:                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  460:            }
        -:  461:
    11948:  462:            for ( i=0; i<ss; i++ ) { 
     8340:  463:                dst = (rank-i-ii+comm_size) % comm_size;
     8340:  464:                mpi_errno = MPIC_Isend((char *)sendbuf +
        -:  465:                                       dst*sendcount*sendtype_extent, 
        -:  466:                                       sendcount, sendtype, dst,
        -:  467:                                       MPIR_ALLTOALL_TAG, comm,
        -:  468:                                       &reqarray[i+ss]);
     8340:  469:                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  470:            }
        -:  471:  
        -:  472:            /* ... then wait for them to finish: */
     3608:  473:            mpi_errno = NMPI_Waitall(2*ss,reqarray,starray);
        -:  474:            /* --BEGIN ERROR HANDLING-- */
     3608:  475:            if (mpi_errno == MPI_ERR_IN_STATUS) {
    #####:  476:                for (j=0; j<2*ss; j++) {
    #####:  477:                    if (starray[j].MPI_ERROR != MPI_SUCCESS) 
    #####:  478:                        mpi_errno = starray[j].MPI_ERROR;
        -:  479:                }
        -:  480:            }
        -:  481:            /* --END ERROR HANDLING-- */
        -:  482:        }
     3182:  483:        MPIU_Free(starray);
     3182:  484:        MPIU_Free(reqarray);
        -:  485:    }
        -:  486:
        -:  487:    else {
        -:  488:        /* Long message. If comm_size is a power-of-two, do a pairwise
        -:  489:           exchange using exclusive-or to create pairs. Else send to
        -:  490:           rank+i, receive from rank-i. */
        -:  491:        
        -:  492:        /* Make local copy first */
      100:  493:        mpi_errno = MPIR_Localcopy(((char *)sendbuf + 
        -:  494:                                    rank*sendcount*sendtype_extent), 
        -:  495:                                   sendcount, sendtype, 
        -:  496:                                   ((char *)recvbuf +
        -:  497:                                    rank*recvcount*recvtype_extent),
        -:  498:                                   recvcount, recvtype);
      100:  499:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno ); }
        -:  500:
        -:  501:        /* Is comm_size a power-of-two? */
      100:  502:        i = 1;
      484:  503:        while (i < comm_size)
      284:  504:            i *= 2;
      100:  505:        if (i == comm_size)
       64:  506:            pof2 = 1;
        -:  507:        else 
       36:  508:            pof2 = 0;
        -:  509:
        -:  510:        /* Do the pairwise exchanges */
      668:  511:        for (i=1; i<comm_size; i++) {
      568:  512:            if (pof2 == 1) {
        -:  513:                /* use exclusive-or algorithm */
      384:  514:                src = dst = rank ^ i;
        -:  515:            }
        -:  516:            else {
      184:  517:                src = (rank - i + comm_size) % comm_size;
      184:  518:                dst = (rank + i) % comm_size;
        -:  519:            }
        -:  520:
      568:  521:            mpi_errno = MPIC_Sendrecv(((char *)sendbuf +
        -:  522:                                       dst*sendcount*sendtype_extent), 
        -:  523:                                      sendcount, sendtype, dst,
        -:  524:                                      MPIR_ALLTOALL_TAG, 
        -:  525:                                      ((char *)recvbuf +
        -:  526:                                       src*recvcount*recvtype_extent),
        -:  527:                                      recvcount, recvtype, src,
        -:  528:                                      MPIR_ALLTOALL_TAG, comm, &status);
      568:  529:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  530:        }
        -:  531:    }
        -:  532:
     4250:  533: fn_fail:    
        -:  534:    /* check if multiple threads are calling this collective function */
        -:  535:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
        -:  536:    
     4250:  537:    return (mpi_errno);
        -:  538:}
        -:  539:/* end:nested */
        -:  540:
        -:  541:/* begin:nested */
        -:  542:/* not declared static because a machine-specific function may call this one in some cases */
        -:  543:int MPIR_Alltoall_inter( 
        -:  544:    void *sendbuf, 
        -:  545:    int sendcount, 
        -:  546:    MPI_Datatype sendtype, 
        -:  547:    void *recvbuf, 
        -:  548:    int recvcount, 
        -:  549:    MPI_Datatype recvtype, 
        -:  550:    MPID_Comm *comm_ptr )
     1530:  551:{
        -:  552:/* Intercommunicator alltoall. We use a pairwise exchange algorithm
        -:  553:   similar to the one used in intracommunicator alltoall for long
        -:  554:   messages. Since the local and remote groups can be of different
        -:  555:   sizes, we first compute the max of local_group_size,
        -:  556:   remote_group_size. At step i, 0 <= i < max_size, each process
        -:  557:   receives from src = (rank - i + max_size) % max_size if src <
        -:  558:   remote_size, and sends to dst = (rank + i) % max_size if dst <
        -:  559:   remote_size. 
        -:  560:*/
        -:  561:    static const char FCNAME[] = "MPIR_Alltoall_inter";
        -:  562:    int          local_size, remote_size, max_size, i;
        -:  563:    MPI_Aint     sendtype_extent, recvtype_extent;
     1530:  564:    int          mpi_errno = MPI_SUCCESS;
        -:  565:    MPI_Status status;
        -:  566:    int src, dst, rank;
        -:  567:    char *sendaddr, *recvaddr;
        -:  568:    MPI_Comm comm;
        -:  569:    
     1530:  570:    local_size = comm_ptr->local_size; 
     1530:  571:    remote_size = comm_ptr->remote_size;
     1530:  572:    rank = comm_ptr->rank;
     1530:  573:    comm = comm_ptr->handle;
        -:  574:
        -:  575:    /* Get extent of send and recv types */
     1530:  576:    MPID_Datatype_get_extent_macro(sendtype, sendtype_extent);
     1530:  577:    MPID_Datatype_get_extent_macro(recvtype, recvtype_extent);
        -:  578:    
        -:  579:    /* check if multiple threads are calling this collective function */
        -:  580:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
        -:  581:
        -:  582:    /* Do the pairwise exchanges */
     1530:  583:    max_size = MPIR_MAX(local_size, remote_size);
        -:  584:    MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
        -:  585:				     max_size*recvcount*recvtype_extent);
        -:  586:    MPID_Ensure_Aint_fits_in_pointer(MPI_VOID_PTR_CAST_TO_MPI_AINT sendbuf +
        -:  587:				     max_size*sendcount*sendtype_extent);
     7344:  588:    for (i=0; i<max_size; i++) {
     5814:  589:        src = (rank - i + max_size) % max_size;
     5814:  590:        dst = (rank + i) % max_size;
     5814:  591:        if (src >= remote_size) {
     1972:  592:            src = MPI_PROC_NULL;
     1972:  593:            recvaddr = NULL;
        -:  594:        }
        -:  595:        else {
     3842:  596:            recvaddr = (char *)recvbuf + src*recvcount*recvtype_extent;
        -:  597:        }
     5814:  598:        if (dst >= remote_size) {
     1972:  599:            dst = MPI_PROC_NULL;
     1972:  600:            sendaddr = NULL;
        -:  601:        }
        -:  602:        else {
     3842:  603:            sendaddr = (char *)sendbuf + dst*sendcount*sendtype_extent;
        -:  604:        }
        -:  605:
     5814:  606:        mpi_errno = MPIC_Sendrecv(sendaddr, sendcount, sendtype, dst, 
        -:  607:                                  MPIR_ALLTOALL_TAG, recvaddr,
        -:  608:                                  recvcount, recvtype, src,
        -:  609:                                  MPIR_ALLTOALL_TAG, comm, &status);
        -:  610:	/* --BEGIN ERROR HANDLING-- */
     5814:  611:        if (mpi_errno)
        -:  612:	{
    #####:  613:	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0);
    #####:  614:	    return mpi_errno;
        -:  615:	}
        -:  616:	/* --END ERROR HANDLING-- */
        -:  617:    }
        -:  618:
        -:  619:    /* check if multiple threads are calling this collective function */
        -:  620:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
        -:  621:    
     1530:  622:    return (mpi_errno);
        -:  623:}
        -:  624:/* end:nested */
        -:  625:#endif
        -:  626:
        -:  627:#undef FUNCNAME
        -:  628:#define FUNCNAME MPI_Alltoall
        -:  629:
        -:  630:/*@
        -:  631:MPI_Alltoall - Sends data from all to all processes
        -:  632:
        -:  633:Input Parameters:
        -:  634:+ sendbuf - starting address of send buffer (choice) 
        -:  635:. sendcount - number of elements to send to each process (integer) 
        -:  636:. sendtype - data type of send buffer elements (handle) 
        -:  637:. recvcount - number of elements received from any process (integer) 
        -:  638:. recvtype - data type of receive buffer elements (handle) 
        -:  639:- comm - communicator (handle) 
        -:  640:
        -:  641:Output Parameter:
        -:  642:. recvbuf - address of receive buffer (choice) 
        -:  643:
        -:  644:.N ThreadSafe
        -:  645:
        -:  646:.N Fortran
        -:  647:
        -:  648:.N Errors
        -:  649:.N MPI_ERR_COMM
        -:  650:.N MPI_ERR_COUNT
        -:  651:.N MPI_ERR_TYPE
        -:  652:.N MPI_ERR_BUFFER
        -:  653:@*/
        -:  654:int MPI_Alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype, 
        -:  655:                 void *recvbuf, int recvcount, MPI_Datatype recvtype, 
        -:  656:                 MPI_Comm comm)
     5998:  657:{
        -:  658:    static const char FCNAME[] = "MPI_Alltoall";
     5998:  659:    int mpi_errno = MPI_SUCCESS;
     5998:  660:    MPID_Comm *comm_ptr = NULL;
     5998:  661:    MPIU_THREADPRIV_DECL;
        -:  662:    MPID_MPI_STATE_DECL(MPID_STATE_MPI_ALLTOALL);
        -:  663:
     5998:  664:    MPIR_ERRTEST_INITIALIZED_ORDIE();
        -:  665:    
     5998:  666:    MPIU_THREAD_CS_ENTER(ALLFUNC,);
        -:  667:    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_ALLTOALL);
        -:  668:
        -:  669:    /* Validate parameters, especially handles needing to be converted */
        -:  670:#   ifdef HAVE_ERROR_CHECKING
        -:  671:    {
        -:  672:        MPID_BEGIN_ERROR_CHECKS;
        -:  673:        {
     5998:  674:	    MPIR_ERRTEST_COMM(comm, mpi_errno);
     5998:  675:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  676:	}
        -:  677:        MPID_END_ERROR_CHECKS;
        -:  678:    }
        -:  679:#   endif /* HAVE_ERROR_CHECKING */
        -:  680:
        -:  681:    /* Convert MPI object handles to object pointers */
     5996:  682:    MPID_Comm_get_ptr( comm, comm_ptr );
        -:  683:
        -:  684:    /* Validate parameters and objects (post conversion) */
        -:  685:#   ifdef HAVE_ERROR_CHECKING
        -:  686:    {
        -:  687:        MPID_BEGIN_ERROR_CHECKS;
        -:  688:        {
     5996:  689:	    MPID_Datatype *sendtype_ptr=NULL, *recvtype_ptr=NULL;
        -:  690:	    
     5996:  691:            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
     5996:  692:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
     5994:  693:	    MPIR_ERRTEST_COUNT(sendcount, mpi_errno);
     5994:  694:	    MPIR_ERRTEST_COUNT(recvcount, mpi_errno);
     5994:  695:	    MPIR_ERRTEST_DATATYPE(sendtype, "sendtype", mpi_errno);
     5994:  696:	    MPIR_ERRTEST_DATATYPE(recvtype, "recvtype", mpi_errno);
     5994:  697:            if (HANDLE_GET_KIND(sendtype) != HANDLE_KIND_BUILTIN) {
      362:  698:                MPID_Datatype_get_ptr(sendtype, sendtype_ptr);
      362:  699:                MPID_Datatype_valid_ptr( sendtype_ptr, mpi_errno );
      362:  700:                MPID_Datatype_committed_ptr( sendtype_ptr, mpi_errno );
        -:  701:            }
     5994:  702:            if (HANDLE_GET_KIND(recvtype) != HANDLE_KIND_BUILTIN) {
      362:  703:                MPID_Datatype_get_ptr(recvtype, recvtype_ptr);
      362:  704:                MPID_Datatype_valid_ptr( recvtype_ptr, mpi_errno );
      362:  705:                MPID_Datatype_committed_ptr( recvtype_ptr, mpi_errno );
        -:  706:            }
        -:  707:
     5994:  708:            if (comm_ptr->comm_kind == MPID_INTERCOMM) {
     1530:  709:                MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, sendcount, mpi_errno);
        -:  710:            }
     5994:  711:            MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, recvcount, mpi_errno);
     5994:  712:            MPIR_ERRTEST_USERBUFFER(sendbuf,sendcount,sendtype,mpi_errno);
     5994:  713:	    MPIR_ERRTEST_USERBUFFER(recvbuf,recvcount,recvtype,mpi_errno);
        -:  714:
     5994:  715:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  716:        }
        -:  717:        MPID_END_ERROR_CHECKS;
        -:  718:    }
        -:  719:#   endif /* HAVE_ERROR_CHECKING */
        -:  720:
        -:  721:    /* ... body of routine ...  */
        -:  722:
     5990:  723:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Alltoall != NULL)
        -:  724:    {
    #####:  725:	mpi_errno = comm_ptr->coll_fns->Alltoall(sendbuf, sendcount,
        -:  726:                                                 sendtype, recvbuf, recvcount,
        -:  727:                                                 recvtype, comm_ptr);
        -:  728:    }
        -:  729:    else
        -:  730:    {
     5990:  731:	MPIU_THREADPRIV_GET;
        -:  732:
     5990:  733:	MPIR_Nest_incr();
     5990:  734:        if (comm_ptr->comm_kind == MPID_INTRACOMM) 
        -:  735:            /* intracommunicator */
     4460:  736:            mpi_errno = MPIR_Alltoall(sendbuf, sendcount, sendtype,
        -:  737:                                      recvbuf, recvcount, recvtype, comm_ptr); 
        -:  738:        else {
        -:  739:            /* intercommunicator */
     1530:  740:            mpi_errno = MPIR_Alltoall_inter(sendbuf, sendcount,
        -:  741:                                            sendtype, recvbuf,
        -:  742:                                            recvcount, recvtype,
        -:  743:                                            comm_ptr); 
        -:  744:        }
     5990:  745:	MPIR_Nest_decr();
        -:  746:    }
        -:  747:
     5990:  748:    if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  749:
        -:  750:    /* ... end of body of routine ... */
        -:  751:    
     5998:  752:  fn_exit:
        -:  753:    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_ALLTOALL);
     5998:  754:    MPIU_THREAD_CS_EXIT(ALLFUNC,);
     5998:  755:    return mpi_errno;
        -:  756:
        8:  757:  fn_fail:
        -:  758:    /* --BEGIN ERROR HANDLING-- */
        -:  759:#   ifdef HAVE_ERROR_CHECKING
        -:  760:    {
        8:  761:	mpi_errno = MPIR_Err_create_code(
        -:  762:	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_alltoall",
        -:  763:	    "**mpi_alltoall %p %d %D %p %d %D %C", sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm);
        -:  764:    }
        -:  765:#   endif
        8:  766:    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
        8:  767:    goto fn_exit;
        -:  768:    /* --END ERROR HANDLING-- */
        -:  769:}