-:    0:Source:/home/MPI/testing/mpich2/mpich2/src/mpi/coll/reduce.c
        -:    0:Graph:reduce.gcno
        -:    0:Data:reduce.gcda
        -:    0:Runs:4382
        -:    0:Programs:1376
        -:    1:/* -*- Mode: C; c-basic-offset:4 ; -*- */
        -:    2:/*
        -:    3: *
        -:    4: *  (C) 2001 by Argonne National Laboratory.
        -:    5: *      See COPYRIGHT in top-level directory.
        -:    6: */
        -:    7:
        -:    8:#include "mpiimpl.h"
        -:    9:
        -:   10:/* -- Begin Profiling Symbol Block for routine MPI_Reduce */
        -:   11:#if defined(HAVE_PRAGMA_WEAK)
        -:   12:#pragma weak MPI_Reduce = PMPI_Reduce
        -:   13:#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
        -:   14:#pragma _HP_SECONDARY_DEF PMPI_Reduce  MPI_Reduce
        -:   15:#elif defined(HAVE_PRAGMA_CRI_DUP)
        -:   16:#pragma _CRI duplicate MPI_Reduce as PMPI_Reduce
        -:   17:#endif
        -:   18:/* -- End Profiling Symbol Block */
        -:   19:
        -:   20:/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
        -:   21:   the MPI routines */
        -:   22:#ifndef MPICH_MPI_FROM_PMPI
        -:   23:#undef MPI_Reduce
        -:   24:#define MPI_Reduce PMPI_Reduce
        -:   25:
        -:   26:/* This function implements a binomial tree reduce.
        -:   27:
        -:   28:   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
        -:   29: */
        -:   30:#undef FUNCNAME
        -:   31:#define FUNCNAME MPIR_Reduce_binomial
        -:   32:#undef FCNAME
        -:   33:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:   34:static int MPIR_Reduce_binomial ( 
        -:   35:    void *sendbuf, 
        -:   36:    void *recvbuf, 
        -:   37:    int count, 
        -:   38:    MPI_Datatype datatype, 
        -:   39:    MPI_Op op, 
        -:   40:    int root, 
        -:   41:    MPID_Comm *comm_ptr )
  1237969:   42:{
  1237969:   43:    int mpi_errno = MPI_SUCCESS;
        -:   44:    MPI_Status status;
        -:   45:    int comm_size, rank, is_commutative, type_size;
        -:   46:    int mask, relrank, source, lroot;
        -:   47:    MPI_User_function *uop;
        -:   48:    MPI_Aint true_lb, true_extent, extent; 
        -:   49:    void *tmp_buf;
        -:   50:    MPID_Op *op_ptr;
        -:   51:    MPI_Comm comm;
        -:   52:#ifdef HAVE_CXX_BINDING
  1237969:   53:    int is_cxx_uop = 0;
        -:   54:#endif
  1237969:   55:    MPIU_CHKLMEM_DECL(2);
  1237969:   56:    MPIU_THREADPRIV_DECL;
        -:   57:
  1237969:   58:    if (count == 0) return MPI_SUCCESS;
        -:   59:
  1237969:   60:    comm = comm_ptr->handle;
  1237969:   61:    comm_size = comm_ptr->local_size;
  1237969:   62:    rank = comm_ptr->rank;
        -:   63:
        -:   64:    /* set op_errno to 0. stored in perthread structure */
  1237969:   65:    MPIU_THREADPRIV_GET;
  1237969:   66:    MPIU_THREADPRIV_FIELD(op_errno) = 0;
        -:   67:
        -:   68:    /* Create a temporary buffer */
        -:   69:
  1237969:   70:    mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent);  
  1237969:   71:    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
  1237969:   72:    MPID_Datatype_get_extent_macro(datatype, extent);
        -:   73:
  1237969:   74:    if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
  1226748:   75:        is_commutative = 1;
        -:   76:        /* get the function by indexing into the op table */
  1226748:   77:        uop = MPIR_Op_table[op%16 - 1];
        -:   78:    }
        -:   79:    else {
    11221:   80:        MPID_Op_get_ptr(op, op_ptr);
    11221:   81:        if (op_ptr->kind == MPID_OP_USER_NONCOMMUTE)
     5204:   82:            is_commutative = 0;
        -:   83:        else
     6017:   84:            is_commutative = 1;
        -:   85:        
        -:   86:#ifdef HAVE_CXX_BINDING            
    11221:   87:            if (op_ptr->language == MPID_LANG_CXX) {
      599:   88:                uop = (MPI_User_function *) op_ptr->function.c_function;
      599:   89:                is_cxx_uop = 1;
        -:   90:            }
        -:   91:            else
        -:   92:#endif
    10622:   93:        if ((op_ptr->language == MPID_LANG_C))
    10622:   94:            uop = (MPI_User_function *) op_ptr->function.c_function;
        -:   95:        else
    #####:   96:            uop = (MPI_User_function *) op_ptr->function.f77_function;
        -:   97:    }
        -:   98:
        -:   99:    /* I think this is the worse case, so we can avoid an assert() 
        -:  100:     * inside the for loop */
        -:  101:    /* should be buf+{this}? */
        -:  102:    MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
        -:  103:
  1237969:  104:    MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
        -:  105:                        mpi_errno, "temporary buffer");
        -:  106:    /* adjust for potential negative lower bound in datatype */
  1237969:  107:    tmp_buf = (void *)((char*)tmp_buf - true_lb);
        -:  108:    
        -:  109:    /* If I'm not the root, then my recvbuf may not be valid, therefore
        -:  110:       I have to allocate a temporary one */
  1237969:  111:    if (rank != root) {
   744192:  112:        MPIU_CHKLMEM_MALLOC(recvbuf, void *, 
        -:  113:                            count*(MPIR_MAX(extent,true_extent)), 
        -:  114:                            mpi_errno, "receive buffer");
   744192:  115:        recvbuf = (void *)((char*)recvbuf - true_lb);
        -:  116:    }
        -:  117:
  1237969:  118:    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
   794469:  119:        mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf,
        -:  120:                                   count, datatype);
   794469:  121:        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  122:    }
        -:  123:
  1237969:  124:    MPID_Datatype_get_size_macro(datatype, type_size);
        -:  125:
        -:  126:    /* This code is from MPICH-1. */
        -:  127:
        -:  128:    /* Here's the algorithm.  Relative to the root, look at the bit pattern in 
        -:  129:       my rank.  Starting from the right (lsb), if the bit is 1, send to 
        -:  130:       the node with that bit zero and exit; if the bit is 0, receive from the
        -:  131:       node with that bit set and combine (as long as that node is within the
        -:  132:       group)
        -:  133:       
        -:  134:       Note that by receiving with source selection, we guarentee that we get
        -:  135:       the same bits with the same input.  If we allowed the parent to receive 
        -:  136:       the children in any order, then timing differences could cause different
        -:  137:       results (roundoff error, over/underflows in some cases, etc).
        -:  138:       
        -:  139:       Because of the way these are ordered, if root is 0, then this is correct
        -:  140:       for both commutative and non-commutitive operations.  If root is not
        -:  141:       0, then for non-commutitive, we use a root of zero and then send
        -:  142:       the result to the root.  To see this, note that the ordering is
        -:  143:       mask = 1: (ab)(cd)(ef)(gh)            (odds send to evens)
        -:  144:       mask = 2: ((ab)(cd))((ef)(gh))        (3,6 send to 0,4)
        -:  145:       mask = 4: (((ab)(cd))((ef)(gh)))      (4 sends to 0)
        -:  146:       
        -:  147:       Comments on buffering.  
        -:  148:       If the datatype is not contiguous, we still need to pass contiguous 
        -:  149:       data to the user routine.  
        -:  150:       In this case, we should make a copy of the data in some format, 
        -:  151:       and send/operate on that.
        -:  152:       
        -:  153:       In general, we can't use MPI_PACK, because the alignment of that
        -:  154:       is rather vague, and the data may not be re-usable.  What we actually
        -:  155:       need is a "squeeze" operation that removes the skips.
        -:  156:    */
  1237969:  157:    mask    = 0x1;
  1237969:  158:    if (is_commutative) 
  1232765:  159:        lroot   = root;
        -:  160:    else
     5204:  161:        lroot   = 0;
  1237969:  162:    relrank = (rank - lroot + comm_size) % comm_size;
        -:  163:    
  3227258:  164:    while (/*(mask & relrank) == 0 && */mask < comm_size) {
        -:  165:        /* Receive */
  1495512:  166:        if ((mask & relrank) == 0) {
   751320:  167:            source = (relrank | mask);
   751320:  168:            if (source < comm_size) {
   744191:  169:                source = (source + lroot) % comm_size;
   744191:  170:                mpi_errno = MPIC_Recv (tmp_buf, count, datatype, source, 
        -:  171:                                       MPIR_REDUCE_TAG, comm, &status);
   744191:  172:                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  173:
        -:  174:                /* The sender is above us, so the received buffer must be
        -:  175:                   the second argument (in the noncommutative case). */
   744191:  176:                if (is_commutative) {
        -:  177:#ifdef HAVE_CXX_BINDING
   741109:  178:                    if (is_cxx_uop) {
      411:  179:                        (*MPIR_Process.cxx_call_op_fn)( tmp_buf, recvbuf, 
        -:  180:                                                        count, datatype, uop );
        -:  181:                    }
        -:  182:                    else 
        -:  183:#endif
   740698:  184:                        (*uop)(tmp_buf, recvbuf, &count, &datatype);
        -:  185:                }
        -:  186:                else {
        -:  187:#ifdef HAVE_CXX_BINDING
     3082:  188:                    if (is_cxx_uop) {
    #####:  189:                        (*MPIR_Process.cxx_call_op_fn)( recvbuf, tmp_buf,
        -:  190:                                                        count, datatype, uop );
        -:  191:                    }
        -:  192:                    else 
        -:  193:#endif
     3082:  194:                        (*uop)(recvbuf, tmp_buf, &count, &datatype);
     3082:  195:                    mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype,
        -:  196:                                               recvbuf, count, datatype);
     3082:  197:                    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  198:                }
        -:  199:            }
        -:  200:        }
        -:  201:        else {
        -:  202:            /* I've received all that I'm going to.  Send my result to 
        -:  203:               my parent */
   744192:  204:            source = ((relrank & (~ mask)) + lroot) % comm_size;
   744192:  205:            mpi_errno  = MPIC_Send( recvbuf, count, datatype, 
        -:  206:                                    source, MPIR_REDUCE_TAG, comm );
   744192:  207:            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  208:            break;
        -:  209:        }
   751320:  210:        mask <<= 1;
        -:  211:    }
        -:  212:
  1237969:  213:    if (!is_commutative && (root != 0))
        -:  214:    {
     3084:  215:        if (rank == 0)
        -:  216:        {
      788:  217:            mpi_errno  = MPIC_Send( recvbuf, count, datatype, root, 
        -:  218:                                    MPIR_REDUCE_TAG, comm );
        -:  219:        }
     2296:  220:        else if (rank == root)
        -:  221:        {
      788:  222:            mpi_errno = MPIC_Recv ( recvbuf, count, datatype, 0, 
        -:  223:                                    MPIR_REDUCE_TAG, comm, &status);
        -:  224:        }
     3084:  225:        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  226:    }
        -:  227:
        -:  228:    /* FIXME does this need to be checked after each uop invocation for
        -:  229:       predefined operators? */
        -:  230:    /* --BEGIN ERROR HANDLING-- */
  1237969:  231:    if (MPIU_THREADPRIV_FIELD(op_errno)) {
    #####:  232:        mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
    #####:  233:        goto fn_fail;
        -:  234:    }
        -:  235:    /* --END ERROR HANDLING-- */
        -:  236:
        -:  237:fn_exit:
  1982161:  238:    MPIU_CHKLMEM_FREEALL();
  1237969:  239:    return mpi_errno;
        -:  240:fn_fail:
        -:  241:    goto fn_exit;
        -:  242:}
        -:  243:
        -:  244:/* An implementation of Rabenseifner's reduce algorithm (see
        -:  245:   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html).
        -:  246:
        -:  247:   This algorithm implements the reduce in two steps: first a
        -:  248:   reduce-scatter, followed by a gather to the root. A
        -:  249:   recursive-halving algorithm (beginning with processes that are
        -:  250:   distance 1 apart) is used for the reduce-scatter, and a binomial tree
        -:  251:   algorithm is used for the gather. The non-power-of-two case is
        -:  252:   handled by dropping to the nearest lower power-of-two: the first
        -:  253:   few odd-numbered processes send their data to their left neighbors
        -:  254:   (rank-1), and the reduce-scatter happens among the remaining
        -:  255:   power-of-two processes. If the root is one of the excluded
        -:  256:   processes, then after the reduce-scatter, rank 0 sends its result to
        -:  257:   the root and exits; the root now acts as rank 0 in the binomial tree
        -:  258:   algorithm for gather.
        -:  259:
        -:  260:   For the power-of-two case, the cost for the reduce-scatter is 
        -:  261:   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
        -:  262:   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
        -:  263:   total cost is:
        -:  264:   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
        -:  265:
        -:  266:   For the non-power-of-two case, assuming the root is not one of the
        -:  267:   odd-numbered processes that get excluded in the reduce-scatter,
        -:  268:   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta + 
        -:  269:           n.(1+(p-1)/p).gamma
        -:  270:*/
        -:  271:#undef FUNCNAME
        -:  272:#define FUNCNAME MPIR_Reduce_redscat_gather
        -:  273:#undef FCNAME
        -:  274:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  275:static int MPIR_Reduce_redscat_gather ( 
        -:  276:    void *sendbuf, 
        -:  277:    void *recvbuf, 
        -:  278:    int count, 
        -:  279:    MPI_Datatype datatype, 
        -:  280:    MPI_Op op, 
        -:  281:    int root, 
        -:  282:    MPID_Comm *comm_ptr )
    11003:  283:{
    11003:  284:    int mpi_errno = MPI_SUCCESS;
        -:  285:    int comm_size, rank, is_commutative, type_size, pof2, rem, newrank;
    11003:  286:    int mask, *cnts, *disps, i, j, send_idx=0;
    11003:  287:    int recv_idx, last_idx=0, newdst;
        -:  288:    int dst, send_cnt, recv_cnt, newroot, newdst_tree_root, newroot_tree_root; 
        -:  289:    MPI_User_function *uop;
        -:  290:    MPI_Aint true_lb, true_extent, extent; 
        -:  291:    void *tmp_buf;
        -:  292:    MPID_Op *op_ptr;
        -:  293:    MPI_Comm comm;
        -:  294:#ifdef HAVE_CXX_BINDING
    11003:  295:    int is_cxx_uop = 0;
        -:  296:#endif
    11003:  297:    MPIU_CHKLMEM_DECL(4);
    11003:  298:    MPIU_THREADPRIV_DECL;
        -:  299:
    11003:  300:    comm = comm_ptr->handle;
    11003:  301:    comm_size = comm_ptr->local_size;
    11003:  302:    rank = comm_ptr->rank;
        -:  303:
        -:  304:    /* set op_errno to 0. stored in perthread structure */
    11003:  305:    MPIU_THREADPRIV_GET;
    11003:  306:    MPIU_THREADPRIV_FIELD(op_errno) = 0;
        -:  307:
        -:  308:    /* Create a temporary buffer */
        -:  309:
    11003:  310:    mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent);  
    11003:  311:    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    11003:  312:    MPID_Datatype_get_extent_macro(datatype, extent);
        -:  313:
    11003:  314:    if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
    11003:  315:        is_commutative = 1;
        -:  316:        /* get the function by indexing into the op table */
    11003:  317:        uop = MPIR_Op_table[op%16 - 1];
        -:  318:    }
        -:  319:    else {
    #####:  320:        MPID_Op_get_ptr(op, op_ptr);
    #####:  321:        if (op_ptr->kind == MPID_OP_USER_NONCOMMUTE)
    #####:  322:            is_commutative = 0;
        -:  323:        else
    #####:  324:            is_commutative = 1;
        -:  325:        
        -:  326:#ifdef HAVE_CXX_BINDING            
    #####:  327:            if (op_ptr->language == MPID_LANG_CXX) {
    #####:  328:                uop = (MPI_User_function *) op_ptr->function.c_function;
    #####:  329:                is_cxx_uop = 1;
        -:  330:            }
        -:  331:            else
        -:  332:#endif
    #####:  333:        if ((op_ptr->language == MPID_LANG_C))
    #####:  334:            uop = (MPI_User_function *) op_ptr->function.c_function;
        -:  335:        else
    #####:  336:            uop = (MPI_User_function *) op_ptr->function.f77_function;
        -:  337:    }
        -:  338:
        -:  339:    /* I think this is the worse case, so we can avoid an assert() 
        -:  340:     * inside the for loop */
        -:  341:    /* should be buf+{this}? */
        -:  342:    MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
        -:  343:
    11003:  344:    MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
        -:  345:                        mpi_errno, "temporary buffer");
        -:  346:    /* adjust for potential negative lower bound in datatype */
    11003:  347:    tmp_buf = (void *)((char*)tmp_buf - true_lb);
        -:  348:    
        -:  349:    /* If I'm not the root, then my recvbuf may not be valid, therefore
        -:  350:       I have to allocate a temporary one */
    11003:  351:    if (rank != root) {
     6209:  352:        MPIU_CHKLMEM_MALLOC(recvbuf, void *, 
        -:  353:                            count*(MPIR_MAX(extent,true_extent)), 
        -:  354:                            mpi_errno, "receive buffer");
     6209:  355:        recvbuf = (void *)((char*)recvbuf - true_lb);
        -:  356:    }
        -:  357:
    11003:  358:    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    10358:  359:        mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf,
        -:  360:                                   count, datatype);
    10358:  361:        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  362:    }
        -:  363:
    11003:  364:    MPID_Datatype_get_size_macro(datatype, type_size);
        -:  365:
        -:  366:    /* find nearest power-of-two less than or equal to comm_size */
    11003:  367:    pof2 = 1;
    11003:  368:    while (pof2 <= comm_size) pof2 <<= 1;
    11003:  369:    pof2 >>=1;
        -:  370:
    11003:  371:    rem = comm_size - pof2;
        -:  372:
        -:  373:    /* In the non-power-of-two case, all odd-numbered
        -:  374:       processes of rank < 2*rem send their data to
        -:  375:       (rank-1). These odd-numbered processes no longer
        -:  376:       participate in the algorithm until the very end. The
        -:  377:       remaining processes form a nice power-of-two. 
        -:  378:
        -:  379:       Note that in MPI_Allreduce we have the even-numbered processes
        -:  380:       send data to odd-numbered processes. That is better for
        -:  381:       non-commutative operations because it doesn't require a
        -:  382:       buffer copy. However, for MPI_Reduce, the most common case
        -:  383:       is commutative operations with root=0. Therefore we want
        -:  384:       even-numbered processes to participate the computation for
        -:  385:       the root=0 case, in order to avoid an extra send-to-root
        -:  386:       communication after the reduce-scatter. In MPI_Allreduce it
        -:  387:       doesn't matter because all processes must get the result. */
        -:  388:    
    11003:  389:    if (rank < 2*rem) {
     2062:  390:        if (rank % 2 != 0) { /* odd */
     1031:  391:            mpi_errno = MPIC_Send(recvbuf, count, 
        -:  392:                                  datatype, rank-1,
        -:  393:                                  MPIR_REDUCE_TAG, comm);
     1031:  394:            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  395:            
        -:  396:            /* temporarily set the rank to -1 so that this
        -:  397:               process does not pariticipate in recursive
        -:  398:               doubling */
     1031:  399:            newrank = -1; 
        -:  400:        }
        -:  401:        else { /* even */
     1031:  402:            mpi_errno = MPIC_Recv(tmp_buf, count, 
        -:  403:                                  datatype, rank+1,
        -:  404:                                  MPIR_REDUCE_TAG, comm,
        -:  405:                                  MPI_STATUS_IGNORE);
     1031:  406:            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  407:            
        -:  408:            /* do the reduction on received data. */
        -:  409:            /* This algorithm is used only for predefined ops
        -:  410:               and predefined ops are always commutative. */
        -:  411:#ifdef HAVE_CXX_BINDING
     1031:  412:            if (is_cxx_uop) {
    #####:  413:                (*MPIR_Process.cxx_call_op_fn)( tmp_buf, recvbuf, 
        -:  414:                                                count,
        -:  415:                                                datatype,
        -:  416:                                                uop ); 
        -:  417:            }
        -:  418:            else 
        -:  419:#endif
     1031:  420:                (*uop)(tmp_buf, recvbuf, &count, &datatype);
        -:  421:
        -:  422:            /* change the rank */
     1031:  423:            newrank = rank / 2;
        -:  424:        }
        -:  425:    }
        -:  426:    else  /* rank >= 2*rem */
     8941:  427:        newrank = rank - rem;
        -:  428:    
        -:  429:    /* for the reduce-scatter, calculate the count that
        -:  430:       each process receives and the displacement within
        -:  431:       the buffer */
        -:  432:
        -:  433:    /* We allocate these arrays on all processes, even if newrank=-1,
        -:  434:       because if root is one of the excluded processes, we will
        -:  435:       need them on the root later on below. */
    11003:  436:    MPIU_CHKLMEM_MALLOC(cnts, int *, pof2*sizeof(int), mpi_errno, "counts");
    11003:  437:    MPIU_CHKLMEM_MALLOC(disps, int *, pof2*sizeof(int), mpi_errno, "displacements");
        -:  438:    
    11003:  439:    if (newrank != -1) {
    36402:  440:        for (i=0; i<(pof2-1); i++) 
    26430:  441:            cnts[i] = count/pof2;
     9972:  442:        cnts[pof2-1] = count - (count/pof2)*(pof2-1);
        -:  443:        
     9972:  444:        disps[0] = 0;
    36402:  445:        for (i=1; i<pof2; i++)
    26430:  446:            disps[i] = disps[i-1] + cnts[i-1];
        -:  447:        
     9972:  448:        mask = 0x1;
     9972:  449:        send_idx = recv_idx = 0;
     9972:  450:        last_idx = pof2;
    34342:  451:        while (mask < pof2) {
    14398:  452:            newdst = newrank ^ mask;
        -:  453:            /* find real rank of dest */
    14398:  454:            dst = (newdst < rem) ? newdst*2 : newdst + rem;
        -:  455:            
    14398:  456:            send_cnt = recv_cnt = 0;
    14398:  457:            if (newrank < newdst) {
     7199:  458:                send_idx = recv_idx + pof2/(mask*2);
    20414:  459:                for (i=send_idx; i<last_idx; i++)
    13215:  460:                    send_cnt += cnts[i];
    20414:  461:                for (i=recv_idx; i<send_idx; i++)
    13215:  462:                    recv_cnt += cnts[i];
        -:  463:            }
        -:  464:            else {
     7199:  465:                recv_idx = send_idx + pof2/(mask*2);
    20414:  466:                for (i=send_idx; i<recv_idx; i++)
    13215:  467:                    send_cnt += cnts[i];
    20414:  468:                for (i=recv_idx; i<last_idx; i++)
    13215:  469:                    recv_cnt += cnts[i];
        -:  470:            }
        -:  471:            
        -:  472:/*                    printf("Rank %d, send_idx %d, recv_idx %d, send_cnt %d, recv_cnt %d, last_idx %d\n", newrank, send_idx, recv_idx,
        -:  473:                  send_cnt, recv_cnt, last_idx);
        -:  474:*/
        -:  475:            /* Send data from recvbuf. Recv into tmp_buf */ 
    14398:  476:            mpi_errno = MPIC_Sendrecv((char *) recvbuf +
        -:  477:                                      disps[send_idx]*extent,
        -:  478:                                      send_cnt, datatype,  
        -:  479:                                      dst, MPIR_REDUCE_TAG, 
        -:  480:                                      (char *) tmp_buf +
        -:  481:                                      disps[recv_idx]*extent,
        -:  482:                                      recv_cnt, datatype, dst,
        -:  483:                                      MPIR_REDUCE_TAG, comm,
        -:  484:                                      MPI_STATUS_IGNORE);
    14398:  485:            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  486:            
        -:  487:            /* tmp_buf contains data received in this step.
        -:  488:               recvbuf contains data accumulated so far */
        -:  489:            
        -:  490:            /* This algorithm is used only for predefined ops
        -:  491:               and predefined ops are always commutative. */
        -:  492:#ifdef HAVE_CXX_BINDING
    14398:  493:            if (is_cxx_uop) {
    #####:  494:                (*MPIR_Process.cxx_call_op_fn)((char *) tmp_buf +
        -:  495:                                               disps[recv_idx]*extent,
        -:  496:                                               (char *) recvbuf + 
        -:  497:                                               disps[recv_idx]*extent, 
        -:  498:                                               recv_cnt, datatype, uop);
        -:  499:            }
        -:  500:            else 
        -:  501:#endif
    14398:  502:                (*uop)((char *) tmp_buf + disps[recv_idx]*extent,
        -:  503:                       (char *) recvbuf + disps[recv_idx]*extent, 
        -:  504:                       &recv_cnt, &datatype);
        -:  505:            
        -:  506:            /* update send_idx for next iteration */
    14398:  507:            send_idx = recv_idx;
    14398:  508:            mask <<= 1;
        -:  509:
        -:  510:            /* update last_idx, but not in last iteration
        -:  511:               because the value is needed in the gather
        -:  512:               step below. */
    14398:  513:            if (mask < pof2)
     6768:  514:                last_idx = recv_idx + pof2/mask;
        -:  515:        }
        -:  516:    }
        -:  517:
        -:  518:    /* now do the gather to root */
        -:  519:    
        -:  520:    /* Is root one of the processes that was excluded from the
        -:  521:       computation above? If so, send data from newrank=0 to
        -:  522:       the root and have root take on the role of newrank = 0 */ 
        -:  523:
    11003:  524:    if (root < 2*rem) {
     2242:  525:        if (root % 2 != 0) {
      875:  526:            if (rank == root) {    /* recv */
        -:  527:                /* initialize the arrays that weren't initialized */
      658:  528:                for (i=0; i<(pof2-1); i++) 
      539:  529:                    cnts[i] = count/pof2;
      119:  530:                cnts[pof2-1] = count - (count/pof2)*(pof2-1);
        -:  531:                
      119:  532:                disps[0] = 0;
      658:  533:                for (i=1; i<pof2; i++)
      539:  534:                    disps[i] = disps[i-1] + cnts[i-1];
        -:  535:                
      119:  536:                mpi_errno = MPIC_Recv(recvbuf, cnts[0], datatype,  
        -:  537:                                      0, MPIR_REDUCE_TAG, comm,
        -:  538:                                      MPI_STATUS_IGNORE);
      119:  539:                newrank = 0;
      119:  540:                send_idx = 0;
      119:  541:                last_idx = 2;
        -:  542:            }
      756:  543:            else if (newrank == 0) {  /* send */
      119:  544:                mpi_errno = MPIC_Send(recvbuf, cnts[0], datatype,  
        -:  545:                                      root, MPIR_REDUCE_TAG, comm);
      119:  546:                newrank = -1;
        -:  547:            }
      875:  548:            newroot = 0;
        -:  549:        }
     1367:  550:        else newroot = root / 2;
        -:  551:    }
        -:  552:    else
     8761:  553:        newroot = root - rem;
        -:  554:
    11003:  555:    if (newrank != -1) {
     9972:  556:        j = 0;
     9972:  557:        mask = 0x1;
    34342:  558:        while (mask < pof2) {
    14398:  559:            mask <<= 1;
    14398:  560:            j++;
        -:  561:        }
     9972:  562:        mask >>= 1;
     9972:  563:        j--;
    25122:  564:        while (mask > 0) {
    10356:  565:            newdst = newrank ^ mask;
        -:  566:
        -:  567:            /* find real rank of dest */
    10356:  568:            dst = (newdst < rem) ? newdst*2 : newdst + rem;
        -:  569:            /* if root is playing the role of newdst=0, adjust for
        -:  570:               it */
    10356:  571:            if ((newdst == 0) && (root < 2*rem) && (root % 2 != 0))
      280:  572:                dst = root;
        -:  573:            
        -:  574:            /* if the root of newdst's half of the tree is the
        -:  575:               same as the root of newroot's half of the tree, send to
        -:  576:               newdst and exit, else receive from newdst. */
        -:  577:
    10356:  578:            newdst_tree_root = newdst >> j;
    10356:  579:            newdst_tree_root <<= j;
        -:  580:            
    10356:  581:            newroot_tree_root = newroot >> j;
    10356:  582:            newroot_tree_root <<= j;
        -:  583:
    10356:  584:            send_cnt = recv_cnt = 0;
    10356:  585:            if (newrank < newdst) {
        -:  586:                /* update last_idx except on first iteration */
     5178:  587:                if (mask != pof2/2)
     1363:  588:                    last_idx = last_idx + pof2/(mask*2);
        -:  589:                
     5178:  590:                recv_idx = send_idx + pof2/(mask*2);
    12377:  591:                for (i=send_idx; i<recv_idx; i++)
     7199:  592:                    send_cnt += cnts[i];
    12377:  593:                for (i=recv_idx; i<last_idx; i++)
     7199:  594:                    recv_cnt += cnts[i];
        -:  595:            }
        -:  596:            else {
     5178:  597:                recv_idx = send_idx - pof2/(mask*2);
    12377:  598:                for (i=send_idx; i<last_idx; i++)
     7199:  599:                    send_cnt += cnts[i];
    12377:  600:                for (i=recv_idx; i<send_idx; i++)
     7199:  601:                    recv_cnt += cnts[i];
        -:  602:            }
        -:  603:            
    10356:  604:            if (newdst_tree_root == newroot_tree_root) {
        -:  605:                /* send and exit */
        -:  606:                /* printf("Rank %d, send_idx %d, send_cnt %d, last_idx %d\n", newrank, send_idx, send_cnt, last_idx);
        -:  607:                   fflush(stdout); */
        -:  608:                /* Send data from recvbuf. Recv into tmp_buf */ 
     5178:  609:                mpi_errno = MPIC_Send((char *) recvbuf +
        -:  610:                                      disps[send_idx]*extent,
        -:  611:                                      send_cnt, datatype,  
        -:  612:                                      dst, MPIR_REDUCE_TAG, 
        -:  613:                                      comm);
     5178:  614:                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  615:                break;
        -:  616:            }
        -:  617:            else {
        -:  618:                /* recv and continue */
        -:  619:                /* printf("Rank %d, recv_idx %d, recv_cnt %d, last_idx %d\n", newrank, recv_idx, recv_cnt, last_idx);
        -:  620:                   fflush(stdout); */
     5178:  621:                mpi_errno = MPIC_Recv((char *) recvbuf +
        -:  622:                                      disps[recv_idx]*extent,
        -:  623:                                      recv_cnt, datatype, dst,
        -:  624:                                      MPIR_REDUCE_TAG, comm,
        -:  625:                                      MPI_STATUS_IGNORE);
     5178:  626:                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  627:            }
        -:  628:            
     5178:  629:            if (newrank > newdst) send_idx = recv_idx;
        -:  630:            
     5178:  631:            mask >>= 1;
     5178:  632:            j--;
        -:  633:        }
        -:  634:    }
        -:  635:
        -:  636:    /* FIXME does this need to be checked after each uop invocation for
        -:  637:       predefined operators? */
        -:  638:    /* --BEGIN ERROR HANDLING-- */
    11003:  639:    if (MPIU_THREADPRIV_FIELD(op_errno)) {
    #####:  640:        mpi_errno = MPIU_THREADPRIV_FIELD(op_errno);
    #####:  641:        goto fn_fail;
        -:  642:    }
        -:  643:    /* --END ERROR HANDLING-- */
        -:  644:
        -:  645:fn_exit:
    39218:  646:    MPIU_CHKLMEM_FREEALL();
    11003:  647:    return mpi_errno;
        -:  648:fn_fail:
        -:  649:    goto fn_exit;
        -:  650:}
        -:  651:
        -:  652:/* This is the default implementation of reduce. The algorithm is:
        -:  653:   
        -:  654:   Algorithm: MPI_Reduce
        -:  655:
        -:  656:   For long messages and for builtin ops and if count >= pof2 (where
        -:  657:   pof2 is the nearest power-of-two less than or equal to the number
        -:  658:   of processes), we use Rabenseifner's algorithm (see 
        -:  659:   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
        -:  660:   This algorithm implements the reduce in two steps: first a
        -:  661:   reduce-scatter, followed by a gather to the root. A
        -:  662:   recursive-halving algorithm (beginning with processes that are
        -:  663:   distance 1 apart) is used for the reduce-scatter, and a binomial tree
        -:  664:   algorithm is used for the gather. The non-power-of-two case is
        -:  665:   handled by dropping to the nearest lower power-of-two: the first
        -:  666:   few odd-numbered processes send their data to their left neighbors
        -:  667:   (rank-1), and the reduce-scatter happens among the remaining
        -:  668:   power-of-two processes. If the root is one of the excluded
        -:  669:   processes, then after the reduce-scatter, rank 0 sends its result to
        -:  670:   the root and exits; the root now acts as rank 0 in the binomial tree
        -:  671:   algorithm for gather.
        -:  672:
        -:  673:   For the power-of-two case, the cost for the reduce-scatter is 
        -:  674:   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
        -:  675:   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
        -:  676:   total cost is:
        -:  677:   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
        -:  678:
        -:  679:   For the non-power-of-two case, assuming the root is not one of the
        -:  680:   odd-numbered processes that get excluded in the reduce-scatter,
        -:  681:   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta + 
        -:  682:           n.(1+(p-1)/p).gamma
        -:  683:
        -:  684:
        -:  685:   For short messages, user-defined ops, and count < pof2, we use a
        -:  686:   binomial tree algorithm for both short and long messages. 
        -:  687:
        -:  688:   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
        -:  689:
        -:  690:
        -:  691:   We use the binomial tree algorithm in the case of user-defined ops
        -:  692:   because in this case derived datatypes are allowed, and the user
        -:  693:   could pass basic datatypes on one process and derived on another as
        -:  694:   long as the type maps are the same. Breaking up derived datatypes
        -:  695:   to do the reduce-scatter is tricky.
        -:  696:
        -:  697:   FIXME: Per the MPI-2.1 standard this case is not possible.  We
        -:  698:   should be able to use the reduce-scatter/gather approach as long as
        -:  699:   count >= pof2.  [goodell@ 2009-01-21]
        -:  700:
        -:  701:   Possible improvements: 
        -:  702:
        -:  703:   End Algorithm: MPI_Reduce
        -:  704:*/
        -:  705:
        -:  706:/* begin:nested */
        -:  707:/* not declared static because a machine-specific function may call this one 
        -:  708:   in some cases */
        -:  709:#undef FUNCNAME
        -:  710:#define FUNCNAME MPIR_Reduce
        -:  711:#undef FCNAME
        -:  712:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  713:int MPIR_Reduce ( 
        -:  714:    void *sendbuf, 
        -:  715:    void *recvbuf, 
        -:  716:    int count, 
        -:  717:    MPI_Datatype datatype, 
        -:  718:    MPI_Op op, 
        -:  719:    int root, 
        -:  720:    MPID_Comm *comm_ptr )
  1248972:  721:{
  1248972:  722:    int mpi_errno = MPI_SUCCESS;
        -:  723:    int comm_size, is_commutative, type_size, pof2;
        -:  724:    MPID_Op *op_ptr;
  1248972:  725:    MPIU_THREADPRIV_DECL;
        -:  726:
  1248972:  727:    if (count == 0) return MPI_SUCCESS;
        -:  728:
  1248972:  729:    MPIU_THREADPRIV_GET;
  1248972:  730:    MPIR_Nest_incr();
        -:  731:    
  1248972:  732:    comm_size = comm_ptr->local_size;
        -:  733:    
  1248972:  734:    if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
  1237751:  735:        is_commutative = 1;
        -:  736:    }
        -:  737:    else {
    11221:  738:        MPID_Op_get_ptr(op, op_ptr);
    11221:  739:        if (op_ptr->kind == MPID_OP_USER_NONCOMMUTE)
     5204:  740:            is_commutative = 0;
        -:  741:        else
     6017:  742:            is_commutative = 1;
        -:  743:    }
        -:  744:    
  1248972:  745:    MPID_Datatype_get_size_macro(datatype, type_size);
        -:  746:
        -:  747:    /* find nearest power-of-two less than or equal to comm_size */
  1248972:  748:    pof2 = 1;
  1248972:  749:    while (pof2 <= comm_size) pof2 <<= 1;
  1248972:  750:    pof2 >>=1;
        -:  751:
        -:  752:    /* check if multiple threads are calling this collective function */
        -:  753:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
        -:  754:        
  1248972:  755:    if ((count*type_size > MPIR_REDUCE_SHORT_MSG) &&
        -:  756:        (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) && (count >= pof2)) {
        -:  757:        /* do a reduce-scatter followed by gather to root. */
    11003:  758:        mpi_errno = MPIR_Reduce_redscat_gather(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
    11003:  759:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  760:    }
        -:  761:    else {
        -:  762:        /* use a binomial tree algorithm */ 
  1237969:  763:        mpi_errno = MPIR_Reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm_ptr);
  1237969:  764:        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        -:  765:    }
        -:  766:        
        -:  767:    /* check if multiple threads are calling this collective function */
        -:  768:    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
        -:  769:
  1248972:  770:  fn_exit:
  1248972:  771:    MPIR_Nest_decr();
  1248972:  772:    return (mpi_errno);
        -:  773:
        -:  774:  fn_fail:
        -:  775:    goto fn_exit;
        -:  776:}
        -:  777:/* end:nested */
        -:  778:
        -:  779:/* A simple utility function to that calls the comm_ptr->coll_fns->Reduce
        -:  780:override if it exists or else it calls MPIR_Reduce with the same arguments. */
        -:  781:#undef FUNCNAME
        -:  782:#define FUNCNAME MPIR_Reduce_or_coll_fn
        -:  783:#undef FCNAME
        -:  784:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  785:int MPIR_Reduce_or_coll_fn(
        -:  786:    void *sendbuf, 
        -:  787:    void *recvbuf, 
        -:  788:    int count, 
        -:  789:    MPI_Datatype datatype, 
        -:  790:    MPI_Op op, 
        -:  791:    int root, 
        -:  792:    MPID_Comm *comm_ptr )
  1234883:  793:{
  1234883:  794:    int mpi_errno = MPI_SUCCESS;
        -:  795:
  1234883:  796:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Reduce != NULL)
        -:  797:    {
        -:  798:        /* --BEGIN USEREXTENSION-- */
    #####:  799:        mpi_errno = comm_ptr->coll_fns->Reduce(sendbuf, recvbuf, count,
        -:  800:                                               datatype, op, root, comm_ptr);
        -:  801:        /* --END USEREXTENSION-- */
        -:  802:    }
        -:  803:    else {
  1234883:  804:        mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count,
        -:  805:				datatype, op, root, comm_ptr);
        -:  806:    }
        -:  807:
  1234883:  808:    return mpi_errno;
        -:  809:}
        -:  810:
        -:  811:/* begin:nested */
        -:  812:/* Needed in intercommunicator allreduce */
        -:  813:#undef FUNCNAME
        -:  814:#define FUNCNAME MPIR_Reduce_inter
        -:  815:#undef FCNAME
        -:  816:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  817:int MPIR_Reduce_inter ( 
        -:  818:    void *sendbuf, 
        -:  819:    void *recvbuf, 
        -:  820:    int count, 
        -:  821:    MPI_Datatype datatype, 
        -:  822:    MPI_Op op, 
        -:  823:    int root, 
        -:  824:    MPID_Comm *comm_ptr )
     4320:  825:{
        -:  826:/*  Intercommunicator reduce.
        -:  827:    Remote group does a local intracommunicator
        -:  828:    reduce to rank 0. Rank 0 then sends data to root.
        -:  829:
        -:  830:    Cost: (lgp+1).alpha + n.(lgp+1).beta
        -:  831:*/
        -:  832:
        -:  833:    int rank, mpi_errno;
        -:  834:    MPI_Status status;
        -:  835:    MPI_Aint true_extent, true_lb, extent;
     4320:  836:    void *tmp_buf=NULL;
     4320:  837:    MPID_Comm *newcomm_ptr = NULL;
        -:  838:    MPI_Comm comm;
     4320:  839:    MPIU_THREADPRIV_DECL;
     4320:  840:    MPIU_CHKLMEM_DECL(1);
        -:  841:
     4320:  842:    if (root == MPI_PROC_NULL) {
        -:  843:        /* local processes other than root do nothing */
     1168:  844:        return MPI_SUCCESS;
        -:  845:    }
        -:  846:
     3152:  847:    MPIU_THREADPRIV_GET;
     3152:  848:    MPIR_Nest_incr();
        -:  849:    
     3152:  850:    comm = comm_ptr->handle;
        -:  851:
     3152:  852:    if (root == MPI_ROOT) {
        -:  853:            /* root receives data from rank 0 on remote group */
        -:  854:        MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
      768:  855:        mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
        -:  856:                              MPIR_REDUCE_TAG, comm, &status);
        -:  857:        MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); 
      768:  858:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  859:    }
        -:  860:    else {
        -:  861:        /* remote group. Rank 0 allocates temporary buffer, does
        -:  862:           local intracommunicator reduce, and then sends the data
        -:  863:           to root. */
        -:  864:        
     2384:  865:        rank = comm_ptr->rank;
        -:  866:        
     2384:  867:        if (rank == 0) {
      768:  868:            mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb,
        -:  869:                                                  &true_extent);
      768:  870:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  871:
      768:  872:            MPID_Datatype_get_extent_macro(datatype, extent);
        -:  873:	    /* I think this is the worse case, so we can avoid an assert() 
        -:  874:	     * inside the for loop */
        -:  875:	    /* Should MPIU_CHKLMEM_MALLOC do this? */
        -:  876:	    MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
      768:  877:	    MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
        -:  878:            /* adjust for potential negative lower bound in datatype */
      768:  879:            tmp_buf = (void *)((char*)tmp_buf - true_lb);
        -:  880:        }
        -:  881:        
        -:  882:        /* Get the local intracommunicator */
     2384:  883:        if (!comm_ptr->local_comm)
      149:  884:            MPIR_Setup_intercomm_localcomm( comm_ptr );
        -:  885:
     2384:  886:        newcomm_ptr = comm_ptr->local_comm;
        -:  887:        
        -:  888:        /* now do a local reduce on this intracommunicator */
     2384:  889:        mpi_errno = MPIR_Reduce(sendbuf, tmp_buf, count, datatype,
        -:  890:                                op, 0, newcomm_ptr);
     2384:  891:	if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  892:
     2384:  893:        if (rank == 0)
        -:  894:	{
        -:  895:            MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
      768:  896:            mpi_errno = MPIC_Send(tmp_buf, count, datatype, root,
        -:  897:                                  MPIR_REDUCE_TAG, comm); 
        -:  898:            MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
      768:  899:	    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
        -:  900:        }
        -:  901:    }
        -:  902:
        -:  903:  fn_exit:
      768:  904:    MPIU_CHKLMEM_FREEALL();
     3152:  905:    MPIR_Nest_decr();
     3152:  906:    return mpi_errno;
        -:  907:
        -:  908:  fn_fail:
        -:  909:    goto fn_exit;
        -:  910:}
        -:  911:/* end:nested */
        -:  912:#endif
        -:  913:
        -:  914:
        -:  915:#undef FUNCNAME
        -:  916:#define FUNCNAME MPI_Reduce
        -:  917:#undef FCNAME
        -:  918:#define FCNAME MPIU_QUOTE(FUNCNAME)
        -:  919:
        -:  920:/*@
        -:  921:
        -:  922:MPI_Reduce - Reduces values on all processes to a single value
        -:  923:
        -:  924:Input Parameters:
        -:  925:+ sendbuf - address of send buffer (choice) 
        -:  926:. count - number of elements in send buffer (integer) 
        -:  927:. datatype - data type of elements of send buffer (handle) 
        -:  928:. op - reduce operation (handle) 
        -:  929:. root - rank of root process (integer) 
        -:  930:- comm - communicator (handle) 
        -:  931:
        -:  932:Output Parameter:
        -:  933:. recvbuf - address of receive buffer (choice, 
        -:  934: significant only at 'root') 
        -:  935:
        -:  936:.N ThreadSafe
        -:  937:
        -:  938:.N Fortran
        -:  939:
        -:  940:.N collops
        -:  941:
        -:  942:.N Errors
        -:  943:.N MPI_SUCCESS
        -:  944:.N MPI_ERR_COMM
        -:  945:.N MPI_ERR_COUNT
        -:  946:.N MPI_ERR_TYPE
        -:  947:.N MPI_ERR_BUFFER
        -:  948:.N MPI_ERR_BUFFER_ALIAS
        -:  949:
        -:  950:@*/
        -:  951:int MPI_Reduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, 
        -:  952:	       MPI_Op op, int root, MPI_Comm comm)
    48907:  953:{
    48907:  954:    int mpi_errno = MPI_SUCCESS;
    48907:  955:    MPID_Comm *comm_ptr = NULL;
        -:  956:#if defined(USE_SMP_COLLECTIVES)
    48907:  957:    MPIU_CHKLMEM_DECL(1);
        -:  958:#endif
    48907:  959:    MPIU_THREADPRIV_DECL;
        -:  960:    MPID_MPI_STATE_DECL(MPID_STATE_MPI_REDUCE);
        -:  961:
    48907:  962:    MPIR_ERRTEST_INITIALIZED_ORDIE();
        -:  963:    
    48907:  964:    MPIU_THREAD_CS_ENTER(ALLFUNC,);
        -:  965:    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_REDUCE);
        -:  966:
        -:  967:    /* Validate parameters, especially handles needing to be converted */
        -:  968:#   ifdef HAVE_ERROR_CHECKING
        -:  969:    {
        -:  970:        MPID_BEGIN_ERROR_CHECKS;
        -:  971:        {
    48907:  972:	    MPIR_ERRTEST_COMM(comm, mpi_errno);
    48907:  973:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  974:	}
        -:  975:        MPID_END_ERROR_CHECKS;
        -:  976:    }
        -:  977:#   endif /* HAVE_ERROR_CHECKING */
        -:  978:
        -:  979:    /* Convert MPI object handles to object pointers */
    48905:  980:    MPID_Comm_get_ptr( comm, comm_ptr );
        -:  981:
        -:  982:    /* Validate parameters and objects (post conversion) */
        -:  983:#   ifdef HAVE_ERROR_CHECKING
        -:  984:    {
        -:  985:        MPID_BEGIN_ERROR_CHECKS;
        -:  986:        {
    48905:  987:	    MPID_Datatype *datatype_ptr = NULL;
    48905:  988:            MPID_Op *op_ptr = NULL;
        -:  989:            int rank;
        -:  990:	    
    48905:  991:            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
    48905:  992:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -:  993:
    48903:  994:	    if (comm_ptr->comm_kind == MPID_INTRACOMM) {
    47463:  995:		MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno);
        -:  996:
    47463:  997:                MPIR_ERRTEST_COUNT(count, mpi_errno);
    47463:  998:                MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
    47463:  999:                if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
     4953: 1000:                    MPID_Datatype_get_ptr(datatype, datatype_ptr);
     4953: 1001:                    MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
     4953: 1002:                    MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
        -: 1003:                }
        -: 1004:
    47463: 1005:                if (sendbuf != MPI_IN_PLACE)
    47332: 1006:                    MPIR_ERRTEST_USERBUFFER(sendbuf,count,datatype,mpi_errno);
        -: 1007:
    47463: 1008:                rank = comm_ptr->rank;
    47463: 1009:                if (rank == root) {
    21181: 1010:                    MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, count, mpi_errno);
    21181: 1011:                    MPIR_ERRTEST_USERBUFFER(recvbuf,count,datatype,mpi_errno);
    21181: 1012:                    if (count != 0 && sendbuf != MPI_IN_PLACE) {
    21050: 1013:                        MPIR_ERRTEST_ALIAS_COLL(sendbuf, recvbuf, mpi_errno);
        -: 1014:                    }
        -: 1015:                }
        -: 1016:                else
    26282: 1017:                    MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, count, mpi_errno);
        -: 1018:            }
        -: 1019:
    48903: 1020:	    if (comm_ptr->comm_kind == MPID_INTERCOMM) {
     1440: 1021:		MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno);
        -: 1022:
     1440: 1023:                if (root == MPI_ROOT) {
      256: 1024:                    MPIR_ERRTEST_COUNT(count, mpi_errno);
      256: 1025:                    MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
      256: 1026:                    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
    #####: 1027:                        MPID_Datatype_get_ptr(datatype, datatype_ptr);
    #####: 1028:                        MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
    #####: 1029:                        MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
        -: 1030:                    }
      256: 1031:                    MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, count, mpi_errno);
      256: 1032:                    MPIR_ERRTEST_USERBUFFER(recvbuf,count,datatype,mpi_errno);
        -: 1033:                }
        -: 1034:                
     1184: 1035:                else if (root != MPI_PROC_NULL) {
      944: 1036:                    MPIR_ERRTEST_COUNT(count, mpi_errno);
      944: 1037:                    MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
      944: 1038:                    if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
    #####: 1039:                        MPID_Datatype_get_ptr(datatype, datatype_ptr);
    #####: 1040:                        MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
    #####: 1041:                        MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
        -: 1042:                    }
      944: 1043:                    MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, count, mpi_errno);
      944: 1044:                    MPIR_ERRTEST_USERBUFFER(sendbuf,count,datatype,mpi_errno);
        -: 1045:                }
        -: 1046:            }
        -: 1047:
    48903: 1048:	    MPIR_ERRTEST_OP(op, mpi_errno);
        -: 1049:
    48877: 1050:            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
    48846: 1051:            if (HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) {
     8613: 1052:                MPID_Op_get_ptr(op, op_ptr);
     8613: 1053:                MPID_Op_valid_ptr( op_ptr, mpi_errno );
        -: 1054:            }
    48846: 1055:            if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
    40233: 1056:                mpi_errno = 
        -: 1057:                    ( * MPIR_Op_check_dtype_table[op%16 - 1] )(datatype); 
        -: 1058:            }
    48846: 1059:	    if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -: 1060:        }
        -: 1061:        MPID_END_ERROR_CHECKS;
        -: 1062:    }
        -: 1063:#   endif /* HAVE_ERROR_CHECKING */
        -: 1064:
        -: 1065:    /* ... body of routine ...  */
        -: 1066:
    48814: 1067:    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Reduce != NULL)
        -: 1068:    {
    #####: 1069:	mpi_errno = comm_ptr->coll_fns->Reduce(sendbuf, recvbuf, count,
        -: 1070:                                               datatype, op, root, comm_ptr);
        -: 1071:    }
        -: 1072:    else
        -: 1073:    {
    48814: 1074:        if (comm_ptr->comm_kind == MPID_INTRACOMM) {
        -: 1075:            /* intracommunicator */
        -: 1076:#if defined(USE_SMP_COLLECTIVES)
        -: 1077:	    MPID_Op *op_ptr;
        -: 1078:	    int is_commutative; 
        -: 1079:
        -: 1080:	    /* is the op commutative? We do SMP optimizations only if it is. */ 
    47374: 1081:	    if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN)
    38761: 1082:		is_commutative = 1;
        -: 1083:	    else {
     8613: 1084:		MPID_Op_get_ptr(op, op_ptr);
     8613: 1085:		is_commutative = (op_ptr->kind == MPID_OP_USER_NONCOMMUTE) ? 0 : 1;
        -: 1086:	    }
        -: 1087:
    47374: 1088:            if (MPIR_Comm_is_node_aware(comm_ptr) && is_commutative) {
        -: 1089:
    35669: 1090:		void *tmp_buf = NULL;
        -: 1091:		MPI_Aint  true_lb, true_extent, extent; 
    35669: 1092:		MPIU_THREADPRIV_GET;
        -: 1093:
        -: 1094:		/* Create a temporary buffer on local roots of all nodes */
    35669: 1095:		if (comm_ptr->node_roots_comm != NULL) {
        -: 1096:
    12515: 1097:		    MPIR_Nest_incr();
    12515: 1098:		    mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent);  
    12515: 1099:		    MPIR_Nest_decr();
    12515: 1100:		    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    12515: 1101:		    MPID_Datatype_get_extent_macro(datatype, extent);
        -: 1102:
        -: 1103:		    MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
        -: 1104:
    12515: 1105:		    MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
        -: 1106:					mpi_errno, "temporary buffer");
        -: 1107:		    /* adjust for potential negative lower bound in datatype */
    12515: 1108:		    tmp_buf = (void *)((char*)tmp_buf - true_lb);
        -: 1109:		}    
        -: 1110:
        -: 1111:                /* do the intranode reduce on all nodes other than the root's node */
    35669: 1112:                if (comm_ptr->node_comm != NULL &&
        -: 1113:                    MPIU_Get_intranode_rank(comm_ptr, root) == -1) { 
    #####: 1114:		    mpi_errno = MPIR_Reduce_or_coll_fn(sendbuf, tmp_buf, count, datatype,
        -: 1115:					    op, 0, comm_ptr->node_comm);
    #####: 1116:		    if (mpi_errno) goto fn_fail;
        -: 1117:                }
        -: 1118:
        -: 1119:                /* do the internode reduce to the root's node */
    35669: 1120:                if (comm_ptr->node_roots_comm != NULL) {
    12515: 1121:		    if (comm_ptr->node_roots_comm->rank != MPIU_Get_internode_rank(comm_ptr, root)) {
        -: 1122:                        /* I am not on root's node.  Use tmp_buf if we
        -: 1123:                           participated in the first reduce, otherwise use sendbuf */
    #####: 1124:                        void *buf = (comm_ptr->node_comm == NULL ? sendbuf : tmp_buf);
    #####: 1125:                        mpi_errno = MPIR_Reduce(buf, NULL, count, datatype,
        -: 1126:                                                op, MPIU_Get_internode_rank(comm_ptr, root), 
        -: 1127:                                                comm_ptr->node_roots_comm);
        -: 1128:                    }
        -: 1129:		    else { /* I am on root's node. I have not participated in the earlier reduce. */
    12515: 1130:			if (comm_ptr->rank != root) {
        -: 1131:			    /* I am not the root though. I don't have a valid recvbuf.
        -: 1132:                               Use tmp_buf as recvbuf. */
        -: 1133:
     6185: 1134:			    mpi_errno = MPIR_Reduce_or_coll_fn(sendbuf, tmp_buf, count, datatype,
        -: 1135:					    op, MPIU_Get_internode_rank(comm_ptr, root), 
        -: 1136:					    comm_ptr->node_roots_comm);
        -: 1137:
        -: 1138:			    /* point sendbuf at tmp_buf to make final intranode reduce easy */
     6185: 1139:			    sendbuf = tmp_buf;
        -: 1140:			}
        -: 1141:			else {
        -: 1142:			    /* I am the root. in_place is automatically handled. */
        -: 1143:
     6330: 1144:			    mpi_errno = MPIR_Reduce_or_coll_fn(sendbuf, recvbuf, count, datatype,
        -: 1145:					    op, MPIU_Get_internode_rank(comm_ptr, root), 
        -: 1146:					    comm_ptr->node_roots_comm);
        -: 1147:
        -: 1148:			    /* set sendbuf to MPI_IN_PLACE to make final intranode reduce easy. */
     6330: 1149:			    sendbuf = MPI_IN_PLACE;
        -: 1150:			}
        -: 1151:		    }
        -: 1152:
    12515: 1153:                    if (mpi_errno) goto fn_fail;
        -: 1154:                }
        -: 1155:
        -: 1156:                /* do the intranode reduce on the root's node */
    35669: 1157:                if (comm_ptr->node_comm != NULL &&
        -: 1158:                    MPIU_Get_intranode_rank(comm_ptr, root) != -1) { 
    35669: 1159:                    mpi_errno = MPIR_Reduce_or_coll_fn(sendbuf, recvbuf, count, datatype,
        -: 1160:					    op, MPIU_Get_intranode_rank(comm_ptr, root),
        -: 1161:					    comm_ptr->node_comm);
        -: 1162:                }
        -: 1163:
        -: 1164:            }
        -: 1165:            else {
    11705: 1166:		mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count, datatype,
        -: 1167:                                    op, root, comm_ptr); 
        -: 1168:            }
        -: 1169:#else
        -: 1170:            mpi_errno = MPIR_Reduce(sendbuf, recvbuf, count, datatype,
        -: 1171:                                    op, root, comm_ptr); 
        -: 1172:#endif
        -: 1173:	}
        -: 1174:        else {
        -: 1175:            /* intercommunicator */
     1440: 1176:            mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype,
        -: 1177:	      op, root, comm_ptr); 
        -: 1178:        }
        -: 1179:    }
        -: 1180:
    48814: 1181:    if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        -: 1182:
        -: 1183:    /* ... end of body of routine ... */
        -: 1184:    
        -: 1185:  fn_exit:
        -: 1186:#if defined(USE_SMP_COLLECTIVES)
    12515: 1187:    MPIU_CHKLMEM_FREEALL();
        -: 1188:#endif
        -: 1189:    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_REDUCE);
    48907: 1190:    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    48907: 1191:    return mpi_errno;
        -: 1192:
       93: 1193:  fn_fail:
        -: 1194:    /* --BEGIN ERROR HANDLING-- */
        -: 1195:#   ifdef HAVE_ERROR_CHECKING
        -: 1196:    {
       93: 1197:    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, 
        -: 1198:				     FCNAME, __LINE__, MPI_ERR_OTHER,
        -: 1199:	"**mpi_reduce", "**mpi_reduce %p %p %d %D %O %d %C", sendbuf, recvbuf, 
        -: 1200:				     count, datatype, op, root, comm);
        -: 1201:    }
        -: 1202:#   endif
       93: 1203:    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
       93: 1204:    goto fn_exit;
        -: 1205:    /* --END ERROR HANDLING-- */
        -: 1206:}