% Papers using MPI
%
% This is a partial list, begun in late October, 1997.  It is intended to give
% an example of the range of applications that are known to be using
% MPI.  
% Note that some author lists are incomplete; if you have a more
% complete reference, please send it to gropp at mcs.anl.gov .

% String macros for multi-month ranges and seasons.  Each macro expands to
% the literal text shown on its right-hand side.
@string{apr-jul = {April-July}}
@string{aug-sep = {August-September}}
@string{may-jun = {May-June}}
@string{jul-aug = {July-August}}
@string{jan-feb = {Jan.-Feb.}}
@string{spr     = {Spring}}
@string{feb-mar = {Feb.-March}}
@string{nov-dec = {Nov.-Dec.}}
@string{win     = {Winter}}
@string{jun-jul = {June-July}}
@string{jul-oct = {July-Oct.}}
@string{mar-apr = {Mar.-April}}
@string{sep-oct = {Sept.-Oct.}}

% Macros for the Lecture Notes in Computer Science series name and for
% Springer.  The inner braces protect the capital letters from being
% lowercased by a bibliography style.
@string{lncs = {{L}ecture {N}otes in {C}omputer {S}cience}}
@string{sv   = {{S}pringer}}

% Permutation representations for matrix groups; the parallel runs used a
% MasPar MP-1 and 8 SPARC-2's under MPI.
@Article{CooFinTseYor97:mpi-groups,
  author   = {G. Cooperman and L. Finkelstein and M. Tselman and B. York},
  title    = {Constructing permutation representations for matrix groups},
  journal  = {Journal of Symbolic Computation},
  year     = 1997,
  volume   = 24,
  number   = {3--4},
  month    = sep-oct,
  pages    = {471--488},
  abstract = {The theory has been successfully tested on a representation of
        the sporadic simple group Ly, discovered by Lyons (1972). With
        no a priori assumptions, we find a permutation representation of
        degree 9606125 on a conjugacy class of subgroups of order 3,
        find the order of the resulting permutation group, and verify
        simplicity. A Monte Carlo variation of the algorithm was used to
        achieve better space and time efficiency. The construction of
        the permutation representation required four CPU days on a
        SPARC-server 670MP with 64 MB. The permutation representation
        was used implicitly in the sense that the group element was
        stored as a matrix, and its permutation action on a ``point''
        was determined using a pre-computed data structure. Thus,
        additional computations required little additional space. The
        algorithm has also been implemented using the MasPar MP-1 SIMD
        parallel computer and 8 SPARC-2's running under MPI. The results
        of those parallel experiments are briefly reviewed.}
}


% Finite-volume Runge-Kutta electromagnetic scattering code run on the SP-2
% with MPL/MPI.
@article{AhuLon97:mpi-rk-scattering,
  author   = {V. Ahuja and L. N. Long},
  title    = {A parallel finite-volume {R}unge-{K}utta algorithm for
              electromagnetic scattering},
  journal  = {Journal of Computational Physics},
  volume   = {137},
  number   = {2},
  pages    = {299--320},
  month    = nov,
  year     = {1997},
  abstract = {A 3D explicit finite volume algorithm has been developed to
        simulate scattering from complex geometries on parallel
        computers using structured body conformal curvilinear grids.
        Most simulations for practical 3D geometries require a large
        number of grid points for adequate spatial resolution making
        them suitable to parallel computation. The simulations have been
        carried out using a multi-block/zonal approach in the message
        passing paradigm on the SP-2. Each zone is placed on a separate
        processor and interprocessor communication is carried out using
        the Message Passing Library/Interface (MPL/MPI). Integration of
        Maxwell's equations is performed using the four-stage
        Runge-Kutta time integration method on a dual grid. This method
        of integrating on a staggered grid gives enhanced dissipative
        and dispersive characteristics. A scattered field formulation
        has been used and the Liao boundary condition is used at the
        outer nonreflecting boundary. The far zone transformation has
        also been implemented efficiently, using specialized MPL
        functions to evaluate the far zone scattering results. Results
        show extremely good comparisons for scattering from the sphere
        and the ogive with the exact solution and standard FDTD type
        algorithms. Comparisons for nonaxisymmetric targets like the
        NASA almond with experimental data has also been found to be
        extremely good.}
}

% Derivation of a generic MPI implementation of a data-parallel skeleton,
% applied to FFT.
@Article{GorBi98,
  author   = {S. Gorlatch and H. Bischof},
  title    = {A Generic {MPI} Implementation for a Data-Parallel Skeleton:
              Formal Derivation and Application to {FFT}},
  journal  = {Parallel Processing Letters},
  year     = 1998,
  volume   = 8,
  number   = 4,
  month    = dec,
  pages    = {447--458},
  abstract = {We derive a provably correct, architecture-independent family
        of parallel implementations for a class of data-parallel
        algorithms, called DH (distributable homomorphisms). The
        implementations are well-structured SPMD programs with
        group-wise personalized all-to-all exchange, directly
        realizable in MPI. As a case study, we systematically adjust
        the mathematical specification of the Fast Fourier Transform
        (FFT) to the DH format and, thereby, obtain a generic SPMD
        implementation for FFT. The target program includes FFT
        solutions used in practice -- the binary-exchange and the
        2D- and 3D-transpose -- as special cases.}
}
% Groundwater contaminant transport simulation parallelized with MPI.
@Article{YevCinZhu98:mpi-groundwatersim,
  author   = {G. Yevi and P. Cinnella and X. Zhuang},
  title    = {On parallelizing a groundwater pollution simulator},
  journal  = {Applied Mathematics and Computation},
  year     = 1998,
  volume   = 89,
  number   = {1--3},
  month    = jan-feb,
  pages    = {313--325},
  abstract = {Domain decomposition strategies and computational mesh
        reordering are discussed for finite difference parallel
        simulations of groundwater contaminants transport. The parallel
        performance of point iterative methods traditionally used in
        groundwater pollution modelling is studied. The algorithms were
        implemented with red-black and wavefront reordering of the
        computational mesh. A standard conservative transport equation
        defined on a two-dimensional grid with Dirichlet boundary
        conditions was used for the analysis. Completely portable
        multiple instructions multiple data (MIMD) implementations of
        the algorithm were performed using message-passing interface
        (MPI). The runtimes of the algorithms are presented as a
        function of grid refinement and number of processors, and the
        communication overhead of the parallel simulation process is
        investigated, showing that the red-black reordering technique
        yields the best performance results. The method also provides
        higher efficiency and scalability when applied to large-scale
        problems. Optimal parameters are suggested for parallel
        simulation of groundwater pollution using finite difference
        schemes.}
}
 


% Reduce-scatter algorithms analysed in the LogGP model; reduce-scatter is a
% collective operation in the MPI standard.
@Article{Ian97:mpi-reducescatter,
  author   = {G. Iannello},
  title    = {Efficient algorithms for the reduce-scatter operation in
              {LogGP}},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = 1997,
  volume   = 8,
  number   = 9,
  month    = sep,
  pages    = {970--982},
  abstract = {We consider the problem of efficiently performing a
        reduce-scatter operation in a message passing system.
        Reduce-scatter is the composition of an element-wise reduction
        on vectors of n elements initially held by n processors, with a
        scatter of the resulting vector among the processors. In this
        paper, we present two algorithms for the reduce-scatter
        operation, designed in LogGP. The first algorithm assumes an
        associative and commutative reduction operator and it is optimal
        in LogGP within a small constant factor. The second algorithm
        allows the reduction operator to be noncommutative, and it is
        asymptotically optimal when values to be combined are large
        arrays. To achieve these results, we developed a complete
        analysis of both algorithms in LogGP, including the derivation
        of lower bounds for the reduce-scatter operation, and the study
        of the m-item version of the problem, i.e., the case when the
        initial elements are vectors themselves. Reduce-scatter has been
        included as a collective operation in the MPI standard message
        passing library, and can be used, for instance, in parallel
        matrix-vector multiply when the matrix is decomposed by columns.
        To model a message passing system, we adopted the LogGP model,
        an extension of LogP that allows the modeling of messages of
        different length. While this choice makes the analysis somewhat
        more complex, it leads to more realistic results in the case of
        gather/scatter algorithms.}
}

 

% Bisect load-balancing tool and a P3M simulation on the Cray T3D, using
% shared memory operations and MPI communications.
@Article{YuaSalBalMel97:mpi-load-balancing,
  author   = {X. Yuan and G. Salisbury and D. Balsara and R. Melhem},
  title    = {A load balancing package on distributed memory systems and
              its application to particle-particle particle-mesh ({P3M})
              methods},
  journal  = {Parallel Computing},
  year     = 1997,
  volume   = 23,
  number   = 10,
  month    = nov,
  pages    = {1525--1544},
  abstract = {We present a tool, Bisect, for balanced decomposition of spatial
        domains. In addition to applying a nested bisection algorithm to
        determine the boundaries of each subdomain, Bisect replicates a
        user specified zone along the boundaries of the subdomain in
        order to minimize future interactions between subdomains.
        Results of running the tool on the Cray T3D system using both
        shared memory operations and MPI communications are reported and
        discussed. In addition, Bisect is used in a parallel
        implementation of a particle-particle/particle-mesh (P3M)
        simulation program on the Cray T3D system. The performance of
        the P3M program with different load-balancing criteria is
        evaluated and compared. The results show that the use of the
        Bisect package balances the load efficiently and minimizes
        communication on the T3D massively parallel system.}
}

 
% HPF/MPI coordination library for task parallelism in data-parallel
% programs.
@Article{FosKohKriCho97:mpi-task-parallel,
  author   = {I. Foster and D. R. Kohr and R. Krishnaiyer and
              A. Choudhary},
  title    = {A library-based approach to task parallelism in a
              data-parallel language},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1997,
  volume   = 45,
  number   = 2,
  month    = sep,
  pages    = {148--158},
  abstract = {Pure data-parallel languages such as High Performance Fortran
        version 1 (HPF) do not allow efficient expression of mixed
        task/data-parallel computations or the coupling of separately
        compiled data-parallel modules. In this paper, we show how these
        common parallel program structures can be represented, with only
        minor extensions to the HPF model, by using a coordination
        library based on the Message Passing Interface (MPI). This
        library allows data-parallel tasks to exchange distributed data
        structures using calls to simple communication functions. We
        present microbenchmark results that characterize the performance
        of this library and that quantify the impact of optimizations
        that allow reuse of communication schedules in common
        situations. In addition, results from two-dimensional FFT,
        convolution, and multiblock programs demonstrate that the
        HPF/MPI library can provide performance superior to that of pure
        HPF. We conclude that this synergistic combination of two
        parallel programming standards represents a useful approach to
        task parallelism in a data-parallel framework, increasing the
        range of problems addressable in HPF without requiring complex
        compiler technology.}
}


% PLUS communication interface linking PVM, MPI and PARIX.
@Article{BruGehRei97:mpi-resource-mgmt,
  author   = {M. Brune and J. Gehring and A. Reinefeld},
  title    = {Heterogeneous message passing and a link to resource
              management},
  journal  = {Journal of Supercomputing},
  year     = 1997,
  volume   = 11,
  number   = 4,
  pages    = {355--369},
  abstract = {PLUS is a light-weight, extensible and efficient communication
        interface. With only four commands, PLUS is almost transparent
        to the application code. Our current implementation supports
        inter-process communication between PVM, MPI and PARIX, but it
        can be easily extended to other vendor-specific message passing
        libraries. As PLUS has been designed for wide area networks,
        much effort has been spent on portability and on optimizing the
        communication speed across internet and also intranet links.}
}
 

% NEC SX-4 multinode system, including MPI/SX and NQS/MPI.
@article{Hor97,
  author   = {K. Hori},
  title    = {Supercomputer {SX-4} multinode system},
  journal  = {NEC Research \& Development},
  volume   = {38},
  number   = {4},
  pages    = {461--473},
  year     = {1997},
  abstract = {The NEC supercomputer SX-4 multinode system series consists of
        two models, one being HIPPI (High Performance Parallel
        Interface)-connected model and the other IXS (Internode Crossbar
        Switch)-connected model. With the IXS, a proprietary high-speed
        crossbar switch, the HPC (High Performance Computing) up to 1
        TFLOPS (Tera Flops) has been enabled by providing the most
        comprehensive environment for distributed parallel processing.
        This also means the world's first implementation of a clustered
        parallel processing. In this paper, we describe the functions of
        IXS hardware, the new operating system functions, MPI/SX the MPI
        (Message Passing Interface) processor and NQS/MPI which supports
        the close cooperation between NQS (Network Queuing System) batch
        processing system and MPI.}
}


% Ensemble-based simulated annealing with dynamic load balancing under MPI.
@article{Fac97:mpi-load-balance,
  author   = {A. Fachat and K. H. Hoffmann},
  title    = {Implementation of ensemble-based simulated annealing with
              dynamic load balancing under {MPI}},
  journal  = {Computer Physics Communications},
  volume   = {107},
  number   = {1--3},
  pages    = {49--53},
  month    = dec,
  year     = {1997},
  abstract = {This paper describes an implementation of Ensemble Based
        Simulated Annealing (EBSA) with dynamic load balancing. It is
        running under the MPI Message Passing Library allowing parallel
        execution on various types of computers. The load balancing is
        used to get maximum use of the available processing power, even
        on heterogeneous workstation clusters where the machines differ
        a lot in computing power.}
}


% Wavelength parallelization of the PHOENIX stellar atmosphere program using
% MPI library calls.
@Article{BarHau98:mpi-app,
  author   = {E. Baron and P. H. Hauschildt},
  title    = {Parallel implementation of the {PHOENIX} generalized stellar
              atmosphere program. {II}. Wavelength parallelization},
  journal  = {Astrophysical Journal},
  year     = 1998,
  volume   = 495,
  number   = {1 part 1},
  month    = mar,
  pages    = {370--376},
  abstract = {We describe an important addition to the parallel implementation
        of our generalized nonlocal thermodynamic equilibrium (NLTE)
        stellar atmosphere and radiative transfer computer program
        PHOENIX. In a previous paper in this series we described data
        and task parallel algorithms we have developed for radiative
        transfer, spectral line opacity, and NLTE opacity and rate
        calculations. These algorithms divided the work spatially or by
        spectral lines, that is, distributing the radial zones,
        individual spectral lines, or characteristic rays among
        different processors and employ, in addition, task parallelism
        for logically independent functions (such as atomic and
        molecular line opacities). For finite, monotonic velocity
        fields, the radiative transfer equation is an initial value
        problem in wavelength, and hence each wavelength point depends
        upon the previous one. However, for sophisticated NLTE models of
        both static and moving atmospheres needed to accurately
        describe, e.g., novae and supernovae, the number of wavelength
        points is very large (200,000-300,000) and hence parallelization
        over wavelength can lead both to considerable speedup in
        calculation time and the ability to make use of the aggregate
        memory available on massively parallel supercomputers. Here, we
        describe an implementation of a pipelined design for the
        wavelength parallelization of PHOENIX, where the necessary data
        from the processor working on a previous wavelength point is
        sent to the processor working on the succeeding wavelength point
        as soon as it is known. Our implementation uses a MIMD design
        based on a relatively small number of standard message passing
        interface (MPI) library calls and is fully portable between
        serial and parallel computers.}
}

% Complex-flow simulation developed on the Intel Paragon with PVM and MPI
% extensions.
@Article{Yas98:complex-flows,
  author   = {O. Yasar},
  title    = {A scalable model for complex flows},
  journal  = {Computers and Mathematics with Applications},
  year     = 1998,
  volume   = 35,
  number   = 7,
  month    = apr,
  pages    = {117--128},
  abstract = {We describe a scalable parallel algorithm for numerical
        simulations of turbulent, radiative, magnetized, and reactive
        fluid + particle systems on message-passing distributed-memory
        computers. Accurate simulation of such complex flows has
        applications in engine combustion, industrial pulverized coal
        burners, astrophysics, inertial confinement fusion, nuclear
        systems, and many other strategically and economically important
        areas. Our algorithm has been developed based on a widely-used
        combustion code KIVA-3, a plasma and radiation hydrodynamics
        code R-MHD, a classical particle dynamics code CMDT, and a
        discrete ordinates particle transport code TORT. The development
        is being done on the Intel Paragon with PVM and MPI extensions.
        We report high levels of parallel efficiency and scalability (up
        to 1024 nodes) for a baseline engine test case, using our
        current message-passing reactive and turbulent flow code. The
        three-dimensional extension of radiation magnetohydrodynamics
        component is still being worked at and we hope to report further
        progress in the future.}
}


% Reactive-flow simulation code on the Intel Paragon; NX, MPI and PVM are
% all applied.
@Article{LepSchHei98:reactive-flow,
  author   = {J. Lepper and U. Schnell and K. R. G. Hein},
  title    = {Parallelization of a simulation code for reactive flows on
              the {Intel} {Paragon}},
  journal  = {Computers and Mathematics with Applications},
  year     = 1998,
  volume   = 35,
  number   = 7,
  month    = apr,
  pages    = {101--109},
  abstract = {The paper shows the implementation of a 3D simulation code for
        turbulent flow and combustion processes in full-scale utility
        boilers on an Intel Paragon XP/S computer. For the portable
        parallelization, an explicit approach is chosen using a domain
        decomposition method for the static subdivision of the numerical
        grid together with the SPMD programming model. The measured
        speedup for the presented case using a coarse grid is good,
        although some numerical requirements restrict the implemented
        message passing to strongly synchronized communication. On the
        Paragon, the NX message passing library is used for the
        computations. Furthermore, MPI and PVM are applied and their
        pros and cons on this computer are described. In addition to the
        basic message passing techniques for local and global
        communication, other possibilities are investigated. Besides the
        applicability of the vectorizing capability of the compiler, the
        influence of the I/O performance during computations is
        demonstrated. The scalability of the parallel application is
        presented for a refined discretization.}
}


% Divide-and-conquer skeletons with FFT as the case study (no abstract
% recorded).
@Article{Gor98:fft,
  author   = {S. Gorlatch},
  title    = {Programming with divide-and-conquer skeletons: A case study
              of {FFT}},
  journal  = {Journal of Supercomputing},
  year     = 1998,
  volume   = 12,
  number   = {1--2},
  pages    = {85--97},
}

% QCDMPI: QCD Monte Carlo code with MPI, compared on several parallel
% machines.
@Article{Hio98:qcd,
  author   = {S. Hioki},
  title    = {{QCDMPI}---pure {QCD} {Monte} {Carlo} Simulation code with
              {MPI}},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  year     = 1998,
  volume   = 63,
  month    = apr,
  pages    = {1000--1002},
  abstract = {In this paper, outline of QCDMPI is reported. Comparison of the
        performances on several parallel machines; AP1000, AP1000+,
        AP3000, Cenju-3, Paragon, SR2201 and Workstation Cluster, is
        also reported.}
}
 

% Evaluation of MPI by rewriting parallel programs.
% NOTE(review): the author is entered in comma form on the understanding that
% the surname is the compound "Brinch Hansen" -- confirm against the paper.
@Article{Han98:mpi-eval,
  author   = {Brinch Hansen, P.},
  title    = {An evaluation of the message-passing interface},
  journal  = {ACM SIGPLAN Notices},
  year     = 1998,
  volume   = 33,
  number   = 3,
  month    = mar,
  pages    = {65--72},
  abstract = {The Message-Passing Interface (MPI) is evaluated by rewriting
        message parallel programs for Householder reduction, matrix
        multiplication, and successive overrelaxation. The author
        concludes that MPI is a practical programming tool. It does,
        however, lack the elegance and security that can only be
        achieved by a parallel programming language.}
}
 

% Preconditioners for a parallel implicit Navier-Stokes solver; results are
% from the Cray T3D under MPI.
@Article{Iss98:cfd-precond,
  author   = {E. Issman},
  title    = {Non-overlapping preconditioners for a parallel implicit
              {Navier-Stokes} solver},
  journal  = {Future Generation Computer Systems},
  year     = 1998,
  volume   = 13,
  number   = {4--5},
  month    = mar,
  pages    = {303--313},
  abstract = {Parallel implicit iterative solution techniques are considered
        for application to a compressible hypersonic Navier-Stokes
        solver on unstructured meshes. The construction of parallel
        preconditioners with quasi-optimal convergence properties with
        respect to their serial counterpart is a key issue in the design
        of modern parallel implicit schemes. Two types of
        non-overlapping preconditioners are presented and compared. The
        first one is an additive Schwarz preconditioner requiring
        overlapping of the mesh and the second one is based on a Schur
        complement formulation. Both are using incomplete LU
        factorisation at the subdomain level but scale differently.
        Results are presented for computations on the Cray T3D under the
        message passing interface MPI.}
}


% MOSIX cluster operating system; its execution environment targets
% applications written in PVM or MPI.
@Article{Bar98:migration,
  author   = {A. Barak},
  title    = {The {MOSIX} multicomputer operating system for high
              performance cluster computing},
  journal  = {Future Generation Computer Systems},
  year     = 1998,
  volume   = 13,
  number   = {4--5},
  month    = mar,
  pages    = {361--372},
  abstract = {The scalable computing cluster at Hebrew University consists of
        88 Pentium II and Pentium-Pro servers that are connected by fast
        Ethernet and the Myrinet LANs. It is running the MOSIX operating
        system, an enhancement of BSD/OS with algorithms for adaptive
        resource sharing, that are geared for performance scalability in
        a scalable computing cluster. These algorithms use a preemptive
        process migration for load-balancing and memory ushering, in
        order to create a convenient multiuser time-sharing execution
        environment for HPC, particularly for applications that are
        written in PVM or MPI. This paper begins with a brief overview
        of MOSIX and its resource sharing algorithms. Then the paper
        presents the performance of these algorithms as well as the
        performance of several large-scale, parallel applications.}
}
 

% PLUS interface for communication between PVM, MPI and PARIX.
@article{Rei97:interop,
  author   = {A. Reinefeld},
  title    = {Communicating across parallel message-passing environments},
  journal  = {Journal of Systems Architecture},
  volume   = {44},
  number   = {3--4},
  pages    = {261--272},
  month    = dec,
  year     = {1997},
  abstract = {We present a small, extensible interface for the transparent
        communication between vendor-specific and standard
        message-passing environments. With only four new commands,
        existing parallel applications can make use of our PLUS
        communication interface, thereby allowing inter-process
        communication with other programming environments. Much effort
        has been spent in optimizing the communication speed across
        Internet and Intranet links. Our current implementation supports
        process communication between PVM, MPI, and PARIX. With only
        marginal additional effort, the interface can be adapted to
        support other message-passing environments as well.}
}
 
% Parallel approximation algorithm for maxcut, implemented with MPI.
@Article{hom97:mpi-maxcup,
  author   = {S. Homer},
  title    = {Design and performance of parallel and distributed
              approximation algorithms for maxcut},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1997,
  volume   = 41,
  number   = 1,
  pages    = {48--61},
  month    = oct,
  abstract = {We develop and experiment with a new parallel algorithm to
        approximate the maximum weight cut in a weighted undirected
        graph. Our implementation starts with the recent (serial)
        algorithm of Goemans and Williamson for this problem. We
        consider several different versions of this algorithm, varying
        the interior-point part of the algorithm in order to optimize
        the parallel efficiency of our method. Our work aims for an
        efficient, practical formulation of the algorithm with
        close-to-optimal parallelization. We analyze our parallel
        algorithm in the LogP model and predict linear speedup for a
        wide range of the parameters. We have implemented the algorithm
        using the message passing interface (MPI) and run it on several
        parallel machines. In particular, we present performance
        measurements on the IBM SP2, the Connection Machine CM5, and a
        cluster of workstations. We observe that the measured speedups
        are predicted well by our analysis in the LogP model. Finally,
        we test our implementation on several large graphs (up to 13,000
        vertices), particularly on large instances of the Ising model.}
}
 
% ParaStation cluster fabric; its programming interfaces include PVM, P4 and
% MPI.
@Article{War:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {{ParaStation}: Efficient parallel computing by
              clustering workstations: Design and evaluation},
  journal  = {Journal of Systems Architecture},
  year     = 1997,
  volume   = 44,
  number   = {3--4},
  pages    = {241--260},
  month    = dec,
  abstract = {ParaStation is a communications fabric for connecting
        off-the-shelf workstations into a supercomputer. The fabric
        employs technology used in massively parallel machines and
        scales up to 4096 nodes. ParaStation's user-level message
        passing software preserves the low latency of the fabric by
        taking the operating system out of the communication path, while
        still providing full protection in a multiprogramming
        environment. The programming interface presented by ParaStation
        consists of a UNIX socket emulation and widely used parallel
        programming environments such as PVM, P4, and MPI.
        Implementations of ParaStation using various platforms, such as
        Digital's AlphaGeneration workstations and Linux PCs, achieve
        end-to-end (process-to-process) latencies as low as 2 $\mu$s and a
        sustained bandwidth of up to 15 Mbyte/s per channel, even with
        small packets. Benchmarks using PVM on ParaStation demonstrate
        real application performance of 1 GFLOP on an 8-node cluster.}
}
 
% ParaStation project: workstation clusters for parallel computing, with
% PVM, P4 and MPI among the programming interfaces.
@Article{War98:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {The {ParaStation} project: Using workstations as
              building blocks for parallel computing},
  journal  = {Information Sciences},
  year     = 1998,
  volume   = 106,
  number   = {3--4},
  pages    = {277--292},
  month    = may,
  abstract = {The ParaStation communication fabric provides a high-speed
        communication network with user-level access to enable efficient
        parallel computing on workstation clusters. The architecture,
        implemented on off-the-shelf workstations coupled by the
        ParaStation communication hardware, removes the kernel and
        common network protocols from the communication path while still
        providing full protection in a multiuser, multiprogramming
        environment. The programming interface presented by ParaStation
        consists of a UNIX socket emulation and widely used parallel
        programming environments such as PVM, P4, and MPI. This allows
        porting a wide range of client/server and parallel applications
        to the ParaStation architecture. Implementations of ParaStation
        using various platforms, such as Digital's AlphaGeneration
        workstations and Linux PCs, achieve end-to-end
        (process-to-process) latencies as low as 2 $\mu$s and a sustained
        bandwidth of up to 15 Mbyte/s per channel with small packets.
        Benchmarks using PVM on ParaStation demonstrate real application
        performance of 1 GFLOP on an 8-node cluster.}
}

% Selective-MPI (S-MPI): scheduling of MPI applications on networks of
% workstations.
@article{Dan98:mpi-scheduling,
  author   = {M. A. R. Dantas},
  title    = {Efficient scheduling of {MPI} applications on
              networks of workstations},
  journal  = {Future Generation Computer Systems},
  volume   = {13},
  number   = {6},
  pages    = {489--499},
  month    = may,
  year     = {1998},
  abstract = {The availability of a large number of workstations connected
        through a network can represent an attractive option for
        high-performance computing for many applications. The
        message-passing interface (MPI) software environment is an
        effort from many organisations to define a de facto
        message-passing standard. In other words, the original
        specification was not designed as a comprehensive parallel
        programming environment and some researchers agree that the
        standard should be preserved as simple and clean as possible.
        Nevertheless, a software environment such as MPI should have
        somehow a scheduling mechanism for the effective submission of
        parallel applications on network of workstations. This paper
        presents an alternative lightweight approach called
        Selective-MPI (S-MPI), which was designed to enhance the
        efficiency of the scheduling of applications on an MPI
        implementation environment.}
}
 
% Para++: C++ message-passing interface built on top of PVM and MPI.
@Article{Cou98:mpi-c++,
  author   = {O. Coulaud},
  title    = {Para++: A high level {C++} interface for message passing},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1998,
  volume   = 51,
  number   = 1,
  pages    = {46--62},
  month    = may,
  abstract = {This paper describes a high level C++ interface for message
        passing applications. Our interface is built on top of PVM and
        MPI. The two main contributions are to allow a quicker design of
        parallel applications without any important drop of
        performances. We introduce two levels of tasks and use C++
        streams for communications. We also present a performance study
        over both PVM and MPI to show the overhead of our
        implementation. Finally, we detail two applications based on the
        heat equation to explain how Para++ can be used for SPMD and
        MPMD applications.}
}
 

% Genetic-programming symbolic regression tool, parallelised with MPI
% using a random islands model; results reported on an IBM SP2.
@Article{Sal98:mpi-genetic,
  author = 	 {A. Salhi},
  title = 	 {Parallel implementation of a genetic-programming
                  based tool for symbolic regression}, 
  journal = 	 {Information Processing Letters},
  year = 	 1998,
  volume =	 66,
  number =	 6,
  pages =	 {299--307},
  month =	 JUN,
  abstract = {We report on a parallel implementation of a tool for symbolic
        regression, the algorithmic mechanism of which is based on
        genetic programming, and communication is handled using MPI. The
        implementation relies on a random islands model (RIM), which
        combines both the conventional islands model where migration of
        individuals between islands occurs periodically and niching
        where no migration takes place. The system was designed so that
        the algorithm is synergistic with parallel/distributed
        architectures, and works to make use of processor time and
        minimum use of network bandwidth without complicating the
        sequential algorithm significantly. Results on an IBM SP2 are
        included.}
}
 
% Parallel finite element solution (MPI + PETSc) for electronic states in
% quantum corrals.
% NOTE(review): pages previously read 57--51 (end before start); 47--51 is
% the presumed intended range -- confirm against the journal issue.
@Article{Har98:mpi-application,
  author = 	 {H. K. Harbury},
  title = 	 {Parallel computation for electronic waves in quantum
                  corrals}, 
  journal = 	 {VLSI Design},
  year = 	 1998,
  volume =	 6,
  number =	 {1--4},
  pages =	 {47--51},
  abstract = {Recent scanning tunneling microscopy (STM) studies on the (111)
        faces of noble metals have directly imaged electronic
        surface-confined states and dramatic standing-wave patterns have
        been observed [1,2]. We solve for the local density of
        electronic states in these ``leaky'' quantum corral confinement
        structures using a coherent elastic scattering theory. We seek
        solutions of the two-dimensional Schr{\"o}dinger equation compatible
        with non-reflecting boundary conditions which asymptotically
        satisfy the Sommerfeld radiation condition [11,14]. The large
        matrices generated by the discretization of realistic quantum
        corral structures require the use of sparse matrix methods. In
        addition, a parallel finite element solution was undertaken
        using the message passing interface standard (MPI) and the
        Portable, Extensible, Toolkit for Scientific Computation (PETSc)
        [5] for an efficient computational solution on both distributed
        and shared memory architectures. Our calculations reveal
        excellent agreement with the reported experimental dI/dV STM
        data.}
}

% Hybrid MoM/GMT electromagnetic scattering analysis; the two processes run
% in parallel and communicate through MPI.
@article{Jak98:mpi-application,
  author   = {U. Jakobus},
  title    = {Analysis of electromagnetic scattering problems by
              an iterative combination of {MoM} with {GMT} using {MPI}
              for the communication},
  journal  = {Microwave and Optical Technology Letters},
  year     = {1998},
  volume   = {19},
  number   = {1},
  pages    = {1--4},
  month    = sep,
  abstract = {A hybrid method is proposed combining the method of moments
    (MoM) with the generalized multipole technique (GMT) for the
    efficient analysis of electromagnetic radiation and scattering
    problems involving metallic as well as dielectric bodies. An
    iterative coupling scheme is applied so that only some small
    changes to the MoM and GMT formulations are required, making it
    very attractive for the combination of already existing MoM and
    GMT codes. During the iteration, the MoM and GMT processes are
    executed in parallel, and communication is done using the
    message-passing interface (MPI).}
}

% LAURA Navier-Stokes solver adapted to distributed memory with MPI and
% domain decomposition.
@article{Ril98:mpi-application,
  author   = {C. J. Riley},
  title    = {Distributed-memory computing with the {L}angley
              {A}erothermodynamic {U}pwind {R}elaxation
              {A}lgorithm {(LAURA)}},
  journal  = {Advances in Engineering Software},
  year     = {1998},
  volume   = {29},
  number   = {3--6},
  pages    = {317--324},
  month    = apr-jul,
  abstract = {The Langley Aerothermodynamic Upwind Relaxation Algorithm
    (LAURA), a Navier-Stokes solver, has been modified for use in a
    parallel, distributed-memory environment using the
    Message-Passing Interface (MPI) standard. A standard domain
    decomposition strategy is used in which the computational domain
    is divided into subdomains with each subdomain assigned to a
    processor. Performance is examined on dedicated parallel
    machines and a network of desktop workstations. The effect of
    domain decomposition and frequency of boundary updates on
    performance and convergence is also examined for several
    realistic configurations and conditions typical of large-scale
    computational fluid dynamic analysis.}
}
 

% Parallel multigrid finite volume code for 3-D thermal convection, built on
% domain decomposition and MPI.
@article{Wan98:mpi-application,
  author   = {P. Wang},
  title    = {Massively parallel finite volume computation of
              three-dimensional thermal convective flows},
  journal  = {Advances in Engineering Software},
  year     = {1998},
  volume   = {29},
  number   = {3--6},
  pages    = {307--315},
  month    = apr-jul,
  abstract = {A parallel implementation of the finite volume method for
    three-dimensional, time-dependent, thermal convective flows is
    presented. The algebraic equations resulting from the finite
    volume discretization are solved by a parallel multigrid method.
    A flexible parallel code has been implemented on
    distributed-memory systems, by using domain decomposition
    techniques and the MPI communication software. The code uses
    one-, two- or three-dimensional partition according to different
    geometries. It currently runs on the Intel Paragon, the Cray
    T3D, T3E, the IBM SP2 and the Beowulf systems, which can be
    ported easily to other parallel systems. A comparison of the
    wallclock time of the code between these systems is made, and
    code performances with respect to different numbers of
    processors are presented.}
}
 
% Explicit nonlinear dynamic finite element code written in FORTRAN 90 with
% MPI for all interprocessor communication.
@article{Dan98:mpi-application,
  author   = {K. T. Danielson},
  title    = {Nonlinear dynamic finite element analysis on
              parallel computers using {FORTRAN} 90 and {MPI}},
  journal  = {Advances in Engineering Software},
  year     = {1998},
  volume   = {29},
  number   = {3--6},
  pages    = {179--186},
  month    = apr-jul,
  abstract = {A nonlinear explicit dynamic finite element code for use on
    scalable computers is presented. The code was written entirely
    in FORTRAN 90, but uses MPI for all interprocessor
    communication. Although MPI is not formally a standard for
    FORTRAN 90, the code runs properly in parallel on CRAY T3E, IBM
    SP, and SGI ORIGIN 2000 computing systems. Issues regarding the
    installation, portability, and effectiveness of the FORTRAN
    90-MPI combination on these machines are discussed. An algorithm
    that overlaps message passing and computations of the explicit
    finite element equations is also presented and evaluated.
    Several large-scale ground-shock analyses demonstrate the
    varying combined importance of load balance and interprocessor
    communication among the different computing platforms. The
    analyses were performed on only a few to hundreds of processors
    with excellent speedup and scalability.}
}
 

% Port of the TLNS3D CFD code to distributed memory using MPI.
% NOTE(review): this entry has no pages field -- look up and add.
@Article{Vat98:mpi-application,
  author = 	 {V. N. Vatsa},
  title = 	 {Viscous flow computations for complex geometries on
                  parallel computers}, 
  journal = 	 {Advances in Engineering Software},
  year = 	 1998,
  volume =	 29,
  number =	 {3--6},
  month =	 APR-JUL,
  abstract = {A widely used computational fluid dynamics (CFD) code known as
        TLNS3D, which was developed for large, shared-memory computers,
        is ported to a distributed computing environment. An engineering
        approach is used here to parallelize this code so that minimal
        deviation from the original (non-parallel) code is incurred. A
        natural partitioning along grid blocks is adopted in which one
        or more blocks are distributed to each of the available
        processors. An automatic, static load-balancing strategy is
        employed for equitable distribution of computational work to
        specified processors. The message passing interface (MPI)
        protocols are incorporated for data communication. Both
        synchronous and asynchronous communication modes have been
        incorporated. As the number of processors is increased, the
        asynchronous communication mode shows much better scalability
        and clearly outperforms the synchronous mode of
        communication.}
}

% Genetic algorithm for circulant Euclidean distance matrices, with an
% MPI-based parallel implementation.
@Article{Riv98:mpi-application,
  author = 	 {W. Rivera-Gallego},
  title = 	 {A genetic algorithm for circulant {Euclidean} distance
                  matrices}, 
  journal = 	 {Applied Mathematics and Computation},
  year = 	 1998,
  volume =	 97,
  number =	 {2--3},
  pages =	 {197--208},
  month =	 DEC,
  abstract = {This paper presents a fast genetic algorithm to determine
        three-dimensional configurations of points that generate
        circulant Euclidean Distance Matrices (EDMs). A parallel
        implementation is possible by using the message passing
        interface (MPI) standard. In addition, theoretical results about
        the polyhedral structure of both the cone of circulant symmetric
        positive semidefinite matrices and the cone of circulant EDMs
        are introduced.}
}

% Coupled multiphysics simulation of steel strip production using the
% MPI-based GRISSLi Coupling Interface.
% The volume is a double volume (80--81); it was previously abbreviated
% as "80-1".
@Article{Ada98:mpi-application,
  author = 	 {P. Adamidis},
  title = 	 {Steel strip production --- a pilot application for
                  coupled simulation with several calculation systems},
  journal = 	 {Journal of Materials Processing Technology},
  year = 	 1998,
  volume =	 {80--81},
  pages =	 {330--336},
  month =	 AUG-SEP,
  abstract = {For the simulation of technological and natural processes in
        specific application domains, efficient calculation software
        solving differential equation systems on grid-based
        computational models is available, especially in the area of
        computer-aided engineering (CAE). To handle a so-called
        'multiphysics' problem, for example the fluid flow and metal
        forming process in a twin-roll casting arrangement for steel
        strip production, several calculation systems usually have to be
        employed in a high-performance computing environment, e.g. on
        parallel computers. The GRISSLi Coupling Interface is a software
        tool facilitating the coupled computation based on the message
        passing standard MPI.}
}


% BLAST: native ATM transport API for cluster computing, used beneath
% message passing libraries such as MPI and Prowess.
@article{Dow98:mpi-implementation,
  author   = {P. W. Dowd},
  title    = {{BLAST}: broadband lightweight {ATM} secure
              transport for high-performance distributed computing},
  journal  = {Computer Communications},
  year     = {1998},
  volume   = {21},
  number   = {12},
  pages    = {1040--1057},
  month    = aug,
  abstract = {This paper investigates the use of ATM for cluster-based
    computing. The need for a native ATM API is discussed as well as
    the performance of message passing libraries (MPL) that are
    written to use such an API to exploit the advantages of a
    high-speed network for cluster-based computing. The MPLs offer a
    standard interface, such as PVM or MPI, and interoperate with
    existing TCP/IP- and UDP/IP-based versions in addition to the
    ATM API environment. The interoperability extensions made to two
    MPLs, MPI and Prowess, which allow a hybrid environment of both
    ATM and TCP-based legacy network technology will be described.
    Shared object space (SOS), an extension to the MPLs, is
    described that helps support the geographically distributed
    computing (GDC) environment through latency hiding. It allows a
    user to develop applications in a shared memory type of
    environment. The native ATM API which supports cluster-based
    computing is described in this paper. This API provides a
    reliable transport interface to the MPL which has been optimized
    for an ATM environment. The transport protocol is a low-state
    design that optimizes the performance based on the available
    bandwidth, buffer constraints, propagation delay characteristics
    and security requirements of a particular connection.}
}
 
% GRADE: graphical environment for building message-passing programs; it
% generates C+PVM code, with MPI support described as a possible extension.
@article{Kac98:mpi-tool,
  author   = {P. Kacsuk},
  title    = {{GRADE}: A graphical programming environment for
              multicomputers},
  journal  = {Computers and Artificial Intelligence},
  year     = {1998},
  volume   = {17},
  number   = {5},
  pages    = {417--427},
  abstract = {To provide high-level graphical support for developing message
    passing programs, an integrated programming environment (GRADE)
    is being developed. GRADE currently provides tools to construct,
    execute, debug, monitor and visualise message-passing based
    parallel programs. GRADE offers the programmer an integrated
    graphical user interface during the whole life-cycle of program
    development and provides high-level graphical programming
    abstraction mechanisms to construct parallel applications. The
    current version of GRADE can generate C+PVM code but there is no
    theoretical obstacle to extend it for supporting MPI [9] and
    FORTRAN. Those new features of the GRADE graphical environment
    are described in the paper that enhanced GRADE towards a
    professional parallel programming environment.}
}
 
% MPI parallelisation of a 6-dimensional integration code (6DIME).
@Article{Ras98:mpi-application,
  author = 	 {J. Rasch},
  title = 	 {6-dimensional integrals and supercomputers},
  journal = 	 {Computer Physics Communications},
  year = 	 1998,
  volume =	 114,
  number =	 {1--3},
  pages =	 {378--384},
  month =	 NOV,
  abstract = {Recently, a numerical method has been developed for the
        evaluation of general 6-dimensional integrals (6DIME), which has
        been successfully applied to the study of (e,2e) and (gamma,2e)
        processes. Details of the parallelization of that code are given
        using MPI and the scaling behaviour with respect to the number
        of nodes is presented. Almost full load balancing is
        obtained. The method is extended to include two centre scattering
        problems.}
}
 
% Asynchronous load balancing with dynamic task migration, implemented in
% C and MPI; experiments on an IBM SP2 and a Cray T3D.
@Article{Chu98:mpi-balancing,
  author = 	 {Y. Chung},
  title = 	 {An asynchronous algorithm for balancing
                  unpredictable workload on distributed-memory machines},
  journal = 	 {ETRI Journal},
  year = 	 1998,
  volume =	 20,
  number =	 4,
  pages =	 {346--360},
  month =	 DEC,
  abstract = {It is challenging to parallelize problems with irregular
        computation and communication. In this paper, we propose an
        asynchronous algorithm for balancing unpredictable workload on
        distributed-memory machines. By using an initial workload
        estimate, we first partition the computations such that the
        workload is distributed evenly across the processors. In
        addition, we perform task migrations dynamically for adapting to
        the evolving workload. To demonstrate the usefulness of our load
        balancing strategy, we conducted experiments on an IBM SP2 and a
        Cray T3D. Experimental results show that our task migration
        strategy can balance unpredictable workload with little
        overhead.  Our code using C and MPI is portable onto other
        distributed-memory machines.}
}

% Prototyping environment for the PAPRICA-3 image processing system; its
% stochastic code optimizer is implemented with MPI.
@Article{Ber99:mpi-tools,
  author = 	 {M. Bertozzi},
  title = 	 {Tools for code optimization and system evaluation of
                  the image processing system {PAPRICA-3}}, 
  journal = 	 {Journal of Systems Architecture},
  year = 	 1999,
  volume =	 45,
  number =	 {6--7},
  pages =	 {519--542},
  month =	 JAN,
  abstract = {This paper presents the complex environment that was built to
        ease the prototyping of real-time applications on the PAPRICA-3
        massively parallel system. Applications are developed in C++
        using high level data types and the corresponding Assembly code
        is automatically created by a code generator. A stochastic code
        optimizer takes the assembly code and improves it according to a
        genetic approach; due to the high computational power required
        by this approach, the stochastic code optimizer was implemented
        with MPI and runs in parallel on a cluster of workstations. The
        availability of this complex environment allowed to test the
        performance of the system and to tune it according to some
        target applications before the actual development of the
        hardware. For this purpose a system-level simulator was also
        built to determine the number of clock cycles required to run a
        specific segment of code. The whole environment has been used to
        validate possible solutions for the hardware system and to
        develop, test, and tune several real-time image processing
        applications. The hardware system is now completely defined.}
}

% SPMD parallelisation (MPI, IBM SP2) of the chemistry modules of a coupled
% climate-chemistry model.
% NOTE: the citation key is spelled "applicatin"; it is left unchanged so
% that existing citations of this entry keep resolving.
@Article{Lee99:mpi-applicatin,
  author = 	 {P. C. S. Lee},
  title = 	 {On the parallelization of a global climate-chemistry
                  modeling system}, 
  journal = 	 {Atmospheric Environment},
  year = 	 1999,
  volume =	 33,
  number =	 4,
  pages =	 {675--681},
  month =	 FEB,
  abstract = {Coupled climate-chemistry simulations are computationally
        intensive owing to the spatial and temporal scope of the
        problem. In global chemistry models, the time integrations
        encountered in the chemistry and aerosol modules usually
        comprise the major CPU consumption. Parallelization of these
        segments of the code can contribute to multifold CPU speed-ups
        with minimal modification of the original serial code. This
        technical note presents a single program-multiple data (SPMD)
        strategy applied to the time-split chemistry modules of a
        coupled climate - global tropospheric chemistry model.
        Latitudinal domain decomposition is adopted along with a dynamic
        load-balancing technique that uses the previous time-step's
        load/latitude estimates for distributing the latitude bands
        amongst the processors. The coupled model is manually
        parallelized using the Message Passing Interface standard (MPI)
        on a distributed memory platform (IBM-SP2). Load-balancing
        efficiencies and the associated MPI overheads are discussed.
        Overall speed-ups and efficiencies are also calculated for a
        series of runs employing up to eight processors.}
}

% 3-D turbulent flow and combustion model of a glass melting furnace;
% subdomains are linked through the Message Passing Interface library.
@Article{May99:mpi-application,
  author = 	 {F. May},
  title = 	 {Mathematical modelling of glass melting furnace
                  design with regard to {NOx} formation},
  journal = 	 {Glastechnische Berichte-Glass Science and Technology},
  year = 	 1999,
  volume =	 72,
  number =	 1,
  pages =	 {1--6},
  month =	 JAN,
  abstract = {A three-dimensional mathematical model for turbulent flow and
        combustion on the basis of turbulence/chemistry interactions and
        radiative heat transfer taking into account spectral effects of
        surrounding walls and combustion gases is described. For this
        the transport equation for radiative intensity was split into
        different wavelength ranges. A block-structured finite volume
        grid with local refinements was used to solve the governing
        equations. The calculation domain is subdivided into a number of
        subdomains which are linked within the solver based on the
        Message Passing Interface library. Computed distributions of
        velocity, temperature, and heat fluxes are given. Results of a
        parametric study in a producing horseshoe furnace by increasing
        the height of the furnace with regard to NOx concentration
        distributions are presented.}
}

% Adjoint-based aerodynamic shape optimization of supersonic aircraft,
% parallelised with domain decomposition and MPI.
@Article{Reu99:mpi-application,
  author = 	 {J. Reuther},
  title = 	 {Aerodynamic shape optimization of supersonic
                  aircraft configurations via an adjoint formulation on
                  distributed memory parallel computers}, 
  journal = 	 {Computers and Fluids},
  year = 	 1999,
  volume =	 28,
  number =	 {4--5},
  pages =	 {675--700},
  month =	 MAY-JUN,
  abstract = {This work describes the application of a control theory-based
        aerodynamic shape optimization method to the problem of
        supersonic aircraft design. A high fidelity computational fluid
        dynamics (CFD) algorithm modelling the Euler equations is used
        to calculate the aerodynamic properties of complex
        three-dimensional aircraft configurations. The design process is
        greatly accelerated through the use of both control theory and
        parallel computing. Control theory is employed to derive the
        adjoint differential equations whose solution allows for the
        evaluation of design gradient information at a fraction of the
        computational cost required by previous design methods. The
        resulting problem is then implemented in parallel using a domain
        decomposition approach, an optimized communication schedule, and
        the Message Passing Interface (MPI) Standard for portability and
        efficiency. In our earlier studies, the serial implementation of
        this design method was shown to be effective for the
        optimization of airfoils, wings, wing-bodies, and complex
        aircraft configurations using both the potential equation and
        the Euler equations. In this work, our concern will be to extend
        the methodologies such that the combined capabilities of these
        new technologies can be used routinely and efficiently in an
        industrial design environment. The aerodynamic optimization of a
        supersonic transport configuration is presented as a
        demonstration test case of the capability. A particular
        difficulty of this test case is posed by the close coupling of
        the propulsion/airframe integration.}
}

% Parallelisation of the multiblock Navier-Stokes code TLNS3D using PVM
% and MPI.
% The volume was previously 38; it is corrected to 28 to agree with the
% Reu99 entry for the same journal, year, issue (4--5) and month.
@Article{Vat99:mpi-application,
  author = 	 {V. N. Vatsa},
  title = 	 {Parallelization of a multiblock flow code: an
                  engineering implementation},
  journal = 	 {Computers and Fluids},
  year = 	 1999,
  volume =	 28,
  number =	 {4--5},
  pages =	 {603--614},
  month =	 MAY-JUN,
  abstract = {Current trends in computer hardware are dictating a gradual
        shift toward the use of clusters of relatively inexpensive but
        powerful workstations, or massively parallel processing (MPP)
        machines, for scientific computing. However, most computational
        fluid dynamics (CFD) codes in use today were developed for
        large, shared-memory machines and are not readily portable to
        the distributed computing environment. One major hurdle in
        porting CFD codes to distributed computing platforms is the
        difficulty encountered in partitioning the problem so that the
        computation-to-communication ratio for each compute node
        (process) is maximized and the idle time during which one node
        waits for other nodes to transfer data is minimized. In the
        present work, pertinent issues involved in the parallelization
        of a widely used multiblock Navier-Stokes code TLNS3D are
        discussed. An engineering approach is used here to parallelize
        this code so that minimal deviation from the original
        (nonparallel) code is incurred. A natural partitioning along
        grid blocks is adopted in which one or more blocks are
        distributed to each of the available nodes. An automatic, static
        load-balancing strategy is employed for equitable distribution
        of computational work to specified nodes. Both Parallel Virtual
        Machine (PVM) and message passing interface (MPI) protocols are
        incorporated for data communication to allow maximum portability
        to a wide range of computer configurations. Results are
        presented that are comparable with a priori estimates of
        performance for distributed computing and that are competitive
        in terms of central processing unit (CPU) time and wall time
        usage with large, shared-memory supercomputers.}
}

% Particle-based visual clustering (Sammon's mapping) of large data sets,
% compared under PVM, MPI and data parallel environments.
@Article{Dzw99:mpi-application,
  author = 	 {W. Dzwinel},
  title = 	 {Method of particles in visual clustering of
                  multi-dimensional and large data sets},
  journal = 	 {Future Generation Computer Systems},
  year = 	 1999,
  volume =	 15,
  number =	 3,
  pages =	 {365--379},
  month =	 APR,
  abstract = {A method dedicated for visual clustering of N-dimensional data
        sets is presented. It is based on the classical feature
        extraction technique - the Sammon's mapping. This technique
        empowered by a particle approach used in the Sammon's criterion
        minimization makes the method more reliable, general and
        efficient. To show its reliability, the results of tests are
        presented, which were made to exemplify the algorithm 'immunity'
        from data errors. The general character of the method is
        emphasized and its role in multicriterial analysis discussed.
        Due to inherent parallelism of the methods, which are based on
        the particle approach, the visual clustering technique can be
        implemented easily in parallel environment. It is shown that
        parallel realization of the mapping algorithm enables the
        visualization of data sets consisting of more than $10^4$
        multi-dimensional data points. The method was tested in the PVM,
        MPI and data parallel environments on an HP/Convex SPP/1600. In
        this paper, the authors compare the parallel algorithm
        performance for these three interfaces. The approach to visual
        clustering, presented in the paper, can be used in visualization
        and analysis of large multi-dimensional data sets.}
}

% Parallel multigrid finite volume solver for 3-D thermal convection on the
% Intel Paragon, Cray T3D and IBM SP2, using MPI.
@Article{Wan99:mpi-application,
  author = 	 {P. Wang},
  title = 	 {Parallel multigrid finite volume computation of
                  three-dimensional thermal convection},
  journal = 	 {Computers and Mathematics with Applications},
  year = 	 1999,
  volume =	 37,
  number =	 9,
  pages =	 {49--60},
  month =	 MAY,
  abstract = {A parallel implementation of the finite volume method for
        three-dimensional, time-dependent, thermal convective flows is
        presented. The algebraic equations resulting from the finite
        volume discretization, including a pressure equation which
        consumes most of the computation time, are solved by a parallel
        multigrid method. A flexible parallel code has been implemented
        on the Intel Paragon, the Cray T3D, and the IBM SP2 by using
        domain decomposition techniques and the MPI communication
        software. The code can use 1D, 2D, or 3D partitions as required
        by different geometries, and is easily ported to other parallel
        systems. Numerical solutions for air (Prandtl number Pr = 0.733)
        with various Rayleigh numbers up to $10^7$ are discussed.}
}


% spai-1.1: parallel MPI implementation of the sparse approximate inverse
% (SPAI) preconditioner, tested on a Cray T3E.
@Article{Bar99:mpi-application,
  author = 	 {S. T. Barnard},
  title = 	 {An {MPI} implementation of the {SPAI} preconditioner
                  on the {T3E}}, 
  journal = 	 {International Journal of High Performance Computing
                  Applications},  
  year = 	 1999,
  volume =	 13,
  number =	 2,
  pages =	 {107--123},
  month =	 {Summer},
  abstract = {The authors describe and test spai-1.1, a parallel MPI
        implementation of the sparse approximate inverse (SPAI)
        preconditioner. They show that SPAI can be very effective for
        solving a set of very large and difficult problems on a Cray
        T3E. The results clearly show the value of SPAI (and approximate
        inverse methods in general) as the viable alternative to
        ILU-type methods when facing very large and difficult problems.
        The authors strengthen this conclusion by showing that spai-1.1
        also has very good scaling behavior.}
}

% Parallel Householder tridiagonalisation and QL diagonalisation in C with
% MPI, verified on a sixteen node IBM SP2.
% The scaling formulas in the abstract were garbled plain text and have
% been reconstructed as LaTeX math.
@Article{Ree99:mpi-application,
  author = 	 {J. S. Reeve},
  title = 	 {An efficient parallel version of the {Householder-QL}
                  matrix diagonalisation algorithm}, 
  journal = 	 {Parallel Computing},
  year = 	 1999,
  volume =	 25,
  number =	 3,
  pages =	 {311--319},
  month =	 MAR,
  abstract = {In this paper we report an effective parallelisation of the
        Householder routine for the reduction of a real symmetric matrix
        to tri-diagonal form and the QL algorithm for the
        diagonalisation of the resulting matrix. The Householder
        algorithm scales like $\alpha N^3/P + \beta N^2 \log_2(P)$ and
        the QL algorithm like $\gamma N^2 + \delta N^3/P$ as the number
        of processors P is increased for fixed problem size. The
        constant parameters $\alpha$, $\beta$, $\gamma$ and $\delta$ are
        obtained empirically. When the eigenvalues only are required the
        Householder method scales as above while the QL algorithm
        remains sequential. The code is implemented in C in conjunction
        with the message passing interface (MPI) libraries and verified
        on a sixteen node IBM SP2 and for real matrices that occur in
        the simulation of properties of crystalline materials.}
}

% Parallel Mean Value Analysis (MVA) for queueing networks on a distributed
% memory machine, using the MPI library.
@Article{Gen99:mpi-application,
  author = 	 {C. Gennaro},
  title = 	 {Parallelising the Mean Value Analysis algorithm},
  journal = 	 {Transactions of the Society for Computer Simulation
                  International}, 
  year = 	 1999,
  volume =	 16,
  number =	 1,
  pages =	 {16--22},
  month =	 MAR,
  abstract = {The Mean Value Analysis (MVA) algorithm is one of the most
        popular for evaluating the performance of separable (or
        product-form) queueing networks. Although its complexity is
        modest when jobs are indistinguishable, the introduction of
        different customer classes rapidly increases its computational
        cost. The problems of parallelising the algorithm while
        retaining its conceptual simplicity are examined. In particular,
        a parallel implementation of MVA on a distributed memory machine
        is developed using the MPI library for communication.}
}

% Parallel Delaunay triangulation implemented in C with an MPI-based
% toolkit; results on IBM SP2, Cray T3D, SGI Power Challenge, DEC AlphaCluster.
@Article{Ble99:mpi-application,
  author = 	 {G. E. Blelloch},
  title = 	 {Design and implementation of a practical parallel
                  {D}elaunay algorithm}, 
  journal = 	 {Algorithmica},
  year = 	 1999,
  volume =	 24,
  number =	 {3--4},
  pages =	 {243--269},
  month =	 JUL-AUG,
  abstract = {Initial experiments using a variety of distributions showed that
        our parallel algorithm was within a factor of 2 in work from the
        best sequential algorithm. Based on these promising results, the
        algorithm was implemented using C and an MPI-based toolkit.
        Compared with previous work, the resulting implementation
        achieves significantly better speedups over good sequential
        code, does not assume a uniform distribution of points, and is
        widely portable due to its use of MPI as a communication
        mechanism. Results are presented for the IBM SP2, Cray T3D, SGI
        Power Challenge, and DEC AlphaCluster.}
}

% Utility boiler simulation (turbulent reactive flow and heat transfer)
% parallelised with spatial domain decomposition and MPI on an IBM-SP2.
@Article{Coe99:mpi-application,
  author = 	 {P. J. Coelho},
  title = 	 {Modelling of a utility boiler using parallel computing},
  journal = 	 {Journal of Supercomputing},
  year = 	 1999,
  volume =	 13,
  number =	 2,
  pages =	 {211--232},
  month =	 MAR,
  abstract = {A mathematical model for the simulation of the turbulent
        reactive flow and heat transfer in a power station boiler has
        been parallelized. The mathematical model is based on the
        numerical solution of the governing equations for mass,
        momentum, energy and transport equations for the scalar
        quantities. The k-epsilon model and the conserved
        scalar/prescribed probability density function formalism are
        employed. Radiative heat transfer is calculated using the
        discrete ordinates method. The code has been fully parallelized
        using the spatial domain decomposition approach and MPI.
        Calculations were performed using an IBM-SP2. It is shown that
        the computational requirements are reduced and the parallel
        efficiency increases if the mean temperature and density are
        calculated a priori, and stored. The role of the different parts
        of the code on the parallel performance is discussed. A speedup
        of 5.9 is achieved using 8 processors.}
}

@Article{Rus99:mpi-cluster,
  author = 	 {S. H. Russ},
  title = 	 {Using {Hector} to run {MPI} programs over networked
                  workstations}, 
  journal = 	 {Concurrency: Practice and Experience},
  year = 	 1999,
  volume =	 11,
  number =	 4,
  pages =	 {189--204},
  month =	 APR,
  abstract = {Networked workstations represent an increasingly popular
        distributed platform for running large parallel programs. They
        can present a low-cost alternative to purchasing supercomputer
        time or additional usable computational capability. Several
        capabilities are desirable in order to harness workstations,
        including support for a widely accepted parallel programming
        environment, task migration, intelligent resource allocation,
        fault tolerance, and totally transparent support of these
        features. The Hector system is designed to provide these
        capabilities to MPI programs. The structure of the system and
        experiences using the system on loaded workstations to run
        scientific codes are described.}
}

@Article{Ros99:mpi-tool,
  author = 	 {T. Rossi},
  title = 	 {A parallel fast direct solver for block tridiagonal
                  systems with separable matrices of arbitrary dimension},
  journal = 	 {SIAM Journal on Scientific Computing},
  year = 	 1999,
  volume =	 20,
  number =	 5,
  pages =	 {1778--1796},
  month =	 MAY,
  abstract = {A parallel fast direct solution method for linear systems with
        separable block tridiagonal matrices is considered. Such systems
        appear, for example, when discretizing the Poisson equation in a
        rectangular domain using the five-point finite difference scheme
        or the piecewise linear finite elements on a triangulated,
        possibly nonuniform rectangular mesh. The method under
        consideration has the arithmetical complexity O(N log N), and it
        is closely related to the cyclic reduction method, but instead
        of using the matrix polynomial factorization, the so-called
        partial solution technique is employed. Hence, in this paper,
        the method is called the partial solution variant of the cyclic
        reduction method (PSCR method). The method is presented and
        analyzed in a general radix-q framework and, based on this
        analysis, the radix-4 variant is chosen for parallel
        implementation using the MPI standard. The generalization of the
        method to the case of arbitrary block dimension is described.
        The numerical experiments show the sequential efficiency and
        numerical stability of the PSCR method compared to the
        well-known BLKTRI implementation of the generalized cyclic
        reduction method. The good scalability properties of the
        parallel PSCR method are demonstrated in a distributed-memory
        Cray T3E-750 computer.}
}

@Article{Bou99:mpi-algorithm,
  author = 	 {P. Boulet},
  title = 	 {Static tiling for heterogeneous computing platforms},
  journal = 	 {Parallel Computing},
  year = 	 1999,
  volume =	 25,
  number =	 5,
  pages =	 {547--568},
  month =	 MAY,
  abstract = {In the framework of fully permutable loops, tiling has been
        extensively studied as a source-to-source program
        transformation. However, little work has been devoted to the
        mapping and scheduling of the tiles on physical processors.
        Moreover, targeting heterogeneous computing platforms has, to
        the best of our knowledge, never been considered. In this paper
        we extend static tiling techniques to the context of limited
        computational resources with different-speed processors. In
        particular, we present efficient scheduling and mapping
        strategies that are asymptotically optimal. The practical
        usefulness of these strategies is fully demonstrated by MPI
        experiments on a heterogeneous network of workstations.}
}

@Article{Ros99:mpi-application,
  author = 	 {I. Rosenblum},
  title = 	 {Multi-processor molecular dynamics using the {Brenner}
                  potential: Parallelization of an implicit multi-body
                  potential},  
  journal = 	 {International Journal of Modern Physics C},
  year = 	 1999,
  volume =	 10,
  number =	 1,
  pages =	 {189--203},
  month =	 FEB,
  abstract = {We present computational aspects of Molecular Dynamics
        calculations of thermal properties of diamond using the Brenner
        potential. Parallelization was essential in order to carry out
        these calculations on samples of suitable sizes. Our
        implementation uses MPI on a multi-processor machine such as the
        IBM SP2. Three aspects of parallelization of the Brenner
        potential are discussed in depth. These are its long-range
        nature, the need for different parallelization algorithms for
        forces and neighbors, and the relative expense of force
        calculations compared to that of data communication. The
        efficiency of parallelization is presented as a function of
        different approaches to these issues as well as of cell size and
        number of processors employed in the calculation. In the
        calculations presented here, information from almost half of the
        atoms were needed by each processor even when 16 processors were
        used. This made it worthwhile to avoid unnecessary complications
        by making data from all atoms available to all processors.
        Superlinear speedup was achieved for four processors (by avoiding
        paging) with 512 atom samples, and 5ps long trajectories were
        calculated (for 5120 atom samples) in 53 hours using 16
        processors; 514 hours would have been needed to complete this
        calculation using a serial program. Finally, we discuss and make
        available a set of routines that enable MPI-based codes such as
        ours to be debugged on scalar machines.}
}

@Article{Luo99:mpi-comparision,
  author   = {Y. Luo},
  title    = {Shared memory vs. message passing: the {COMOPS} benchmark
              experiment},
  journal  = {Journal of Supercomputing},
  volume   = 13,
  number   = 3,
  pages    = {283--301},
  month    = may,
  year     = 1999,
  abstract = {This paper presents the comparison of the COMOPS benchmark
              performance in MPI and shared memory on four different
              shared memory platforms: the DEC AlphaServer 8400/300, the
              SGI Power Challenge, the SGI Origin2000, and the HP-Convex
              Exemplar SPP1600. The paper also qualitatively analyzes the
              obtained performance data based on an understanding of the
              corresponding architecture and the MPI implementations.
              Some conclusions are made for the inter-processor
              communication performance on these four shared memory
              platforms.}
}


@Article{Hio99:mpi-application,
  author = 	 {S. Hioki},
  title = 	 {{QCDimMPI}: {MPI} code for {QCD} with an improved action},
  journal = 	 {Nuclear Physics B-Proceedings Supplements},
  year = 	 1999,
  volume =	 73,
  pages =	 {895--897},
  month =	 MAR,
  abstract = {QCDimMPI[1] is a simulation code for pure SU(3) gauge theory
        with an improved action consisting of 1 x 1 and 2 x 1
        plaquettes. It uses Fortran77 and the Message Passing Interface
        Standard, MPI[2]. QCDimMPI is an extended version of QCDMPI. It
        is portable, allows simulations in any number of dimensions, on
        any number of processors, and with arbitrary dimensional
        partitioning. It requires a rather small working area, and
        yields excellent performance on single processor computers and a
        wide variety of parallel computers which support MPI. The
        program provides information on link update time and
        communications time. In this paper, an outline of QCDimMPI is
        given, and benchmark results on several parallel computers are
        reported.}
}

 
@Article{Gol99:mpi-application,
  author = 	 {A. Goller},
  title = 	 {Parallel processing strategies for large {SAR} image
                  data sets in a distributed environment},
  journal = 	 {Computing},
  year = 	 1999,
  volume =	 62,
  number =	 4,
  pages =	 {277--291},
  abstract = {Key algorithms like image matching and Shape-from-Shading were
        parallelized mainly using MPI, and ported onto suitable computer
        architectures. Our experiments showed that all algorithms
        perform well, and they further proved the concept of CDIP to be
        beneficial: Usability of all integrated algorithms was
        significantly improved, mainly due to less user-centered network
        traffic, simple access to supercomputers, the creation of method
        sequences, and easy-to-use and well maintained algorithms.}
}
 
@Article{Chi99:mpi-implementation,
  author = 	 {A. Chien},
  title = 	 {Design and evaluation of an {HPVM}-based {Windows NT}
                  supercomputer}, 
  journal = 	 {International Journal of High Performance Computing
                  Applications}, 
  year = 	 1999,
  volume =	 13,
  number =	 3,
  pages =	 {201--219},
  month =	 {Fall},
  abstract = {We describe the design and evaluation of a 192-processor Windows
        NT cluster for high performance computing based on the High
        Performance Virtual Machine (HPVM) communication suite. While
        other clusters have been described in the literature, building a
        58 GFlop/s NT cluster to be used as a general-purpose production
        machine for NCSA required solving new problems. The HPVM
        software meets the challenges represented by the large number of
        processors, the peculiarities of the NT operating system, the
        need for a production-strength job submission facility and the
        requirement for mainstream programming interfaces. First, HPVM
        provides users with a collection of standard APIs like MPI,
        Shmem, Global Arrays with supercomputer class performance
        (13 $\mu$s minimum latency, 84 MB/s peak bandwidth for MPI),
        efficiently delivering Myrinet's hardware performance to
        application programs. Second, HPVM provides cluster management
        and scheduling (through integration with Platform Computing's
        LSF). Finally, HPVM addresses Windows NT's remote access problem,
        providing convenient remote access and job control (through a
        graphical Java-applet front-end). Given the production nature of
        the cluster, the performance characterization is largely based
        on a sample of the NCSA scientific applications the machine will
        be running. The side-by-side comparison with other
        present-generation NCSA supercomputers shows the cluster to be
        within a factor of 2 to 4 of the SGI Origin 2000 and Cray T3E
        performance at a fraction of the cost. The inherent scalability
        of the cluster design produces a comparable or better speedup
        than the Origin 2000 despite a limitation in the HPVM flow
        control mechanism.}
}
 

@Article{Ros99:mpi-tools,
  author = 	 {T. Rossi},
  title = 	 {Parallel fictitious domain method for a non-linear
                  elliptic {Neumann} boundary value problem},
  journal = 	 {Numerical Linear Algebra with Applications},
  year = 	 1999,
  volume =	 6,
  number =	 1,
  pages =	 {51--60},
  month =	 jan-feb,
  abstract = {Parallelization of the algebraic fictitious domain method is
        considered for solving Neumann boundary value problems with
        variable coefficients. The resulting method is applied to the
        parallel solution of the subsonic full potential flow problem
        which is linearized by the Newton method. Good scalability of
        the method is demonstrated on a Cray T3E distributed memory
        parallel computer using MPI in communication.}
}


@Article{Zak99:mpi-tools,
  author = 	 {O. Zaki},
  title = 	 {Toward scalable performance visualization with {Jumpshot}},
  journal = 	 {International Journal of High Performance Computing
                  Applications}, 
  year = 	 1999,
  volume =	 13,
  number =	 3,
  pages =	 {277--288},
  month =	 {Fall},
  abstract = {Jumpshot is a graphical tool for understanding the performance
        of parallel programs. It is in the tradition of the upshot tool
        but contains a number of extensions and enhancements that make
        it suitable for large-scale parallel computations. Jumpshot
        takes as input a new, more flexible logfile format and comes with
        a library for generating such logfiles. An MPI profiling library
        is also included, enabling the automatic generation of such
        logfiles from MPI programs. Jumpshot is written in Java and can
        easily be integrated as an applet into browser-based computing
        environments. The most novel feature of Jumpshot is its
        automatic detection of anomalous durations, drawing the user's
        attention to problem areas in a parallel execution. This
        capability is particularly useful in large-scale parallel
        computations containing many events.}
}

@Article{BegVin99:transport,
  author = 	 {S. Bergeron and A. Vincent},
  title = 	 {Implementation strategies for real-time particle transport
                  solver},  
  journal = 	 {Computer Physics Communications},
  year = 	 1999,
  volume =	 120,
  number =	 {2--3},
  month =	 AUG,
  pages =	 {177--184},
  abstract = {Many problems in physics and engineering involve the transport
        of solid particles in a turbulent field. In some cases, it is
        desirable to study the transport of those particles in ``real
        time''. The prediction of erosion in the rotating part of
        hydraulic turbines is such a problem. This paper presents a
        semi-analytic predictor-corrector scheme adapted to the case of
        a rotating frame of reference. Simplification, related to the
        interpolation scheme required, is discussed as well as a
        parallel implementation using MPI on 10Base-T Ethernet
        interconnected workstations. The 3D solver is coupled with a
        high performance visualization software. Performance then shows
        a quasi-linear speedup.}
}


 
@Article{BruFagRes99:meta,
  author = 	 {M. A. Brune and G. E. Fagg and M. M. Resch},
  title = 	 {Message-passing environments for metacomputing},
  journal = 	 {Future Generation Computer Systems},
  year = 	 1999,
  volume =	 15,
  number =	 {5--6},
  month =	 OCT,
  pages =	 {699--712},
  abstract = {The PACX-MPI approach offers a transparent interface for the
        communication between two or more MPI environments. PVMPI
        allows the user spawning parallel processes under the MPI
        environment. The PLUS protocol bridges the gap between
        vendor-specific (e.g., MPL, NX, and PARIX) and
        vendor-independent message-passing environments (e.g., PVM and
        MPI). Moreover, it offers the ability to create and control
        processes at application runtime.}
}

@Article{ResRanSto99:meta,
  author = 	 {M. M. Resch and D. Rantzau and R. Stoy},
  title = 	 {Metacomputing experience in a transatlantic wide area
                  application test-bed},
  journal = 	 {Future Generation Computer Systems},
  year = 	 1999,
  volume =	 15,
  number =	 {5--6},
  month =	 OCT,
  pages =	 {807--816},
  abstract = {In the frame of a G7 initiative the High Performance Computing
        Center Stuttgart (HLRS) together with the Pittsburgh
        Supercomputing Center (PSC) and Sandia National Laboratories
        (SNL) has set up a transatlantic wide area application test-bed
        in 1997. A dedicated ATM-Link was installed that connected
        German research networks to vBNS and ESnet. During 1 year this
        test-bed was extensively used for metacomputing and collaborative
        working. Two applications - one from computational fluid
        dynamics and one from molecular dynamics - were adapted and run
        on the test-bed. For message-passing an MPI library was
        implemented that supports metacomputing. An already existing
        software for collaborative visualization was adapted for that
        scenario. This article describes the technical background of the
        cooperation, results that have been achieved for the two
        applications so far and lessons that have been learned. Special
        emphasis will be given to future work planned.}
}


@Article{Tho99:mpi-application,
  author = 	 {S. J. Thomas and M. Desgagne and R. Benoit},
  title = 	 {A real-time {North American} forecast at 10-km
                  resolution with the {Canadian} {MC2 Meso-LAM}},
  journal = 	 {Journal of Atmospheric and Oceanic Technology},
  year = 	 1999,
  volume =	 16,
  number =	 8,
  pages =	 {1092--1101},
  month =	 AUG,
  abstract = {The next generation of high-performance computers will be based
        on clusters of shared-memory symmetric multiprocessor (SMP) nodes
        interconnected by a low-latency, high-bandwidth network. In this
        paper, the parallel performance of the nonhydrostatic Mesoscale
        Compressible Community (MC2) limited-area atmospheric model on
        clusters of NEC SX-4 symmetric multiprocessor (SMP) nodes is
        presented. Several hybrid parallel-programming approaches are
        now possible with the SMP cluster SC-MC2 implementation based on
        internode MPI message-passing and intranode shared-memory
        tasking or threads. At total sustained execution rates of
        between 25 and 30 Gflop s$^{-1}$ on single-node or multinode
        clusters, it is now possible for the first time ever to generate
        a 24-48-h real-time weather forecast over North America at 10-km
        resolution.}
}
 

@Article{Rod99:mpi-evals,
  author = 	 {J. L. Roda and C. Rodriguez and D. G. Morales and
                  E. Almeida}, 
  title = 	 {Predicting the execution time of message passing models},
  journal = 	 {Concurrency: Practice and Experience},
  year = 	 1999,
  volume =	 11,
  number =	 9,
  month =	 AUG,
  pages =	 {461--477},
  abstract = {Recent publications prove that runtime systems oriented to the
        Bulk Synchronous Parallel Model usually achieve remarkable
        accuracy in their predictions. That accuracy can be seen in the
        capacity of the software for packing the messages generated
        during the superstep and their capability to find a
        rearrangement of the messages sent at the end of the superstep.
        Unfortunately, barrier synchronisation imposes some limits both
        in the range of available algorithms and in their performance.
        The asynchronous nature of many MPI/PVM programs makes their
        expression difficult or infeasible using a BSP oriented library.
        Through the generalisation of the concept of superstep we
        propose two extensions of the BSP model: the BSP Without
        Barriers (BSPWB) and the Message Passing Machine (MPM) models.
        These new models are oriented to MPI/PVM parallel programming.
        The parameters of the models and their quality are evaluated on
        four standard parallel platforms. The use of these BSP
        extensions is illustrated using the Fast Fourier Transform and
        the Parallel Sorting by Regular Sampling algorithms.}
}

@Article{Lir99:mpi-apps,
  author = 	 {I. Lirkov and S. Margenov},
  title = 	 {{MPI} parallel implementation of {CBF} preconditioning for
                  {3D} elasticity problems},
  journal = 	 {Mathematics and Computers in Simulation},
  year = 	 1999,
  volume =	 50,
  number =	 {1--4},
  month =	 NOV,
  pages =	 {247--254},
  abstract = {New construction of a parallel algorithm for the discussed
        preconditioning method is proposed. The theoretical part of this
        study includes analysis of the execution time on various parallel
        architectures and asymptotic estimates of the parallel speedup
        and the parallel efficiency. The parallel performance estimates
        indicate that the proposed algorithm will be especially
        efficient on coarse-grain parallel systems, which is also
        confirmed by the numerical experiments. A portable MPI parallel
        code is developed. Numerical tests on three symmetric
        multiprocessor systems: SUN Enterprise 3000, SUN SPARCstation 10
        and Origin 2000 are presented. The reported speedup and parallel
        efficiency illustrate well the features of the proposed method
        and its implementation.}
}

@Article{den99:mpi-app,
  author = 	 {L. Deng and Z. S. Xie},
  title = 	 {Parallelization of {MCNP} {Monte Carlo} neutron and photon
                  transport code in parallel virtual machine and message
                  passing interface}, 
  journal = 	 {Journal of Nuclear Science and Technology},
  year = 	 1999,
  volume =	 36,
  number =	 7,
  month =	 JUL,
  abstract = {The coupled neutron and photon transport Monte Carlo code MCNP
        (version 3B) has been parallelized in parallel virtual machine
        (PVM) and message passing interface (MPI) by modifying a
        previous serial code. The new code has been verified by serving
        sample problems. The speedup increases linearly with the number
        of processors and the average efficiency is up to 99\% for
        12-processor.}
}

% NOTE(review): in this abstract "MPI" appears to name an atmospheric
% circulation model (presumably Max Planck Institute), not the Message
% Passing Interface -- confirm that this entry belongs in this list.
@Article{Arp99:mpi-app,
  author = 	 {K. Arpe and E. Roeckner},
  title = 	 {Simulation of the hydrological cycle over {Europe}: Model
                  validation and impacts of increasing greenhouse gases},
  journal = 	 {Advances in Water Resources},
  year = 	 1999,
  volume =	 23,
  number =	 2,
  month =	 OCT,
  pages =	 {105--119},
  abstract = {Different methods of estimating precipitation area means, based
        on observations, are compared with each other to investigate
        their usefulness for model validation. For the applications
        relevant to this study the ECMWF reanalyses provide a good and
        comprehensive data set for validation. The uncertainties of
        precipitation analyses, based on observed precipitation or from
        numerical weather forecasting schemes, are generally in the
        range of 20\% but regionally much larger. The MPI atmospheric
        general circulation model is able to reproduce long term means
        of the main features of the hydrological cycle within the range
        of uncertainty of observational data, even for relatively small
        areas such as the Rhine river basin. Simulations with the MPI
        coupled general circulation model, assuming a further increase
        of anthropogenic greenhouse gases, show clear trends in
        temperature and precipitation for the next century which would
        have significant implications for human activity, e.g. a further
        increase of the sea level of the Caspian Sea and less water in
        the Rhine and the Danube. We have gained confidence in these
        results because trends in the temperature and precipitation in
        the coupled model simulations up to the present are partly
        confirmed by an atmospheric model simulation forced with
        observed SSTs and by observational data. We gained further
        confidence because the simulations with the same coupled model
        but using constant greenhouse gases do not show such trends.
        However, doubts arise from the fact that these trends are strong
        where the systematic errors of the model are large.}
}

@Article{Yah99:mpi-app,
  author = 	 {Y. Yahagi and M. Mori and Y. Yoshii},
  title = 	 {The forest method as a new parallel tree method with the
                  sectional {Voronoi} tessellation},
  journal = 	 {Astrophysical Journal Supplement Series},
  year = 	 1999,
  volume =	 124,
  number =	 1,
  month =	 SEP,
  pages =	 {1--9},
  abstract = {We have developed a new parallel tree method which will be
        called the forest method hereafter. This new method uses the
        sectional Voronoi tessellation (SVT) for the domain
        decomposition. The SVT decomposes a whole space into polyhedra
        and allows their flat borders to move by assigning different
        weights. The forest method determines these weights based on the
        load balancing among processors by means of the overload
        diffusion (OLD). Moreover, since all the borders are flat, before
        receiving the data from other processors, each processor can
        collect enough data to calculate the gravity force with
        precision. Both the SVT and the OLD are coded in a highly
        vectorizable manner to accommodate on vector parallel
        processors. The parallel code based on the forest method with the
        Message Passing Interface is run on various platforms so that a
        wide portability is guaranteed. Extensive calculations with 15
        processors of Fujitsu VPP300/16R indicate that the code can
        calculate the gravity force exerted on $10^5$ particles in each
        second for some ideal dark halo. This code is found to enable an
        N-body simulation with $10^7$ or more particles for a wide
        dynamic range and is therefore a very powerful tool for the
        study of galaxy formation and large-scale structure in the
        universe.}
}

@Article{tan99:mpi-impl,
  author = 	 {H. Tang and K. Shen and T. Yang},
  title = 	 {Compile/run-time support for threaded {MPI} execution on
                  multiprogrammed shared memory machines},
  journal = 	 {ACM SIGPLAN Notices},
  year = 	 1999,
  volume =	 34,
  number =	 8,
  month =	 AUG,
  pages =	 {107--118},
  abstract = {MPI is a message-passing standard widely used for developing
        high-performance parallel applications. Because of the
        restriction in the MPI computation model, conventional
        implementations on shared memory machines map each MPI node to an
        OS process, which suffers serious performance degradation in the
        presence of multiprogramming, especially when a space/time
        sharing policy is employed in OS job scheduling. In this paper, we
        study compile-time and run-time support for MPI by using threads
        and demonstrate our optimization techniques for executing a
        large class of MPI programs written in C. The compile-time
        transformation adopts thread-specific data structures to
        eliminate the use of global and static variables in C code. The
        run-time support includes an efficient point-to-point
        communication protocol based on a novel lock-free queue
        management scheme. Our experiments on an SGI Origin 2000 show
        that our MPI prototype called TMPI using the proposed techniques
        is competitive with SGI's native MPI implementation in a
        dedicated environment, and it has significant performance
        advantages with up to a 23-fold improvement in a multiprogrammed
        environment.}
}

@Article{kie99:mpi-collective,
  author = 	 {T. Kielmann and R. F. H. Hofman and H. E. Bal and A. Plaat
                  and R. A. F. Bhoedjang},
  title = 	 {{MAGPIE: MPI}'s collective communication operations for
                  clustered wide area systems},
  journal = 	 {ACM SIGPLAN Notices},
  year = 	 1999,
  volume =	 34,
  number =	 8,
  month =	 AUG,
  pages =	 {131--140},
  abstract = {Writing parallel applications for computational grids is a
        challenging task. To achieve good performance, algorithms
        designed for local area networks must be adapted to the
        differences in link speeds. An important class of algorithms are
        collective operations, such as broadcast and reduce. We have
        developed MAGPIE, a library of collective communication
        operations optimized for wide area systems. MAGPIE's algorithms
        send the minimal amount of data over the slow wide area links,
        and only incur a single wide area latency. Using our system,
        existing MPI applications can be run unmodified on
        geographically distributed systems. On moderate cluster sizes,
        using a wide area latency of 10 milliseconds and a bandwidth of
        1 MByte/s, MAGPIE executes operations up to 10 times faster than
        MPICH, a widely used MPI implementation; application kernels
        improve by up to a factor of 4. Due to the structure of our
        algorithms, MAGPIE's advantage increases for higher wide area
        latencies.}
}


@Article{zhu99:mpi-app,
  author = 	 {W. J. Zhu and L. Petzold},
  title = 	 {Parallel sensitivity analysis for {DAE}s with many
                  parameters}, 
  journal = 	 {Concurrency: Practice and Experience},
  year = 	 1999,
  volume =	 11,
  number =	 10,
  month =	 AUG,
  pages =	 {571--585},
  abstract = {In this paper, we discuss the parallel computation of the
        sensitivity analysis of systems of differential-algebraic
        equations (DAEs) with a moderate number of state variables and a
        large number of sensitivity parameters. Several parallel
        implementations based on DASSLSO are explored and their
        performance when using the Message Passing Interface (MPI) on an
        SGI Origin 2000 is compared.}
}

@Article{Sun99:mpi-perf,
  author = 	 {D. Sundaram-Stukel and M. K. Vernon},
  title = 	 {Predictive analysis of a wavefront application using
                  {LogGP}}, 
  journal = 	 {ACM SIGPLAN Notices},
  year = 	 1999,
  volume =	 34,
  number =	 8,
  month =	 AUG,
  pages =	 {141--150},
  abstract = {This paper develops a highly accurate LogGP model of a complex
        wavefront application that uses MPI communication on the IBM
        SP/2. Key features of the model include: (1) elucidation of the
        principal wavefront synchronization structure, and (2) explicit
        high-fidelity models of the MPI-send and MPI-receive primitives.
        The MPI-send/receive models are used to derive L, o, and G from
        simple two-node micro-benchmarks. Other model parameters are
        obtained by measuring small application problem sizes on four SP
        nodes. Results show that the LogGP model predicts, in seconds
        and with a high degree of accuracy, measured application
        execution time for large problems running on 128 nodes. Detailed
        performance projections are provided for very large future
        processor configurations that are expected to be available to
        the application developers. These results indicate that scaling
        beyond one or two thousand nodes yields greatly diminished
        improvements in execution time, and that synchronization delays
        are a principal factor limiting the scalability of the
        application.}
}

@Article{kimura99:mpi-app,
  author = 	 {T. Kimura and H. Takemiya},
  title = 	 {Distributed parallel computing for fluid structure coupled
                  simulations on a heterogeneous parallel computer cluster},
  journal = 	 {International Journal of High Performance Computing
                  Applications}, 
  year = 	 1999,
  volume =	 13,
  number =	 4,
  pages =	 {320--333},
  abstract = {Distributed parallel computing for a fluid-structure coupled
        simulation has been performed on a heterogeneous parallel
        computer cluster. The fluid and the structure dynamics are
        simulated on different parallel computers connected by a
        high-speed local network. These dynamics are coupled by a loose
        coupling method exchanging the boundary data between the fluid
        and the structure domains through the network. The data
        communication among parallel computers is realized by using the
        new communication library, Stampi, which has been developed to
        enable communication in a heterogeneous environment. The
        performance evaluation on a heterogeneous parallel computer
        cluster has shown that the distributed parallel computing for
        fluid-structure coupled simulations has the advantage of
        increasing the performance compared with the parallel computing
        on a single parallel computer.}
}
 

% Domain-specific parallel programming model for image processing; the
% parallel implementation is based on MPI.
@Article{morrow99:mpi-app,
  author   = {P. J. Morrow and D. Crookes and J. Brown and G. McAleese and
              D. Roantree and I. Spence},
  title    = {Efficient implementation of a portable parallel programming
              model for image processing},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 11,
  month    = SEP,
  pages    = {671--685},
  abstract = {This paper describes a domain specific programming model for
        execution on parallel and distributed architectures. The model
        has initially been targeted at the application area of image
        processing, though the techniques developed may be more
        generally applicable to other domains where an algebraic
        or library-based approach is common. Efficiency is achieved by
        the concept of a self-optimising class library of primitive image
        processing operations, which allows programs to be written in a
        high level, algebraic notation and which is automatically
        parallelised (using an application-specific data parallel
        approach). The class library is extended automatically with
        optimised operations, generated by a transformation system,
        giving improved execution performance. The parallel
        implementation of the model described here is based on MPI and
        has been tested on a C40 processor network, a quad-processor
        Unix workstation, and a network of PCs running Linux. Timings
        are included to indicate the impact of the automatic
        optimisation facility (rather than the effect of
        parallelisation).}
}


% PVODE ODE solver; the distributed version uses MPI for communication.
@Article{byrne:mpi-app,
  author   = {G. D. Byrne and A. C. Hindmarsh},
  title    = {{PVODE}, an {ODE} solver for parallel computers},
  journal  = {International Journal of High Performance Computing
              Applications},
  year     = 1999,
  volume   = 13,
  number   = 4,
  pages    = {354--365},
  abstract = {PVODE is a general-purpose solver for ordinary differential
        equation (ODE) systems that implements methods for both stiff
        and nonstiff systems. The code is designed for single-program
        multiple-data environments. It is written in ANSI standard C,
        with a highly modular structure. The version being distributed
        uses the message-passing interface (MPI) system for
        communication. In the stiff case, PVODE uses a backward
        differentiation formula method combined with preconditioned
        GMRES iteration. Parallelism is achieved by distributing the ODE
        solution vector into user-specified segments and parallelizing a
        set of vector kernels accordingly. For PDE-based ODE systems, we
        provide a module that generates a band block-diagonal
        preconditioner for use with the GMRES iteration. We also provide
        a set of interfaces to accommodate Fortran applications. The
        paper includes a stiff example problem and test results on a
        Cray-T3D with three different message-passing systems. PVODE is
        publicly available.}
}


% Utility boiler simulation (Part I); parallelized by domain decomposition
% with the MPI library.
@Article{Coelho:mpi-app,
  author   = {P. J. Coelho},
  title    = {Parallel simulation of a utility boiler. Part {I}:
              Mathematical model and numerical solution method},
  journal  = {Communications in Numerical Methods in Engineering},
  year     = 1999,
  volume   = 15,
  number   = 10,
  month    = OCT,
  pages    = {717--726},
  abstract = {A computer code for the modelling of turbulent reactive flows
        with heat transfer has been parallelized and applied to the
        simulation of a utility boiler. The code is based on the
        numerical solution of the density-weighted averaged form of the
        governing equations for mass, momentum and energy conservation,
        and transport equations for scalars associated with the
        turbulence and combustion models. The k-epsilon model and the
        chemical equilibrium approach are used. The turbulent
        fluctuations are accounted for in the calculation of the mean
        properties by means of a presumed joint probability
        density function for the mixture fraction and the fraction of
        radiative heat loss. The discrete ordinates method is used for
        radiation modelling. The governing equations are solved using
        the finite volume method. The parallelization is carried out
        using the domain decomposition approach and the message-passing
        MPI library. The paper is divided into two parts. This part is
        concerned with the description of the model and the parallel
        implementation, while the model evaluation and the analysis of
        the parallel performance are presented in Part II (pp. 727--736).}
}


% Pseudospectral 2D Navier-Stokes solver on a disk; the code was
% parallelized with MPI.
@Article{Torres:mpi-app,
  author   = {D. J. Torres and E. A. Coutsias},
  title    = {Pseudospectral solution of the two-dimensional
              {N}avier-{S}tokes equations in a disk},
  journal  = {SIAM Journal on Scientific Computing},
  year     = 1999,
  volume   = 21,
  number   = 1,
  month    = SEP,
  pages    = {378--403},
  abstract = {An efficient and accurate algorithm for solving the
        two-dimensional (2D) incompressible Navier-Stokes equations on a
        disk with no-slip boundary conditions is described. The
        vorticity-stream function formulation of these equations is
        used, and spatially the vorticity and stream functions are
        expressed as Fourier-Chebyshev expansions. The Poisson and
        Helmholtz equations which arise from the implicit-explicit time
        marching scheme are solved as banded systems using a
        post-conditioned spectral tau-method. The polar
        coordinate singularity is handled by expanding fields radially
        over the entire diameter using a parity modified Chebyshev
        series and building partial regularity into the vorticity. The
        no-slip boundary condition is enforced by transferring one of
        the two boundary conditions imposed on the stream function
        onto the vorticity via a solvability constraint. Significant
        gains in run times were realized by parallelizing the code in
        message passage interface (MPI).}
}


% Adaptive finite element analysis on a cluster of workstations; the
% environment is portable on top of both PVM and MPI.
@Article{Ann99:mpi-app,
  author   = {V. Annamalai and C. S. Krishnamoorthy and V. Kamakoti},
  title    = {Adaptive finite element analysis on a parallel and
              distributed environment},
  journal  = {Parallel Computing},
  year     = 1999,
  volume   = 25,
  number   = 12,
  month    = NOV,
  pages    = {1413--1434},
  abstract = {Industries in general and automotive industries in particular,
        use Finite Element Analysis (FEA) for better solutions to the
        engineering problems they encounter. The reliability of the
        Finite Element method can be improved to a larger extent by
        Adaptive Finite Element Analysis (AFEA). As we look towards
        increasingly accurate solutions, the process becomes
        computationally intensive and requires parallel and economic
        high-performance scientific computing environments to solve
        them. In this paper we present a parallel implementation of AFEA
        on a cluster of workstations and illustrate its efficiency and
        scalability with examples. In this process, we have developed a
        user-friendly environment for Parallel Distributed computing
        which is portable on top of both Parallel Virtual Machine (PVM)
        and Message Passing Interface (MPI) message passing layers. We
        have addressed the issues of the several stages in AFEA from a
        parallel computing perspective that includes Domain
        decomposition, Parallel Mesh generation, Parallel Finite Element
        Analysis using a Substructuring technique and Load balancing.}
}
 

% Scheduling alternatives to coscheduling on a network of workstations;
% the platform provides MPI so off-the-shelf MPI applications can be run.
@Article{Nagar99:mpi-impl,
  author   = {S. Nagar and A. Banerjee and A. Sivasubramaniam and
              C. R. Das},
  title    = {Alternatives to coscheduling a network of workstations},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1999,
  volume   = 59,
  number   = 2,
  month    = NOV,
  pages    = {302--327},
  abstract = {Efficient scheduling of processes on processors of a Network of
        Workstations (NOW) is essential for good system performance.
        However, the design of such schedulers is challenging because of
        the complex interaction between several system and workload
        parameters. Coscheduling, though desirable, is impractical for
        such a loosely coupled environment. Two operations, waiting for
        a message and arrival of a message, can be used to take remedial
        actions that can guide the behavior of the system toward
        coscheduling using local information. We present a taxonomy of
        three possibilities for each of these two operations, leading to
        a design space of 3x3 scheduling mechanisms. This paper presents
        an extensive implementation and evaluation exercise in studying
        these mechanisms. Adhering to the philosophy that scheduling and
        communication are intertwined and should be studied in
        conjunction, a complete communication substrate for UltraSPARC
        workstations, connected by Myrinet and running Solaris 2.5.1,
        has been developed. This platform provides the entire Message
        Passing Interface (MPI) to readily run off-the-shelf MPI
        applications by employing protected low-latency user-level
        messaging. Several applications can concurrently use this
        interface. This platform has been used to design, implement, and
        uniformly evaluate nine scheduling strategies with a mixture of
        concurrent real applications with varying communication
        intensities. This includes five new schemes (Periodic Boost,
        Periodic Boost with Spin Block, Spin Yield, Periodic Boost with
        Spin Yield, Dynamic Coscheduling with Spin Yield) that are
        presented in this paper. In addition to our evaluations of the
        pros and cons of each mechanism in terms of throughput, response
        time, CPU utilization, and Fairness, it is shown that Periodic
        Boost is a promising approach for scheduling processes on a NOW.}
}


% Parallel solver for 3D Marangoni flow in liquid bridges (SMAC method);
% MPI is used for interprocessor communication.
@Article{Lappa99:mpi-app,
  author   = {M. Lappa and R. Savino},
  title    = {Parallel solution of three-dimensional {M}arangoni flow in
              liquid bridges},
  journal  = {International Journal for Numerical Methods in Fluids},
  year     = 1999,
  volume   = 31,
  number   = 6,
  month    = NOV,
  pages    = {911--935},
  abstract = {This paper describes the implementation and performances of a
        parallel solver for the direct numerical simulation of the
        three-dimensional and time-dependent Navier-Stokes equations on
        distributed-memory, massively parallel computers. The
        feasibility of this approach to study Marangoni flow instability
        in half zone liquid bridges is examined. The results indicate
        that the incompressible, non-linear Navier-Stokes problem,
        governing the Marangoni flows behavior, can effectively be
        parallelized on a distributed memory parallel machine by
        remapping the distributed data structure. The numerical code is
        based on a three-dimensional Simplified Marker and Cell (SMAC)
        primitive variable method applied to a staggered finite
        difference grid. Using this method, the problem is split into
        two problems, one parabolic and the other elliptic. A parallel
        algorithm, explicit in time, is utilized to solve the parabolic
        equations. A parallel multisplitting kernel is introduced for
        the solution of the pseudo pressure elliptic equation,
        representing the most time-consuming part of the algorithm. A
        grid-partition strategy is used in the parallel implementations
        of both the parabolic equations and the multisplitting elliptic
        kernel. A Message Passing Interface (MPI) is coded for the
        boundary conditions; this protocol is portable to different
        systems supporting this interface for interprocessor
        communications. Numerical experiments illustrate good numerical
        properties and parallel efficiency. In particular, good
        scalability on a large number of processors can be achieved as
        long as the granularity of the parallel application is not too
        small. However, increasing the number of processors, the
        Speed-Up is ever smaller than the ideal linear Speed-Up. The
        communication timings indicate that complex practical
        calculations, such as the solutions of the Navier-Stokes
        equations for the numerical simulation of the instability of
        Marangoni flows, can be expected to run on a massively parallel
        machine with good efficiency.}
}
 
% Parallel Fourier-Chebyshev collocation method for flow and heat transfer;
% the distributed-memory version uses MPI and SHMEM.
@Article{hill99:mpi-app,
  author   = {R. W. Hill and K. S. Ball},
  title    = {Parallel implementation of a {F}ourier-{C}hebyshev
              collocation method for incompressible fluid flow and heat
              transfer},
  journal  = {Numerical Heat Transfer Part B},
  year     = 1999,
  volume   = 36,
  number   = 3,
  month    = {Oct-Nov},
  pages    = {309--329},
  abstract = {A Fourier-Chebyshev collocation spectral method is parallelized
        to simulate the three-dimensional unsteady flow and heat transfer
        inside a cylindrical enclosure. Two solution approaches using
        different techniques for determining the pressure field and
        enforcing mass conservation are presented for shared memory
        applications using Cray directives and for distributed memory
        applications using MPI and SHMEM message passing libraries.
        Matrix diagonalization is employed for solving the pressure
        Poisson equation and Helmholtz equations for the velocity
        components and temperature. The parallelization approach is
        described and scaling results are presented for both platform
        types.}
}

   
% MPOOL: an object-oriented extension to the MPI library built on units,
% groups and schemes.
@Article{poggi:mpi-extension,
  author   = {A. Poggi and G. Destri},
  title    = {{MPOOL}: an object-oriented library for task composition and
              co-ordination},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 14,
  month    = DEC,
  pages    = {835--848},
  abstract = {
        MPOOL is an object-oriented extension to the MPI library, based
        on three categories of objects, called units, groups and
        schemes. Units are active objects composed of data (state) and
        procedures (like traditional passive objects), but with the
        additional ability to store incoming messages in a queue while
        they are active and to send messages in parallel to other units;
        moreover, different units may be active simultaneously. Groups
        and schemes are passive objects used for the composition of units
        and the co-ordination of their actions. Groups manage collective
        communications and synchronization operations such as barriers.
        Schemes compose units' actions through the use of a set of
        constructs derived by path expressions.}
}

% PTETRAD: parallel unstructured tetrahedral mesh adaptation, implemented
% in C with MPI.
@Article{sel99:mpi-app,
  author   = {P. M. Selwood and M. Berzins},
  title    = {Parallel unstructured tetrahedral mesh adaptation:
              algorithms, implementation and scalability},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 14,
  month    = DEC,
  pages    = {863--884},
  abstract = {
        The use of unstructured adaptive tetrahedral meshes in the
        solution of transient flows poses a challenge for parallel
        computing due to the irregular and frequently changing nature of
        the data and its distribution. A parallel mesh adaptation
        algorithm, PTETRAD, for unstructured tetrahedral meshes (based
        on the serial code TETRAD) is described and analysed. The
        portable implementation of the parallel code in C with MPI is
        described and discussed. The scalability of the code is
        considered, analysed and illustrated by numerical experiments
        using a shock wave diffraction problem.}
}

% SPMD parallel hierarchical radiosity algorithm for complex scenes,
% based on MPI.
@Article{meme:mpi-graphics-app,
  author   = {D. Meneveaux and K. Bouatouch},
  title    = {Synchronisation and load balancing for parallel hierarchical
              radiosity of complex scenes on a heterogeneous computer
              network},
  journal  = {Computer Graphics Forum},
  year     = 1999,
  volume   = 18,
  number   = 4,
  month    = DEC,
  pages    = {201--212},
  abstract = {In this paper we propose a SPMD parallel hierarchical radiosity
        algorithm relying on a novel partitioning method which may
        apply to any kind of architectural scene. This algorithm is
        based on MPI (Message Passing Interface), a communication library
        which allows the use of either a heterogeneous set of concurrent
        computers or a parallel computer or both. The database is stored
        on a common directory and accessed by all the processors
        (through NFS in case of a network of computers). As the
        objective is to handle complex scenes such as building
        interiors, to cope with the problem of memory size, only a
        subset of the database resides in memory of each processor. This
        subset is determined with the help of a partitioning into 3D
        cells, clustering and visibility calculations. A graph expressing
        visibility between the resulting clusters is determined,
        partitioned (with a new method based on classification of
        K-means type) and distributed amongst all the processors.
        Each processor is responsible for gathering energy (using the
        Gauss-Seidel method) only for its subset of clusters. In order
        to reduce the disk transfers due to downloading these subsets of
        clusters, we use an ordering strategy based on the traveling
        salesman algorithm. Dynamic load balancing relies on a task
        stealing approach while termination is detected by configuring
        the processors into a ring and moving a token around this ring.
        The parallel iterative resolution is of group iterative type.
        Its mathematical convergence is proven in the appendix.}
}

% Distributed-memory element-by-element scheme for semiconductor device
% simulation; implemented in Fortran90 with MPI.
@Article{bova2000:mpi-app,
  author   = {S. W. Bova and G. F. Carey},
  title    = {A distributed memory parallel element-by-element scheme for
              semiconductor device simulation},
  journal  = {Computer Methods in Applied Mechanics and Engineering},
  year     = 2000,
  volume   = 181,
  number   = 4,
  pages    = {403--423},
  abstract = {
        A domain decomposition and parallel element-by-element (EBE)
        scheme is developed for semiconductor device simulation modeled
        by the drift-diffusion (DD) equations. A classical Gummel
        iterative decoupling of the potential and carrier transport
        equations is applied on an unstructured triangulation. The
        distributed memory EBE scheme is formulated for a Galerkin
        finite element approximation of the nonlinear Poisson problem,
        and a modified Scharfetter-Gummel method is used for the carrier
        transport problem. The resulting sequences of symmetric and
        nonsymmetric linear systems are solved via preconditioned Krylov
        methods. Unstructured triangular grids are used to permit
        grading of the mesh, which is then partitioned to processor
        subdomains with appropriate data structures for message passing.
        Details of the parallel algorithm and data structure are
        provided. The scheme is implemented in Fortran90 with MPI and
        performance results are presented for a representative MOSFET on
        an IBM SP, a CRAY T3E, and an SGI/CRAY Origin2000.}
}

% Harbor wave response code converted to a dual-level parallel code
% combining MPI and OpenMP.
@Article{bova2000:mpi-openmp-app,
  author   = {S. W. Bova and C. P. Breshears and C. E. Cuicchi and
              Z. Demirbilek and H. A. Gabb},
  title    = {Dual-level parallel analysis of harbor wave response using
              {MPI} and {OpenMP}},
  journal  = {International Journal of High Performance Computing
              Applications},
  year     = 2000,
  volume   = 14,
  number   = 1,
  pages    = {49--64},
  abstract = {The authors describe their experiences converting an existing
        serial production code to a parallel code combining both MPI and
        OpenMP. Such dual-level parallel codes will be able to take full
        advantage of the emerging class of high performance computer
        architectures using small clusters of shared-memory processors
        connected via a message-passing network. While the focus
        is restricted to a harbor response simulation code, the
        techniques presented herein are appropriate for a broad class of
        applications that explore a parameter space. The code
        modifications reduced the execution time of one test case from
        3100 minutes on a single CPU to just over 12 minutes on 256
        CPUs. Results demonstrate that dual-level parallelism allows
        substantial increases in model resolution combined with
        improvements in simulation turnaround time but, contrary to
        conventional wisdom, requires very little source code alteration.}
}

% Block-cyclic array redistribution between processor sets; implemented
% using C and MPI on an IBM SP2.
@Article{park99:mpi-app,
  author   = {N. Park and V. K. Prasanna and C. S. Raghavendra},
  title    = {Efficient algorithms for block-cyclic array redistribution
              between processor sets},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = 1999,
  volume   = 10,
  number   = 12,
  month    = DEC,
  pages    = {1217--1240},
  abstract = {Run-time array redistribution is necessary to enhance the
        performance of parallel programs on distributed memory
        supercomputers. In this paper, we present an efficient algorithm
        for array redistribution from cyclic(x) on P processors to
        cyclic(Kx) on Q processors. The algorithm reduces the overall
        time for communication by considering the data transfer,
        communication schedule, and index computation costs. The
        proposed algorithm is based on a generalized circulant matrix
        formalism. Our algorithm generates a schedule that minimizes the
        number of communication steps and eliminates node contention in
        each communication step. The network bandwidth is fully utilized
        by ensuring that equal-sized messages are transferred in each
        communication step. Furthermore, the time to compute the schedule
        and the index sets is significantly smaller. It takes O(max(P,
        Q)) time and is less than 1 percent of the data transfer time.
        In comparison, the schedule computation time using the
        state-of-the-art scheme (which is based on the bipartite
        matching scheme) is 10 to 50 percent of the data transfer time
        for similar problem sizes. Therefore, our proposed algorithm is
        suitable for run-time array redistribution. To evaluate the
        performance of our scheme, we have implemented the algorithm
        using C and MPI on an IBM SP2. Results show that our algorithm
        performs better than the previous algorithms with respect to the
        total redistribution time, which includes the time for data
        transfer, schedule, and index computation.}
}

% Parallel meshless (RKPM) explicit dynamic analysis; explicit MPI message
% passing is used between partitions.
@Article{dan00:mpi-app,
  author   = {K. T. Danielson and S. Hao and W. K. Liu and R. A. Uras and
              S. F. Li},
  title    = {Parallel computation of meshless methods for explicit
              dynamic analysis},
  journal  = {International Journal for Numerical Methods in
              Engineering},
  year     = 2000,
  volume   = 47,
  number   = 7,
  month    = MAR,
  pages    = {1323--1341},
  abstract = {A parallel computational implementation of modern meshless
        methods is presented for explicit dynamic analysis. The
        procedures are demonstrated by application of the Reproducing
        Kernel Particle Method (RKPM). Aspects of a coarse grain
        parallel paradigm are detailed for a Lagrangian formulation
        using model partitioning. Integration points are uniquely
        defined on separate processors and particle definitions are
        duplicated, as necessary, so that all support particles for each
        point are defined locally on the corresponding processor.
        Several partitioning schemes are considered and a reduced
        graph-based procedure is presented. Partitioning issues are
        discussed and procedures to accommodate essential boundary
        conditions in parallel are presented. Explicit MPI message
        passing statements are used for all communications among
        partitions on different processors. The effectiveness of the
        procedure is demonstrated by highly deformable inelastic example
        problems.}
}

% Two-level parallel Genetic Algorithm for optimum shape design; based on
% MPI and its process group features.
@Article{mar00:mpi-app,
  author   = {N. Marco and S. Lanteri},
  title    = {A two-level parallelization strategy for Genetic Algorithms
              applied to optimum shape design},
  journal  = {Parallel Computing},
  year     = 2000,
  volume   = 26,
  number   = 4,
  month    = MAR,
  pages    = {377--397},
  abstract = {This paper presents a two-level strategy for the parallelization
        of a Genetic Algorithm (GA) coupled to a compressible flow
        solver designed on unstructured triangular meshes. The parallel
        implementation is based on MPI and makes use of the process
        group features of this environment. The resulting algorithm is
        used for the optimum shape design of aerodynamic
        configurations. Numerical and performance results are presented
        for the optimization of two-dimensional airfoils for
        calculations performed on the following systems: an SGI Origin
        2000 and an IBM SP-2 MIMD systems; a Pentium Pro (P6/200 MHz)
        cluster where the interconnection is realized through a
        FastEthernet (100 Mbits/s) switch.}
}


% Design of magnetic resonance magnets on parallel computers; the codes
% use the MPI message passing standard.
@Article{An00:mpi-app,
  author   = {R. E. Ansorge and T. A. Carpenter and L. D. Hall and
              N. R. Shaw and G. B. Williams},
  title    = {Use of parallel supercomputing to design magnetic resonance
              systems},
  journal  = {IEEE Transactions on Applied Superconductivity},
  year     = 2000,
  volume   = 10,
  number   = 1,
  month    = MAR,
  pages    = {1368--1371},
  abstract = {Historically analytical methods have been the preferred approach
        to designing magnets and gradient sets for magnetic resonance
        systems. Such methods are computationally efficient but are
        approximate, particularly away from the axis of symmetry.
        Alternative methods, which are much more computationally
        intensive, for example Genetic Algorithms, are now becoming
        practical. Such methods have the advantage that they can be used
        for unconventional designs and for the inclusion of
        nonanalytical design constraints such as real-world engineering
        and cost limitations. Gradient coil designs have been published
        previously [1]-[3]. Now with the availability of more powerful
        computers, more ambitious designs can be undertaken using
        parallel computing methods. The use of a Hitachi SR2201
        supercomputer and clusters of Linux PCs (Beowulf) to develop a
        short whole body MRI magnet for clinical applications are
        reported on. An important feature of these computer codes is
        that they have been developed to run on parallel computing
        systems using the MPI message passing standard. MPI is an
        accepted industry standard, which means that these codes can
        readily be ported to different parallel computers. Previous
        success has been achieved in using MPI for a variety of other
        Medical Imaging problems [4].}
}

% Race detection and deterministic replay with MPI (Euro-Par'95).
% Accented letters are written as brace-wrapped BibTeX special characters
% so that name sorting and label generation treat each as one letter.
@InProceedings{cle95:mpi-debugging,
  author     = {C. Cl{\'e}men{\c{c}}on and J. Fritscher and M. J. Meehan and
                R. R{\"u}hl},
  title      = {An Implementation of Race Detection and Deterministic Replay
                with {MPI}},
  booktitle  = {Proceedings of Euro-Par'95},
  number     = 966,
  series     = {LNCS},
  year       = 1995,
  publisher  = {Springer-Verlag},
  month      = AUG,
  pages      = {155--166},
  meetingloc = {Stockholm, Sweden}
}


% Meshless modeling of penetrator targets in a parallel explicit transient
% dynamics code; MPI is used for all interprocessor communication.
@Article{danad00:mpi-app,
  author   = {K. T. Danielson and M. D. Adley},
  title    = {A meshless treatment of three-dimensional penetrator targets
              for parallel computation},
  journal  = {Computational Mechanics},
  year     = 2000,
  volume   = 25,
  number   = 3,
  month    = MAR,
  pages    = {267--273},
  abstract = {A meshless modeling procedure of three-dimensional targets for
        penetration analysis on parallel computing systems is described.
        Buried structures are modeled by arbitrary layers of concrete
        and geologic materials, and the projectile is modeled by
        standard finite elements. Penetration resistance of the buried
        structure is provided by functions derived from principles of
        dynamic cavity expansion. The resistance functions are
        influenced by the target material properties and projectile
        kinematics. Additional capabilities accommodate the varying
        structural and geometrical characteristics of the target.
        Coupling between the finite elements and the meshless target
        model is made by applying resistance loads to elements on the
        outer surface of the projectile mesh. Penetration experiments
        verify the approach. In this manner, the target is effectively
        modeled and the strategy is well suited for parallel processing.
        The procedure is incorporated into an explicit transient
        dynamics code, using mesh partitioning for a coarse grain
        parallel processing paradigm. Message Passing Interface (MPI) is
        used for all interprocessor communication. Large detailed finite
        element analyses of projectiles are performed on up to several
        hundred processors with excellent scalability. The efficiency of
        the strategy is demonstrated by analyses executed on several
        types of scalable computing platforms.}
}
 
% Lattice QCD on an 8-node Alpha/Linux cluster; PVM, LAM, and MPICH were
% tested as the parallel programming environment.
@Article{kim00:mpi-app,
  author   = {S. Kim},
  title    = {Lattice {QCD} on a {B}eowulf cluster},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  year     = 2000,
  volume   = 83,
  number   = 4,
  month    = APR,
  pages    = {807--809},
  abstract = {
        Using commodity component personal computers based on Alpha
        processor and commodity network devices and a switch, we built
        an 8-node parallel computer. GNU/Linux is chosen as an operating
        system and message passing libraries such as PVM, LAM, and MPICH
        have been tested as a parallel programming environment. We
        discuss our lattice QCD project for a heavy quark system on this
        computer.}
}

% Total energy estimation for Pd/Al bimetallic surfaces with the density
% functional method; MPI is used on a massively parallel computer.
@Article{wat00:mpi-app,
  author   = {N. Watari and S. Ohnishi and H. Onishi and Y. Iwasawa},
  title    = {Total energy estimation for {Pd/Al} bimetallic surfaces by a
              parallel computation scheme},
  journal  = {Japanese Journal of Applied Physics Part 1---Regular Papers
              Short Notes \& Review Papers},
  year     = 2000,
  volume   = 39,
  number   = {3A},
  month    = MAR,
  pages    = {1457--1461},
  abstract = {
        A numerical calculation scheme for the multicenter problem in
        large molecules and clusters is presented by applying the
        message-passing interface (MPI) in a massively parallel
        computer that uses the density functional method. The
        multicenter problem associated with the Coulomb singularity of
        an atom is efficiently treated by the parallel processors by
        allocating several atoms into each processor element (PE). The
        order N-2/P tuning is obtained for the Coulomb energy
        calculation by using the MPI which transfers Coulomb potential
        field between PE's. This method is applied to estimate the total
        energy of the reconstructed Al/Pd bimetallic surface. The energy
        estimation by the charge density of a superposition of isolated
        atomic charge fragments predict a stabilization caused by the
        reconstruction, being consistent with a self-consistent-field
        (SCF) cluster calculation of the bimetallic surface.}
}

@Article{rod00:mpi-model,
  author   = {C. Rodriguez and J. L. Roda and F. Sande and D. G. Morales
              and F. Almeida},
  title    = {A new parallel model for the analysis of asynchronous
              algorithms},
  journal  = {Parallel Computing},
  year     = 2000,
  volume   = 26,
  number   = 6,
  month    = MAY,
  pages    = {753--767},
  abstract = {The BSP model barrier synchronization imposes some limits both
              in the range of available algorithms and also in their
              performance. Although BSP programs can be translated to MPI/PVM
              programs, the counterpart is not true. The asynchronous nature
              of some MPI/PVM programs does not easily fit inside the BSP
              model. Through the suppression of barriers and the
              generalization of the concept of superstep we propose two new
              models, the BSP-like and the BSP without barriers (BSPWB) models.
              While the BSP-like extends the BSP* model to programs written
              using collective operations, the more general BSPWB model admits
              the MPI/PVM parallel asynchronous programming style. The
              parameters of the models and their quality are evaluated on four
              standard parallel platforms: the Cray T3E, the IBM SP2, the
              Origin 2000 and the Digital Alpha Server 8400. The study shows
              that the time spent in an h-relation is more independent on the
              number of processors than on the communication pattern. We
              illustrate the use of these BSP extensions through two
              problem-solving paradigms: the Nested Parallel Recursive Divide
              and Conquer Paradigm and the Virtual Pipeline Dynamic
              Programming Paradigm. The proposed paradigms explain how nested
              parallelism and processor virtualization can be introduced in
              MPI and PVM without having any negative impact in the
              performance and model accuracy. The prediction of the
              communication times is robust even for problems, where
              communication is dominated by small messages.}
}

@Article{Lie00:mpi-app,
  author   = {C. C. Liew and T. Ikeshoji and N. Saito and H. Inomata},
  title    = {Domain-shifting algorithm: A new domain-decomposition scheme
              for molecular dynamics simulations on parallel computers},
  journal  = {Progress of Theoretical Physics Supplement},
  year     = 2000,
  number   = 138,
  pages    = {205--210},
  abstract = {A domain is conventionally defined as a stationary sub-region of
              the simulated system in a domain-decomposition scheme for
              molecular dynamics (MD) simulations on parallel computers. We
              proposed an algorithm where all domains pre-assigned to
              processors are shifted to a particular direction, beyond the
              displacement of particles in the system during a time-step or a
              period of small time-steps; as a result, it allows us to reduce
              the data transfer partners in the particle re-allocation
              procedure. We also proposed a systematic link-cell method that
              allows us to make use of small domain and reduces the amount of
              data to be transferred for updating the positions and forces of
              particles, in comparison to the conventional schemes. Benchmark
              studies of a three-dimensional Lennard-Jones system have been
              carried out using a parallel MD simulation program implemented
              via a MPI-based message-passing interface on several parallel
              computers. A result on a 16-CPU parallel computer system shows
              that the new scheme allows us to achieve a high parallel
              efficiency (over 75\%) for MD simulations of a system with
              relatively small number of particles per processor (N/P $<$ 500).}
}


@Article{decyk00:mpi-app,
  author   = {V. K. Decyk and D. E. Dauger and P. R. Kokelaar},
  title    = {Plasma physics calculations on a parallel {M}acintosh
              cluster},
  journal  = {Physica Scripta},
  year     = 2000,
  volume   = {T84},
  pages    = {85--88},
  abstract = {We have constructed a parallel cluster consisting of 16 Apple
              Macintosh G3 computers running the MacOS, and achieved very good
              performance on numerically intensive, parallel plasma
              particle-in-cell simulations. A subset of the MPI
              message-passing library was implemented in Fortran77 and C. This
              library enabled us to port code, without modification, from
              other parallel processors to the Macintosh cluster. For large
              problems where message packets are large and relatively few in
              number, performance of 50-150 MFlops/node is possible, depending
              on the problem. This is fast enough that 3D calculations can be
              routinely done. Unlike Unix-based clusters, no special expertise
              in operating systems is required to build and run the cluster.
              Full details are available on our web site:
              http://exodus.physics.ucla.edu/appleseed/.}
}

@Article{ma99:mpi-app,
  author   = {S. B. Ma},
  title    = {Comparisons of the parallel preconditioners on the
              {CRAY-T3E} for large nonsymmetric linear systems},
  journal  = {International Journal of High Speed Computing},
  year     = 1999,
  volume   = 10,
  number   = 3,
  month    = SEP,
  pages    = {285--300},
  abstract = {In this paper we consider five types of parallel preconditioners
              for solving large sparse nonsymmetric linear systems on the
              CRAY-T3E. They are ILU(0) in the wavefront ordering, ILU(0) in
              the multi-coloring ordering, SSOR in the wavefront ordering, the
              SPAI (SParse Approximate Inverse) preconditioner, and finally
              Multi-color Block SOR preconditioner. The ILU(0) is known to be
              robust and the wavefront ordering naturally exploits the
              parallelism but has a limited speedup due to the nonuniform
              lengths of the wavefronts. Multi-coloring is an efficient way of
              introducing the parallelism of order(N), where N is the order of
              the matrix but the convergence rate often deteriorates. The SPAI
              type preconditioner is inherently parallel and is gaining
              popularity. Finally, for the 5-point Laplacian matrix SOR method
              is known to have a nondeteriorating rate of convergence when the
              multi-coloring order is adopted. Also, Block SOR is expected to
              incur less communication overheads in a message-passing machine.
              Hence, Multi-Color Block SOR method is expected to have a good
              performance. Experiments were conducted for the Finite
              Difference discretizations of two problems with various
              meshsizes varying up to 1024 x 1024. MPI library was used for
              interprocess communications. The results show that ILU(0) in the
              multi-coloring ordering gives the best performance.}
}
 
@Article{pra00:mpi-sim,
  author   = {S. Prakash and E. Deelman and R. Bagrodia},
  title    = {Asynchronous parallel simulation of parallel programs},
  journal  = {IEEE Transactions on Software Engineering},
  year     = 2000,
  volume   = 26,
  number   = 5,
  month    = MAY,
  pages    = {385--400},
  abstract = {Parallel simulation of parallel programs for large datasets has
              been shown to offer significant reduction in the execution time
              of many discrete event models. This paper describes the design
              and implementation of MPI-SIM, a library for the execution
              driven parallel simulation of task and data parallel programs.
              MPI-SIM can be used to predict the performance of existing
              programs written using MPI for message-passing, or written in
              UC, a data parallel language, compiled to use message-passing.
              The simulation models can be executed sequentially or in
              parallel. Parallel execution of the models are synchronized using
              a set of asynchronous conservative protocols. This paper
              demonstrates how protocol performance is improved by the use of
              application-level, runtime analysis. The analysis targets the
              communication patterns of the application. We show the
              application-level analysis for message passing and data parallel
              languages. We present the validation and performance results for
              the simulator for a set of applications that include the NAS
              Parallel Benchmark suite. The application-level optimization
              described in this paper yielded significant performance
              improvements in the simulation of parallel programs, and in some
              cases completely eliminated the synchronizations in the parallel
              execution of the simulation model.}
}

@Article{gram00:mpi-alg,
  author   = {M. D. Grammatikakis and S. Liesche},
  title    = {Priority queues and sorting methods for parallel simulation},
  journal  = {IEEE Transactions on Software Engineering},
  year     = 2000,
  volume   = 26,
  number   = 5,
  month    = MAY,
  pages    = {401--422},
  abstract = {We examine the design, implementation, and experimental analysis
              of parallel priority queues for device and network simulation.
              We consider: (1) distributed splay trees using MPI, (2) concurrent
              heaps using shared memory atomic locks, and (3) a new, more
              general concurrent data structure based on distributed sorted
              lists, which is designed to provide dynamically balanced
              work allocation (with automatic or manual control) and efficient
              use of shared memory resources. We evaluate performance for all
              three data structures on a Cray-T3E900 system at KFA-Julich. Our
              comparisons are based on simulations of single buffers and a 64
              x 64 packet switch which supports multicasting. In all
              implementations, PEs monitor traffic at their preassigned
              input/output ports, while priority queue elements are
              distributed across the Cray-T3E virtual shared memory. Our
              experiments with up to 60,000 packets and two to 64 PEs indicate
              that concurrent priority queues perform much better than
              distributed ones. Both concurrent implementations have
              comparable performance, while our new data structure uses less
              memory and has been further optimized. We also consider parallel
              simulation for symmetric networks by sorting integer conflict
              functions and implementing an interesting packet indexing
              scheme. The optimized message passing network simulator can
              process similar to 500K packet moves in one second, with an
              efficiency that exceeds similar to 50 percent for a few
              thousands packets on the Cray-T3E with 32 PEs. All developed
              data structures now form a parallel library. Although our
              concurrent implementations use the Cray-T3E ShMem library,
              portability can be derived from Open-MP or MPI-2 standard
              libraries, which will provide support for one-way communication
              and shared memory lock mechanisms.}
}

@Article{bad00:mpi-app,
  author   = {S. B. Baden and S. J. Fink},
  title    = {A programming methodology for dual-tier multicomputers},
  journal  = {IEEE Transactions on Software Engineering},
  year     = 2000,
  volume   = 26,
  number   = 3,
  month    = MAR,
  pages    = {212--226},
  abstract = {Hierarchically organized ensembles of shared memory
              multiprocessors possess a richer and more complex model of
              locality than previous generation multicomputers with single
              processor nodes. These dual-tier computers introduce many new
              factors into the programmer's performance model. We present a
              methodology for implementing block-structured numerical
              applications on dual-tier computers and a run-time
              infrastructure, called KeLP2, that implements the methodology.
              KeLP2 supports two levels of locality and parallelism via
              hierarchical SPMD control flow, run-time geometric meta-data,
              and asynchronous collective communication. KeLP applications can
              effectively overlap communication with computation under
              conditions where nonblocking point-to-point message passing
              fails to do so. KeLP's abstractions hide considerable detail
              without sacrificing performance and dual-tier applications
              written in KeLP consistently outperform equivalent single-tier
              implementations written in MPI. We describe the KeLP2 model and
              show how it facilitates the implementation of five
              block-structured applications specially formulated to hide
              communication latency on dual-tiered architectures. We support
              our arguments with empirical data from applications running on
              various single- and dual-tier multicomputers. KeLP2 supports a
              migration path from single-tier to dual-tier platforms and we
              illustrate this capability with a detailed programming example.}
}
 
@Article{gor00:mpi-theory,
  author   = {S. Gorlatch},
  title    = {Toward formally-based design of message passing programs},
  journal  = {IEEE Transactions on Software Engineering},
  year     = 2000,
  volume   = 26,
  number   = 3,
  month    = MAR,
  pages    = {276--288},
  abstract = {We present a systematic approach to the development of message
              passing programs. Our programming model is SPMD, with
              communications restricted to collective operations: scan,
              reduction, gather, etc. The design process in such an
              architecture-independent language is based on
              correctness-preserving transformation rules, provable in a
              formal functional framework. We develop a set of design rules for
              composition and decomposition. For example, scan followed by
              reduction is replaced by a single reduction, and global
              reduction is decomposed into two faster operations. The impact
              of the design rules on the target performance is estimated
              analytically and tested in machine experiments. As a case study,
              we design two provably correct, efficient programs using the
              Message Passing Interface (MPI) for the famous maximum segment
              sum problem, starting from an intuitive, but inefficient,
              algorithm specification.}
}
 
@Article{hos00:mpi-app,
  author   = {A. Hossinger and E. Langer and S. Selberherr},
  title    = {Parallelization of a {M}onte {C}arlo ion implantation
              simulator},
  journal  = {IEEE Transactions on Computer-Aided Design of Integrated
              Circuits and Systems},
  year     = 2000,
  volume   = 19,
  number   = 5,
  month    = MAY,
  pages    = {560--567},
  abstract = {We present a parallelization method based on message passing
              interface (MPI) for a Monte Carlo program for two-dimensional
              and three-dimensional (3-D) simulation of ion implantations. We
              use a master-slave strategy where the master process
              synchronizes the slaves and performs the input-output
              operations, while the slaves perform the physical simulation.
              For this method the simulation domain is geometrically
              distributed among several CPU's which have to exchange only very
              little information during the simulation. Thereby, the
              communication overhead between the CPU's is kept so low that it
              has almost no influence on the performance gain even if a
              standard network of workstations is used instead of a massively
              parallel computer to perform the simulation. We have optimized
              the performance gain by identifying bottlenecks of this strategy
              when it is applied to arbitrary geometries consisting of various
              materials. This requires the application of different physical
              models within the simulation domain and makes it impossible to
              determine a reasonable domain distribution before starting the
              simulation. Due to a feedback between master and slaves by
              on-line performance measurements, we obtain an almost linear
              performance gain on a cluster of workstations with just slightly
              varying processor loads. Besides the increase in performance,
              the parallelization method also achieves a distribution of the
              required memory. This allows 3-D simulations on a cluster of
              workstations, where each single machines would not have enough
              memory to perform the simulation on its own.}
}
 

@Article{lee00:mpi-app,
  author   = {J. Y. Lee and J. Pillardy and C. Czaplewski and Y. Arnautova
              and D. R. Ripoll and A. Liwo and K. D. Gibson and
              R. J. Wawak and H. A. Scheraga},
  title    = {Efficient parallel algorithms in global optimization of
              potential energy functions for peptides, proteins, and
              crystals},
  journal  = {Computer Physics Communications},
  year     = 2000,
  volume   = 128,
  number   = {1--2},
  month    = JUN,
  pages    = {399--411},
  abstract = {Global optimization is playing an increasing role in physics,
              chemistry, and biophysical chemistry. One of the most important
              applications of global optimization is to find the global minima
              of the potential energy of molecules or molecular assemblies,
              such as crystals. The solution of this problem typically
              requires huge computational effort. Even the fastest processor
              available is not fast enough to carry out this kind of
              computation in real time for the problems of real interest,
              e.g., protein and crystal structure prediction. One way to
              circumvent this problem is to take advantage of massively
              parallel computing. In this paper, we provide several examples
              of parallel implementations of global optimization algorithms
              developed in our laboratory. All of these examples follow the
              master/worker approach. Most of the methods are parallelized on
              the algorithmic (coarse-grain) level and one example of
              fine-grain parallelism is given, in which the function
              evaluation itself is computationally expensive. All parallel
              algorithms were initially implemented on an IBM/SP2
              (distributed-memory) machine. In all cases, however, message
              passing is handled through the standard Message Passing
              Interface (MPI); consequently the algorithms can also be
              implemented on any distributed- or shared-memory system that
              runs MPI. The efficiency of these implementations is discussed.}
}


@Article{Sri00:mpi-app,
  author   = {J. Srinivasan and Y. L. Volobuev and S. L. Mielke and
              D. G. Truhlar},
  title    = {Parallel {F}ourier Path-integral {M}onte {C}arlo
              calculations of absolute free energies and chemical
              equilibria},
  journal  = {Computer Physics Communications},
  year     = 2000,
  volume   = 128,
  number   = {1--2},
  month    = JUN,
  pages    = {446--464},
  abstract = {We present a parallel implementation of the Fourier Path
              Integral Monte Carlo method for calculating the absolute free
              energies of many-body systems. The implementation adopts the
              message-passing paradigm for parallelization, with the use of the
              Message Passing Interface (MPI) libraries. A portable computer
              program, written using Fortran 90, has been developed and tested
              on a variety of platforms such as the SGI Origin, the IBM SP,
              and the Cray T3D and T3E. We have used the program to
              demonstrate the efficacy of importance sampling in configuration
              space. We have also used the program to calculate the partition
              function, and hence the absolute free energies, of triatomic
              molecules and four-body systems.}
}
 
@article{pra99:mpi-app,
  author   = {B. Prameela and L. M. Patnaik},
  title    = {Parallel implementation of alternate quadrant interlocking
              factorisation method on star topology},
  journal  = {International Journal of High Speed Computing},
  volume   = 10,
  number   = 4,
  pages    = {361--378},
  month    = DEC,
  year     = 1999,
  abstract = {This paper discusses the parallel implementation of the solution
              of a set of linear equations using the Alternative Quadrant
              Interlocking Factorisation Methods (AQIF), on a star topology.
              Both the AQIF and LU decomposition methods are mapped onto star
              topology on an IBM SP2 system, with MPI as the internode
              communicator. Performance parameters such as speedup, efficiency
              have been obtained through experimental and theoretical means.
              The studies demonstrate (i) a mismatch of 15\% between the
              theoretical and experimental results, (ii) scalability of the
              AQIF algorithm, and (iii) faster executing AQIF algorithm.}
}
 

@Article{Roy00:mpi-app,
  author   = {S. Roy and R. Y. Jin and V. Chaudhary and W. L. Hase},
  title    = {Parallel molecular dynamics simulations of
              alkane/hydroxylated alpha-aluminum oxide interfaces},
  journal  = {Computer Physics Communications},
  year     = 2000,
  volume   = 128,
  number   = {1--2},
  month    = JUN,
  pages    = {210--218},
  abstract = {In this paper we describe a practical implementation of parallel
              computation for the molecular dynamics (MD) simulation of an
              alkane/aluminum oxide interface. A serial MD program was
              converted into a parallel code utilizing the message passing
              interface (MPI). This code was evaluated on a twelve processor
              symmetrical multiprocessor as well as on a cluster of four
              processor SMPs. A maximum speedup of 5.25 was achieved with
              twelve processors on the large shared memory machine. The cluster
              performance saturated at a speedup of 4.5 with two nodes. High
              communication costs and considerable load imbalance in the
              system were identified as areas that need further investigation
              for obtaining better performance.}
}

 
@Article{fur00:mpi-app,
  author   = {T. R. Furlani and J. Kong and P. M. W. Gill},
  title    = {Parallelization of {SCF} calculations within {Q-Chem}},
  journal  = {Computer Physics Communications},
  year     = 2000,
  volume   = 128,
  number   = {1--2},
  month    = JUN,
  pages    = {170--177},
  abstract = {We have incorporated MPI based parallelism with dynamic load
              balance into the Hartree-Fock and DFT modules of Q-Chem. A
              series of benchmark calculations consisting of both single point
              energy and gradient calculations were carried out to gauge the
              performance of the parallel modules. Calculations were carried
              out on two different parallel computers, namely a shared memory
              Silicon Graphics Origin2000 and a distributed memory Cray T3E,
              to show the flexibility of the code and demonstrate the great
              utility of MPI. Scalability for the DFT and Hartree-Fock modules
              is demonstrated for up to 64 processors.}
}


@Article{Fle00:mpi-app,
  author   = {G. D. Fletcher and M. W. Schmidt and M. S. Gordon},
  title    = {The Distributed Data Interface in {GAMESS}},
  journal  = {Computer Physics Communications},
  year     = 2000,
  volume   = 128,
  number   = {1--2},
  month    = JUN,
  pages    = {190--200},
  abstract = {The Distributed Data Interface to permit storage of large data
              arrays in the aggregate memory of distributed memory, message
              passing computer systems is described. The design of this
              relatively small library is discussed, in regard to its
              implementation over SHMEM, MPI-1, or socket based message
              libraries. The good performance of a MP2 program using DDI is
              demonstrated on both PC and workstation cluster computers, and
              some details of the resulting message traffic are presented.}
}
 

@Article{She00:mpi-app,
  author   = {A. I. Shestakov and M. K. Prasad and J. L. Milovich and
              N. A. Gentile and J. F. Painter and G. Furnish},
  title    = {The radiation-hydrodynamic {ICF3D} code},
  journal  = {Computer Methods in Applied Mechanics and Engineering},
  year     = 2000,
  volume   = 187,
  number   = {1--2},
  pages    = {181--200},
  abstract = {We describe the 3D high temperature plasma simulation computer
              code ICF3D which is being developed at the Lawrence Livermore
              National Laboratory. The code is portable; it runs on a variety
              of platforms: uniprocessors, SMPs, and MPPs. It parallelizes by
              decomposing physical space into disjoint subdomains and relies
              on message passing libraries such as MPI. ICF3D is written in
              the object oriented programming language C++. The mesh is
              unstructured and consists of a collection of hexahedra, prisms,
              pyramids, and/or tetrahedra. The hydrodynamics is modeled by the
              discontinuous finite element method which allows a natural
              representation of inherently discontinuous phenomena such as
              shocks. Continuous processes such as diffusion are modeled by
              conventional finite element methods. ICF3D is modular and
              consists of separate equation-of-state, hydrodynamic, heat
              conduction, and multi-group radiation transport (diffusion
              approximation) packages. We present results on problems relevant
              to Inertial Confinement Fusion which are obtained on a variety of
              computers, uniprocessors and MPPs.}
}
 
% Thanks to Jesper Larsson Traeff of CCRL NEC for the following
%
% Design 
%
@inproceedings{Hempel94,
  author    = {Hempel, Rolf},
  title     = {The {MPI} Standard for Message Passing},
  booktitle = {High-Performance Computing and Networking, International
               Conference and Exhibition, Proceedings, Volume II:
               Networking and Tools},
  editor    = {Gentzsch, Wolfgang and Harms, Uwe},
  publisher = sv,
  series    = lncs,
  volume    = 797,
  pages     = {247--252},
  year      = 1994
}

@inproceedings{Hempel94:uberblick,
  author    = {Hempel, Rolf},
  title     = {Der {M}essage {P}assing {I}nterface~--~{S}tandard: ein {{\"U}}berblick},
  booktitle = {Praxisorientierte {P}arallelverarbeitung,
               {B}eitr{{\"a}}ge zum 3. {W}orkshop {{\"u}}ber
               {W}issenschaftliches Rechnen, {S}chwerpunkt
               {P}raxisorientierte {P}arallelverarbeitung},
  editor    = {Horst Langend{{\"o}}rfer},
  publisher = {Carl {H}anser {V}erlag},
  address   = {Braunschweig, Germany},
  year      = 1994
}

@InProceedings{Hempel96,
  author    = {Rolf Hempel},
  editor    = {Bode, Arndt and Dongarra, Jack and Ludwig, Thomas and
               Sunderam, Vaidy},
  title     = {The Status of the {MPI} Message-Passing Standard and
               Its Relation to {PVM}},
  booktitle = {{P}arallel {V}irtual {M}achine -- {E}uro{PVM}'96},
  series    = lncs,
  volume    = 1156,
  publisher = sv,
  year      = 1996,
  pages     = {14--21}
}

@article{HempelWalker99,
  author    = {Rolf Hempel and David W. Walker},
  title     = {The Emergence of the {MPI} Message Passing Standard for
               Parallel Computing},
  journal   = {{C}omputer {S}tandards \& {I}nterfaces},
  publisher = {Elsevier Science},
  year      = 1999,
  volume    = 21,
  pages     = {51--62}
}

%Implementation
%==============
%
%SX
%--

@inproceedings{Hempel96:mpisx,
  author    = {Hempel, Rolf},
  title     = {The {MPI} Message-Passing Standard and its Implementation
               on the {NEC SX-4}},
  booktitle = {Proceedings of the {NEC HPC} Workshop},
  editor    = {Doi, Shun},
  address   = {Tokyo, Japan},
  year      = 1996
}

@InProceedings{HempelRitzdorfZimmermann97,
  author    = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
  title     = {Implementation of {MPI} on {NEC}'s {SX-4} Multi-Node Architecture},
  booktitle = {Recent Advances in Parallel Virtual Machine and
               Message Passing Interface. 4th European {PVM/MPI} Users' Group Meeting},
  series    = lncs,
  volume    = 1332,
  publisher = sv,
  pages     = {185--193},
  year      = 1997
}

@article{HempelRitzdorfZimmermann98,
  author  = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
  title   = {Efficient Message Passing Interface Implementations for
             {NEC} Parallel Computers},
  journal = {{NEC} Research \& Development},
  year    = 1998,
  volume  = 39,
  number  = 4,
  pages   = {408--413}
}

@InProceedings{TraffHempelRitzdorfZimmermann99,
  author    = {Jesper Larsson Tr{\"{a}}ff and
               Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
  title     = {Flattening on the fly: efficient handling of {MPI} derived datatypes},
  booktitle = {Recent Advances in Parallel Virtual Machine and
               Message Passing Interface. 6th European {PVM/MPI} Users' Group Meeting},
  series    = lncs,
  volume    = 1697,
  publisher = sv,
  year      = 1999
}

%Cluster etc.
%------------

% The first author's surname uses the standard LaTeX commands \l (l with
% stroke) and \k (ogonek); \k needs a T1-encoded font at typesetting time.
@inproceedings{GolebiewskiBaumHempel99,
  author    = {Maciej Go{\l}{\k{e}}biewski
               and Markus Baum and Rolf Hempel},
  title     = {High Performance Implementation of {MPI} for {Myrinet}},
  booktitle = {Parallel Computation. 4th International Conference of the {ACPC}},
  publisher = sv,
  series    = lncs,
  volume    = 1557,
  year      = 1999,
  pages     = {510--521}
}

% The first author's surname uses the standard LaTeX commands \l (l with
% stroke) and \k (ogonek); \k needs a T1-encoded font at typesetting time.
@inproceedings{GolebiewskiHempelTraff99,
  author    = {Maciej Go{\l}{\k{e}}biewski
               and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
  title     = {Algorithms for collective communication operations on {SMP} clusters},
  booktitle = {The 1999 Workshop on Cluster-Based Computing held in
               conjunction with 13th {ACM-SIGARCH} International
               Conference on Supercomputing {(ICS'99)}},
  pages     = {11--15},
  year      = 1999
}

% The second author's surname uses the standard LaTeX commands \l (l with
% stroke) and \k (ogonek); \k needs a T1-encoded font at typesetting time.
@inproceedings{BaumGolebiewskiHempelTraff99,
  author    = {Markus Baum and
               Maciej Go{\l}{\k{e}}biewski
               and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
  title     = {Dual-device {MPI} Implementation for {PC} Clusters with {SMP} Nodes},
  booktitle = {{MPIDC'99} Message Passing Interface Developer's and User's
               Conference Journal of Papers and Presentations},
  pages     = {53--60},
  year      = 1999
}

% The first author's surname uses the standard LaTeX commands \l (l with
% stroke) and \k (ogonek); \k needs a T1-encoded font at typesetting time.
@inproceedings{GolebiewskiBasermannBaumHempelRitzdorfTraff99,
  author    = {M. Go{\l}{\k{e}}biewski
               and A. Basermann and M. Baum and R. Hempel and H. Ritzdorf
               and J. L. Tr{\"{a}}ff},
  title     = {A {PC} Cluster with Application-Quality {MPI}},
  booktitle = {Euro-Par'99 Parallel Processing},
  publisher = sv,
  series    = lncs,
  volume    = 1685,
  year      = 1999,
  pages     = {613--623}
}

%Tools
%=====

@inproceedings{HempelZimmermann96,
  author    = {R. Hempel and F. Zimmermann},
  title     = {On the automatic {PARMACS-to-MPI} transformation in application programs},
  booktitle = {High-performance computing and networking:
               international conference and exhibition, {HPCN EUROPE}
               1996, Brussels, Belgium, April 15--19, 1996:
               proceedings},
  publisher = sv,
  series    = lncs,
  volume    = 1067,
  year      = 1996,
  pages     = {1033--1034}
}

@article{HempelZimmermann99,
  author  = {Hempel, Rolf and Zimmermann, Falk},
  title   = {Automatic Migration from {PARMACS} to {MPI} in Parallel {F}ortran Applications},
  journal = {{S}cientific {P}rogramming},
  year    = 1999,
  volume  = 20,
  number  = 7,
  pages   = {39--46}
}

@InProceedings{ReussnerTraffHunzelmann00,
  author    = {Ralf Reussner and Jesper Larsson Tr{\"{a}}ff and Gunnar Hunzelmann},
  title     = {A Benchmark for {MPI} Derived Datatypes},
  booktitle = {Recent Advances in Parallel Virtual Machine and
               Message Passing Interface. 7th European {PVM/MPI} Users' Group Meeting},
  series    = lncs,
  note      = {To appear},
  year      = 2000
}

@InProceedings{FahringerGerndtRileyTraff00,
  author    = {Thomas Fahringer and Michael Gerndt and Graham Riley and
               Jesper Larsson Tr{\"{a}}ff},
  title     = {Specification of Performance Problems in {MPI} Programs with {ASL}},
  booktitle = {International Conference in Parallel Processing {(ICPP'00)}},
  note      = {To appear},
  year      = 2000
}

%Applications
%============

@inproceedings{Traff98,
  author    = {Jesper Larsson Tr{\"{a}}ff},
  title     = {Portable Randomized List Ranking on Multiprocessors using {MPI}},
  booktitle = {Recent Advances in Parallel Virtual Machine and
               Message Passing Interface. 5th European {PVM/MPI} Users' Group Meeting},
  publisher = sv,
  series    = lncs,
  volume    = 1497,
  year      = 1998,
  pages     = {395--402}
}
 
%
% End of articles from NEC

@Article{bak00:mpi-app,
  author   = {J. Baker and M. Shirel},
  title    = {Ab initio quantum chemistry on {PC}-based parallel
              supercomputers},
  journal  = {Parallel Computing},
  year     = 2000,
  volume   = 26,
  number   = {7--8},
  month    = jul,
  pages    = {1011--1024},
  abstract = {The advent of mass-market personal computers (PC) and the
              associated price reduction in virtually all computer components
              has brought the cost of parallel, multi-processor computers down
              to highly affordable levels. Four-, eight-, and even
              12-processor machines, constructed from basic, readily available
              PC components, can be obtained today for the same price as a
              good-quality single-processor workstation of a few years ago.
              Together with now well-established parallel tools (such as the
              message-passing interface (MPI) or parallel virtual machine (PVM)
              software), state-of-the-art, fully functioning, parallel
              machines using the Linux operating system and the latest PC
              microprocessors can deliver unprecedented price/performance
              ratios. This article reports on the capabilities and performance
              of a new, fully parallel ab initio program running on
              commercially available four- and eight-processor PC-based
              supercomputers.},
}

@Article{nob00:mpi-app,
  author   = {R. H. Nobes and A. P. Rendell and J. Nieplocha},
  title    = {Computational chemistry on {F}ujitsu vector-parallel
              processors: Hardware and programming environment},
  journal  = {Parallel Computing},
  year     = 2000,
  volume   = 26,
  number   = {7--8},
  month    = jul,
  pages    = {869--886},
  abstract = {In this and the following paper, we provide an introduction to
              the Fujitsu VPP range of vector-parallel supercomputers and to
              some of the computational chemistry software available for the
              VPP. Here, we consider the hardware and the design of software
              to exploit its capabilities. The VPP employs proprietary vector
              processors connected via a crossbar switch in a
              distributed-memory architecture. High single-node performance
              requires consideration of vector operand lengths, arithmetic
              pipe utilisation and memory-to-CPU bandwidth. Most parallel
              chemistry applications use either explicit 'message-passing' or
              a 'global-memory' paradigm, and benchmark results are presented
              for the communications performance of MPI, Linda and the Global
              Arrays.},
}
 

@Article{fru00:mpi-app,
  author   = {H. A. Fr{\"u}chtl and R. H. Nobes and A. Bliznyuk},
  title    = {Performance of {MOPAC} on parallel computers},
  journal  = {Journal of Molecular Structure-Theochem},
  year     = 2000,
  volume   = 506,
  number   = {spec. SI},
  month    = jul,
  pages    = {87--97},
  abstract = {Key parts of the semiempirical MOPAC program package have been
              ported to parallel computers using the MPI message
              passing-library. Parallel routines are available for the
              calculation of vibrational frequencies and electrostatic
              potentials, as well as for energies of large biomolecules via
              the linear-scaling MOZYME self-consistent-field method. The
              parallelisation strategies used are discussed, and performance
              measurements for benchmark calculations on three different
              parallel computers are presented. Frequency and ESP calculations
              show good scaling for up to eight nodes, independent of
              hardware and communications software. MOZYME calculations scale
              reasonably well if a fast implementation of MPI is available.},
}

@Article{geo00:mpi-impl,
  author   = {W. L. George and J. G. Hagedorn and J. E. Devaney},
  title    = {{IMPI}: Making {MPI} interoperable},
  journal  = {Journal of Research of the National Institute of Standards
              and Technology},
  year     = 2000,
  volume   = 105,
  number   = 3,
  pages    = {343+},
  month    = may-jun,
  abstract = {The Message Passing Interface (MPI) is the de facto standard for
              writing parallel scientific applications in the message passing
              programming paradigm. Implementations of MPI were not designed to
              interoperate, thereby limiting the environments in which parallel
              jobs could be run. We briefly describe a set of protocols,
              designed by a steering committee of current implementors of MPI,
              that enable two or more implementations of MPI to interoperate
              within a single application. Specifically, we introduce the set
              of protocols collectively called Interoperable MPI (IMPI). These
              protocols make use of novel techniques to handle difficult
              requirements such as maintaining interoperability among all IMPI
              implementations while also allowing for the independent
              evolution of the collective communication algorithms used in
              IMPI. Our contribution to this effort has been as a facilitator
              for meetings, editor of the IMPI Specification document, and as
              an early testbed for implementations of IMPI. This testbed is in
              the form of an IMPI conformance tester, a system that can verify
              the correct operation of an IMPI-enabled version of MPI.},
}
 
@TechReport{kon00:mpi-measurement,
  author      = {Alice E. Koniges and Rolf Rabenseifner and
                 Karl Solchenbach},
  title       = {Benchmark Design for Characterization of Balanced
                 High-Performance Architectures},
  institution = {},
  year        = 2000,
}


@Article{kanTam:mpi-app,
  author   = {R. Kanapady and K. K. Tamma},
  title    = {A unified family of generalized integration operators
              {[GInO]} for non-linear structural dynamics: implementation
              aspects},
  journal  = {Advances in Engineering Software},
  year     = 2000,
  volume   = 31,
  number   = {8--9},
  pages    = {639--647},
  month    = aug-sep,
  abstract = {The present paper proposes recent developments in theoretical
              and implementation aspects including parallel computations via a
              single analysis code of a unified family of generalized
              integration operators [GInO] in time with particular emphasis on
              non-linear structural dynamics. The focus of this research is on
              the implementation aspects including the development of
              coarse-grained parallel computational models for such
              generalized time integration operators that can be readily
              ported to a wide range of parallel architectures via a
              message-passing paradigm (using MPI) and domain decomposition
              techniques. The implementation aspects are first described
              followed by an evaluation for a range of problems which exhibit
              large deformation, elastic, elastic-plastic dynamic behavior. For
              geometric non-linearity a total Lagrangian formulation and for
              material non-linearity elasto-plastic formulations are employed.
              Serial and parallel performance issues on the SGI Origin 2000
              system are discussed and analyzed for illustration for selected
              schemes. For illustration, particular forms of [GInO] are
              investigated and a complete development via a single analysis
              code is currently underway. Nevertheless, this is the first time
              that such a capability is plausible and the developments further
              enhance computational structural dynamics areas.},
}
 

@Article{Gur00:mpi-app,
  author   = {G. P. Guruswamy},
  title    = {{HiMAP}: a portable super modular multilevel parallel
              multidisciplinary process for large scale analysis},
  journal  = {Advances in Engineering Software},
  year     = 2000,
  volume   = 31,
  number   = {8--9},
  pages    = {617--620},
  month    = aug-sep,
  abstract = {An efficient super modular process to simulate aeroelasticity of
              aerospace vehicles using high fidelity flow equations such as
              the Euler/Navier-Stokes equations is presented. The process is
              suitable for both tightly coupled and uncoupled analysis. The
              process is designed to execute on massively parallel processors
              (MPP) and work-station clusters based on a multiple-instruction,
              multiple-data (MIMD) architecture. The fluids discipline is
              parallelized using a zonal approach whereas the structures
              discipline is parallelized using the substructures concept.
              Provision is also made to include controls domain. Computations
              of each discipline are spread across processors using IEEE
              standard message passing interface (MPI) for inter processor
              communications. Disciplines can run in parallel using a macro
              utility MPIRUN developed based on MPI. In addition to discipline
              parallelization and coarse-grain parallelization of the
              disciplines, embarrassingly parallel capability to run multiple
              parameter cases is implemented using a script system. The
              combined effect of three levels of parallelization is an almost
              linear scalability for multiple concurrent analyses that
              perform efficiently on MPP.},
}
 
@Article{cfkl00:mpi-java,
  author   = {B. Carpenter and G. Fox and S. H. Ko and S. Lim},
  title    = {Object serialization for marshaling data in a {J}ava
              interface to {MPI}},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 7,
  pages    = {539--553},
  month    = jun,
  abstract = {Several Java bindings to Message Passing Interface (MPI)
              software have been developed recently. Message buffers have
              usually been restricted to arrays with elements of primitive
              type. We discuss adoption of the Java object serialization model
              for marshaling general communication data in MPI-like APIs. This
              approach is compared with a Java transcription of the standard
              MPI derived datatype mechanism. We describe an implementation of
              the mpiJava interface to MPI that incorporates automatic object
              serialization. Benchmark results confirm that current JDK
              implementations of serialization are not fast enough for high
              performance messaging applications. Means of solving this
              problem are discussed, and benchmarks for greatly improved
              schemes are presented.},
}

@Article{g-l00:mpi-app,
  author   = {A. J. Garcia-Loureiro and T. F. Pena and
              J. M. Lopez-Gonzalez and L. Prat},
  title    = {Parallel finite element method to solve the 3{D} {P}oisson
              equation and its application to abrupt heterojunction
              bipolar transistors},
  journal  = {International Journal for Numerical Methods in Engineering},
  year     = 2000,
  volume   = 49,
  number   = 5,
  pages    = {639--652},
  month    = oct,
  abstract = {In this work we present a parallel solver for the Poisson
              equation for 3D abrupt heterojunction bipolar transistors (HBT).
              Three-dimensional simulation is essential for studying devices
              of small geometry as in the case we have studied. We have used
              an unstructured tetrahedral mesh and we have applied the finite
              element method (FEM), making a specific formulation for the
              nodes located on the interface of the regions with different
              characteristics. For HBT devices, it is necessary to take into
              account that on both sides of the interface between the
              different regions exist materials with different properties. Our
              formulation implies situating pairs of nodes in the same
              physical positions of the interface, associating each node to a
              region of the HBT. This way, the effects due to thermionic
              emission and the tunnel effect may be simulated when the Poisson
              and the electron and hole equations are solved in an abrupt HBT.
              We have applied domain decomposition methods to solve the
              associate linear systems. This code has been implemented for
              distributed memory multicomputers, making use of a message
              passing standard library, MPI.},
}


@Article{sch00:mpi-app,
  author   = {W. Schneider and P. J. McCarthy and K. Lackner and O. Gruber
              and K. Behler and P. Martin and R. Merkel},
  title    = {{ASDEX Upgrade} {MHD} equilibria reconstruction on
              distributed workstations},
  journal  = {Fusion Engineering and Design},
  year     = 2000,
  volume   = 48,
  number   = {1--2},
  pages    = {127--134},
  month    = aug,
  abstract = {The identification of MHD equilibrium states on the ASDEX
              Upgrade tokamak is a prerequisite for interpreting measurements
              from a wide range of diagnostics which are correlated with the
              shape of the plasma. The availability in realtime of plasma
              parameters related to the MHD state is crucial for controlling
              the experiment. Function Parameterization is used as a standard
              tool to determine the position, shape, and other global
              parameters of the plasma as well as the MHD equilibrium flux
              surfaces. The recently developed interpretive equilibrium code
              CLISTE now enables the calculation of MHD equilibria on an
              intershot timescale. These calculations are parallelized by
              the use of a Message Passing Interface (MPI).},
}

@Article{ave00:mpi-app,
  author   = {A. Averbuch and B. Epstein and L. Ioffe and I. Yavneh},
  title    = {Efficient parallelization of a three-dimensional
              {N}avier-{S}tokes solver on {MIMD} multiprocessors},
  journal  = {Journal of Supercomputing},
  year     = 2000,
  volume   = 17,
  number   = 2,
  pages    = {123--142},
  month    = sep,
  abstract = {The 3-D Navier-Stokes solver was implemented on three MIMD
              message-passing multiprocessors (a 64-processors IBM SP2, a
              20-processors MOSIX, and a 64-processors Origin 2000). The same
              code written with PVM and MPI software packages was executed on
              all the above distinct computational platforms. The examples in
              the paper demonstrate that we can achieve efficiency of about
              60\% for as many as 64 processors on Origin 2000 on a full-size
              3-D aerodynamic problem which is solved on realistic
              computational grids.},
}


@Article{vNie00:rmi-grid,
  author   = {R. van Nieuwpoort and J. Maassen and H. E. Bal and
              T. Kielmann and R. Veldema},
  title    = {Wide-area parallel programming using the remote method
              invocation model},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 8,
  pages    = {643--666},
  month    = jul,
  annote   = {Special Issue?},
  abstract = {Java's support for parallel and distributed processing makes the
              language attractive for metacomputing applications, such as
              parallel applications that run on geographically distributed
              (wide-area) systems. To obtain actual experience with a
              Java-centric approach to metacomputing, we have built and used a
              high-performance wide-area Java system, called Manta. Manta
              implements the Java Remote Method Invocation (RMI) model using
              different communication protocols (active messages and TCP/IP)
              for different networks. The paper shows how wide-area parallel
              applications can be expressed and optimized using Java RMI.
              Also, it presents performance results of several applications on
              a wide-area system consisting of four Myrinet-based clusters
              connected by ATM WANs. We finally discuss alternative programming
              models, namely object replication, JavaSpaces, and MPI for Java.},
}
 

@Article{pha00:mpi-app,
  author   = {S. Phadke and D. Bhardwaj and S. K. Dey},
  title    = {An explicit predictor-corrector solver with application to
              seismic wave modelling},
  journal  = {Computers \& Geosciences},
  year     = 2000,
  volume   = 26,
  number   = {9--10},
  pages    = {1053--1058},
  month    = nov-dec,
  abstract = {Wave-equation-based forward modelling using explicit
              finite-difference methods is a standard technique for
              calculating synthetic seismograms. The stability criterion
              restricts the size of the time step. In this paper a
              predictor-corrector method for solving the wave equation is
              described which allows the use of a larger time step. A
              stability analysis of the method is also carried out. Parallel
              implementation of the algorithm is described for a distributed
              computing environment which makes use of MPI and PVM message
              passing calls for communication between processors.},
}

@Article{oli00:mpi-app-compare,
  author   = {L. Oliker and R. Biswas},
  title    = {Parallelization of a dynamic unstructured algorithm using
              three leading programming paradigms},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = 2000,
  volume   = 11,
  number   = 9,
  pages    = {931--940},
  month    = sep,
  abstract = {The success of parallel computing in solving real-life
              computationally intensive problems relies on their efficient
              mapping and execution on large-scale multiprocessor
              architectures. Many important applications are both unstructured
              and dynamic in nature, making their efficient parallel
              implementation a daunting task. This paper presents the
              parallelization of a dynamic unstructured mesh adaptation
              algorithm using three popular programming paradigms on three
              leading supercomputers. We examine an MPI message-passing
              implementation on the Cray T3E and the SGI Origin2000, a
              shared-memory implementation using the cache coherent nonuniform
              memory access (CC-NUMA) feature of the Origin2000, and a
              multithreaded version on the newly released Tera Multithreaded
              Architecture (MTA). We compare several critical factors of this
              parallel code development, including runtime, scalability,
              programmability, portability, and memory overhead. Our overall
              results demonstrate that multithreaded systems offer tremendous
              potential for quickly and efficiently solving some of the most
              challenging real-life problems on parallel computers.},
}
 

@Article{pro00:mpi-impl,
  author   = {B. V. Protopopov and A. Skjellum},
  title    = {Shared-memory communication approaches for an {MPI}
              message-passing library},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 9,
  pages    = {799--820},
  month    = aug,
  abstract = {The contributions of this paper are three-fold. First, the
              authors present the taxonomy for shared-memory communication
              devices. Second, they show advantages and potential problems of
              the devices that belong to different classes of their taxonomy
              using the formulated design criteria. Third, they analyze
              communication performance of existing MPICH shared-memory
              devices, discuss optimizations of their performance, and show
              the performance gains that these optimizations yield. MPICH is
              used for comparison, since it is a widely used MPI
              implementation.},
}
 

@Article{dec00:mpi-app,
  author   = {T. Decker},
  title    = {Virtual data space - load balancing for irregular
              applications},
  journal  = {Parallel Computing},
  year     = 2000,
  volume   = 26,
  number   = {13--14},
  pages    = {1825--1860},
  month    = dec,
  abstract = {Load balancing is a key issue in the development of parallel
              algorithms with irregular structures. Existing load balancing
              systems each support only one specific programming paradigm and
              thus are of limited use. The system VDS presented here allows
              concurrent use of various paradigms such as fork-join, weighted
              tasks, and static dags (directed acyclic graphs that are known in
              advance). The system provides visual performance evaluation
              tools to facilitate the efficient application of the system. VDS
              supports various communication interfaces including PVM and MPI.
              Thus, VDS-applications can be run on architectures ranging from
              workstation clusters to massively parallel systems.},
}

@Article{duan00:mpi-app,
  author   = {S. Duan and K. S. Anderson},
  title    = {Parallel implementation of a low order algorithm for
              dynamics of multibody systems on a distributed memory
              computing system},
  journal  = {Engineering with Computers},
  year     = 2000,
  volume   = 16,
  number   = 2,
  pages    = {96--108},
  abstract = {In this paper, a new hybrid parallelisable low order algorithm,
              developed by the authors for multibody dynamics analysis, is
              implemented numerically on a distributed memory parallel
              computing system. The presented implementation can currently
              accommodate the general spatial motion of chain systems, but key
              issues for its extension to general tree and closed loop systems
              are discussed. Explicit algebraic constraints are used to
              increase coarse grain parallelism, and to study the influence of
              the dimension of system constraint load equations on the
              computational efficiency of the algorithm for real parallel
              implementation using the Message Passing Interface (MPI). The
              equation formulation parallelism and linear system solution
              strategies which are used to reduce communication overhead are
              addressed. Numerical results indicate that the algorithm is
              scalable, that significant speed-up can be obtained, and that a
              quasi-logarithmic relation exists between time needed for a
              function call and numbers of processors used. This result agrees
              well with theoretical performance predictions. Numerical
              comparisons with results obtained from independently developed
              analysis codes have validated the correctness of the new hybrid
              parallelisable low order algorithm, and demonstrated certain
              computational advantages.},
}

@Article{nam00:mpi-app,
  author   = {A. Namazifard and I. D. Parsons},
  title    = {An {MPI} parallel implementation of {N}ewmark's method},
  journal  = {Computer-Aided Civil and Infrastructure Engineering},
  year     = 2000,
  volume   = 15,
  number   = 3,
  pages    = {189--195},
  month    = may,
  abstract = {The standard message-passing interface (MPI) is used to
              parallelize Newmark's method. The linear matrix equation
              encountered at each time step is solved using a preconditioned
              conjugate gradient algorithm. Data are distributed over the
              processors of a given parallel computer on a degree-of-freedom
              basis; this produces effective load balance between the
              processors and leads to a highly parallelized code. The
              portability of the implementation of this scheme is tested by
              solving some simple problems on two different machines: an SGI
              Origin2000 and an IBM SP2. The measured times demonstrate the
              efficiency of the approach and highlight the maintenance
              advantages that arise from using a standard parallel library
              such as MPI.},
}

@Article{chp00:prgm-devlp,
  author   = {B. Chapman and J. Merlin and D. Pritchard and F. Bodin and
              Y. Mevel and T. Sorevik and L. Hill},
  title    = {Program development tools for clusters of shared memory
              multiprocessors},
  journal  = {Journal of Supercomputing},
  year     = 2000,
  volume   = 17,
  number   = 3,
  pages    = {311--322},
  month    = nov,
  abstract = {Applications are increasingly being executed on computational
              systems that have hierarchical parallelism. There are several
              programming paradigms which may be used to adapt a program for
              execution in such an environment. In this paper, we outline some
              of the challenges in porting codes to such systems, and describe
              a programming environment that we are creating to support the
              migration of sequential and MPI code to a cluster of shared
              memory parallel systems, where the target program may include
              MPI, OpenMP or both. As part of this effort, we are evaluating
              several experimental approaches to aiding in this complex
              application development task.},
}
 
@Article{getov00:mpi-java,
  author   = {V. S. Getov and P. A. Gray and V. S. Sunderam},
  title    = {Aspects of portability and distributed execution for
              {JNI}-wrapped message passing libraries},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 11,
  pages    = {1039--1050},
  month    = sep,
  abstract = {This paper discusses an approach which aims to provide legacy
              message passing libraries with Java-like portability in a
              heterogeneous, metacomputing environment. The results of such
              portability permit distributed computing components to be 'soft
              loaded' or 'soft-installed' in a dynamic fashion,
              onto cooperating resources for concurrent, synchronized parallel
              execution. This capability provides researchers with the ability
              to tap into a much larger resource pool and to utilize highly
              tuned codes for achieving performance. Necessarily, the Java
              programming language is a significant component. The Java Native
              Interface (JNI) is used to wrap message passing libraries
              written in other languages, and the bytecode which is generated
              for the front-end may be analyzed in order to completely
              determine the needs of the code which it wraps. This
              characterization allows the pre-configuration of a remote
              environment so as to be able to support execution. The
              usefulness of the portability gained by our approach is
              illustrated through examples showing the soft-installation of a
              process using an MPI computational substrate and the
              soft-installation of a process which requires a C-based
              communication library based upon the efficient multi-cast
              communication package, CCTL. The examples show that significant
              gains in performance can be achieved while allowing message
              passing execution to still exhibit high levels of portability.},
}

@Article{smith00:mpi-openmp,
  author   = {L. Smith and P. Kent},
  title    = {Development and performance of a mixed {OpenMP/MPI} quantum
              {M}onte {C}arlo code},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 12,
  pages    = {1121--1129},
  month    = oct,
  abstract = {The code has been rewritten to allow for an arbitrary mix of
              OpenMP and MPI parallelism. The various issues which arose during
              the parallelization are discussed. The performance of the mixed
              OpenMP/MPI code has been assessed on an SGI Origin 2000 system
              and the results compared and contrasted to the original MPI
              version.},
}

@Article{hotta00:mpi-app,
  author   = {A. Hotta and H. Ninokata and A. J. Baratta},
  title    = {Development of parallel coupling system between
              three-dimensional nodal kinetic code {ENTREE} and two-fluid
              plant simulator {TRAC/BF1}},
  journal  = {Journal of Nuclear Science and Technology},
  year     = 2000,
  volume   = 37,
  number   = 10,
  pages    = {840--854},
  month    = oct,
  abstract = {The high-speed three-dimensional neutron kinetic code ENTREE
              was developed based on the polynomial and semi-analytical
              nonlinear iterative nodal methods (PNLM and SANLM) with also
              introducing the discontinuity factor. In order to enhance the
              efficiency of transient calculation, the nonlinear
              correction-coupling coefficients are intermittently updated
              based on the changing rate of core state variables. By giving the
              analytical form for two-node problem matrix elements, the
              additional computing time in SANLM was minimized. A fast
              algorithm was developed for the multi table macro-cross section
              rebuilding process. The reactivity component model was
              implemented based on the variation of the neutron production and
              destruction terms. The code was coupled with the two-fluid
              thermal hydraulic plant simulator TRAC/BF1 through PVM or MPI
              protocols. Two codes are executed in parallel with exchanging
              the feedback parameters explicitly. Based on the LMW PWR
              transient benchmark, it was shown that both PNLM and SANLM spend
              less than 20\% excess computing time in comparison with the
              coarse mesh finite difference method (CFDM). The implementation
              of the discontinuity factor was verified based on the DVP
              problem. Adequacy and parallel efficiency of the coupling system
              TRAC/BF1-ENTREE was demonstrated based on the BWR cold water
              injection transient proposed by NEA/CRP.},
}
 
@Article{silva00:mpi-java,
  author   = {L. M. Silva and P. Martins and J. G. Silva},
  title    = {Heterogeneous parallel computing using {Java} and {WMPI}},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 11,
  pages    = {1077--1091},
  month    = sep,
  abstract = {In this paper, we present briefly the implementation of a Java
              interface for WMPI, a Windows-based implementation of MPI. Then,
              we describe a system that is oriented for Web-based computing
              and present a solution to integrate WMPI with this tool by making
              use of a Java bridge component and the Java bindings for WMPI.
              This solution allows the execution of meta-applications over a
              mixed configuration of platforms, execution models and
              programming languages. The resulting system provides a way to
              solve the problem of heterogeneity and to unleash the potential
              of diverse computational resources and programming tools.},
}


@Article{thir00:mpi-impl,
  author  = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
  title   = {Java on networks of workstations ({JavaNOW}): a parallel
             computing framework inspired by {Linda} and the
             {M}essage {P}assing {I}nterface ({MPI})},
  journal = {Concurrency-Practice and Experience},
  year    = 2000,
  volume  = 12,
  number  = 11,
  pages   = {1093--1116},
  month   = sep,
}
 
@Article{thir00:mpi-java,
  author   = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
  title    = {Java on networks of workstations ({JavaNOW}): a parallel
              computing framework inspired by {Linda} and the {M}essage
              {P}assing {I}nterface ({MPI})},
  journal  = {Concurrency-Practice and Experience},
  year     = 2000,
  volume   = 12,
  number   = 11,
  pages    = {1093--1116},
  month    = sep,
  abstract = {JavaNOW provides a simple yet powerful framework for performing
              computationon networks of workstations. In addition to the Linda
              memory model, it provides for shared objects, implicit
              multithreading, implicit synchronization, object dataflow, and
              collective communications similar to those defined in MPI.
              JavaNOW is also a component of the Computational Neighborhood, a
              Java enabled suite of services for desktop computational
              sharing. The intent of JavaNOW is to present an environment for
              parallel computing that is both expressive and reliable and
              ultimately can deliver good to excellent performance. As JavaNOW
              is a work in progress, this article emphasizes the expressive
              potential of the JavaNOW environment and presents preliminary
              performance results only.},
}

@Article{carp00:mpi-java,
  author = 	 {B. Carpenter and V. Getov and G. Judd and A. Skjellum and
                  G. Fox},
  title = 	 {{MPJ: MPI}-like message passing for {Java}},
  journal = 	 {Concurrency-Practice and Experience},
  year = 	 2000,
  volume =	 12,
  number =	 11,
  pages =	 {1019--1038},
  month =	 SEP,
  abstract = {Recently, there has been a lot of interest in using Java for
        parallel programming. Efforts have been hindered by lack of
        standard Java parallel programming APIs. To alleviate this
        problem, various groups started projects to develop Java message
        passing systems modelled on the successful Message Passing
        Interface (MPI). Official MPI bindings are currently defined
        only for C, Fortran, and C++, so early MPI-like environments for
        Java have been divergent. This paper relates an effort
        undertaken by a working group of the Java Grande Forum, seeking
        a consensus on an MPI-like API, to enhance the viability of
        parallel programming using Java.}
}

 
@Article{wall00:mpi-openmp,
  author = 	 {A. J. Wallcraft},
  title = 	 {{SPMD OpenMP} versus {MPI} for ocean models},
  journal = 	 {Concurrency-Practice and Experience},
  year = 	 2000,
  volume =	 12,
  number =	 12,
  pages =	 {1155--1164},
  month =	 OCT,
  Abstract = {OpenMP can be used in Single Program Multiple Data (SPMD) mode
        by spawning N threads in the main program and having each thread
        act from then on similarly to a process in MPI. The initial port
        of one ocean model to SPMD OpenMP revealed several
        incompatibilities between thread-based and process-based SPMD
        coding styles. Adding support for threaded I/O was particularly
        painful, requiring modification to hundreds of lines of code.
        Several relatively minor additions to the OpenMP API were
        identified that would greatly simplify SPMD programming.
        Meanwhile, an alternative Fortran compiler-based SPMD API,
        Co-Array Fortran, became available on the Cray T3E. There is a
        simple mapping from SHMEM put/get library calls onto co-array
        assignment statements, so adding Co-Array Fortran support to the
        ocean models was straightforward. To extend Co-Array Fortran to
        machines other than the Cray T3E, a subset of the language is
        automatically translated into SPMD OpenMP via a nawk script. The
        performance of the 'native' OpenMP and translated Co-Array
        Fortran versions of the ocean model was virtually identical, so
        the former has been replaced by the latter (which is much easier
        to maintain).}
}

@Article{qia00:mpi-app,
  author = 	 {J. Qiang and R. D. Ryne and S. Habib},
  title = 	 {Fortran implementation of object-oriented design in parallel
                  beam dynamics simulations},
  journal = 	 {Computer Physics Communications},
  year = 	 2000,
  volume =	 133,
  number =	 1,
  pages =	 {18--33},
  month =	 DEC,
  abstract = {In this paper, an object-oriented design for parallel beam
        transport simulations in accelerators is implemented using
        Fortran 90 (F90) with Message Passing Interface (MPI) and High
        Performance Fortran (HPF). This improves the maintainability,
        reusability, and extensibility of software, combined with the
        high performance of using MPI and the ease of parallel
        programming provided by HPF. The overhead associated with the
        object-oriented implementation has only a minor effect on
        performance.}
}
 
 
@Article{hu00:openmp,
  author = 	 {Y. C. Hu and H. H. Lu and A. L. Cox and W. Zwaenepoel},
  title = 	 {{OpenMP} for networks of {SMP}s},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2000,
  volume =	 60,
  number =	 12,
  pages =	 {1512--1530},
  month =	 DEC,
  abstract = {We present performance results for seven applications
        (Barnes-Hut, CLU, and Water from SPLASH-2, 3D-FFT from NAS,
        Red-Black SOR, TSP, and MGS) running on an SP2 with four
        four-processor SMP nodes. A comparison between the thread
        implementation and the original implementation of TreadMarks
        shows that using the hardware shared memory within an SMP node
        significantly reduces the amount of data and the number of
        messages transmitted between nodes and consequently achieves
        speedups that are up to 30\% better than the original versions. We
        also compare SDSM against message passing. Overall, the speedups
        of multithreaded TreadMarks programs are within 7-30\% of the MPI
        versions.}
}

@article{kry01:mpi-app,
  author   = {P. Krysl and Z. Bittnar},
  title    = {Parallel explicit finite element solid dynamics with
              domain decomposition and message passing: dual
              partitioning scalability},
  journal  = {Computers and Structures},
  volume   = {79},
  number   = {3},
  pages    = {345--360},
  month    = jan,
  year     = {2001},
  abstract = {We document not only the high-level algorithms but also
              the relevant communication code fragments of the message
              passing implementation using the MPI library, so as to
              empower the reader to fully verify our numerical
              experiments.}
}

@article{leg00:mpi-applibs,
  author  = {P. F. Leggett and S. P. Johnson and M. Cross},
  title   = {{CAPLib} - a `thin layer' message passing library to
             support computational mechanics codes on distributed
             memory parallel systems},
  journal = {Advances in Engineering Software},
  volume  = {32},
  number  = {1},
  pages   = {61--83},
  month   = dec,
  year    = {2000}
}

@Article{sad01:mpi-app,
  author = 	 {M. Sadeghi and F. Liu},
  title = 	 {Computation of mistuning effects on cascade flutter},
  journal = 	 {AIAA Journal},
  year = 	 2001,
  volume =	 39,
  number =	 1,
  pages =	 {22--28},
  month =	 JAN,
  Abstract = {A computational method is described for predicting flutter of
        turbomachinery cascades with mistuned blades. The method solves
        the unsteady Euler/Navier-Stokes equations for multiple-blade
        passages on a parallel computer using the message passing
        interface. A second-order implicit scheme with dual
        time-stepping and multigrid is used. Each individual blade is
        capable of moving with its own independent frequency and phase
        angle, thus modeling a cascade with mistuned blades. Flutter
        predictions are performed through the energy method. Both
        phase-angle and frequency mistuning are studied. It is found
        that phase-angle mistuning has little effect on stability,
        whereas frequency mistuning significantly changes the
        aerodynamic damping. The important effect of frequency mistuning
        is to average out the aerodynamic damping of the tuned blade row
        over the whole range of interblade phase angles (IBPA). If a
        tuned blade row is stable over most of the IBPA range, the
        blades can be stabilized for the complete IBPA range through
        appropriate frequency mistuning.}
}

 
@Article{gull01:mpi-app,
  author = 	 {A. S. Gullerud and R. H. Dodds},
  title = 	 {{MPI}-based implementation of a {PCG} solver using an {EBE}
                  architecture and preconditioner for implicit, 3-{D} finite
                  element analysis},
  journal = 	 {Computers and Structures},
  year = 	 2001,
  volume =	 79,
  number =	 5,
  pages =	 {553--575},
  month =	 FEB,
  Abstract = {This work describes a coarse-grain parallel implementation of a
        linear preconditioned conjugate gradient solver using an
        element-by-element architecture and preconditioner for
        computation. The solver, implemented within a nonlinear,
        implicit finite element code, uses an MPI-based message-passing
        approach to provide portable parallel execution on shared,
        distributed, and distributed-shared memory computers. The
        flexibility of the element-by-element approach permits a
        dual-level mesh decomposition; a coarse, domain-level
        decomposition creates a load-balanced domain for each processor
        for parallel computation, while a second level decomposition
        breaks each domain into blocks of similar elements (same
        constitutive model, order of integration, element type) for
        fine-grained parallel computation on each processor. The key
        contribution here is a new parallel implementation of the
        Hughes-Winget (HW) element-by-element preconditioner suitable
        for arbitrary, unstructured meshes. The implementation couples an
        unstructured dependency graph with a new balanced graph-coloring
        algorithm to schedule parallel computations within and across
        domains. The code also includes the diagonal preconditioner and a
        modern parallel (threaded) sparse direct solver for comparison.
        Three example problems with up to 158,000 elements and 180,000
        nodes analyzed on an SGI/Cray Origin 2000 illustrate the
        parallel performance of the algorithms and preconditioners.
        Analyses with varying block sizes illustrate that the two-level
        decomposition improves overall execution speed with the block
        size tuned for the cache memory architecture of the executing
        platform. This implementation of the HW preconditioner shows
        reasonable parallel efficiency - typically 80\%, on 48
        processors. Efficiency for the diagonal preconditioner is also
        high, with total speedups reaching 86\% on 48 CPUs. Calculation
        of the tangent element stiffnesses shows superlinear speedups
        for each of the test problems, while the computation of
        strains/stresses/residual forces shows 80\% parallel efficiency
        on 48 processors.}
}



@Article{scot01:mpi-app,
  author = 	 {J. A. Scott},
  title = 	 {A parallel frontal solver for finite element applications},
  journal = 	 {International Journal for Numerical Methods in Engineering},
  year = 	 2001,
  volume =	 50,
  number =	 5,
  pages =	 {1131--1144},
  month =	 FEB,
  Abstract = {In finite element simulations, the overall computing time is
        dominated by the time needed to solve large sparse linear
        systems of equations. We report on the design and development of
        a parallel frontal code that can significantly reduce the
        wallclock time needed for the solution of these systems. The
        algorithm used is based on dividing the finite element domain
        into subdomains and applying the frontal method to each
        subdomain in parallel. The so-called multiple front approach is
        shown to reduce the amount of work and memory required compared
        with the frontal method and, when run on a small number of
        processes, achieves good speedups. The code, HSL\_MP42, has been
        developed for the Harwell Subroutine Library
        (http://www.numerical.rl.ac.uk/hsl). It is written in Fortran 90
        and, by using MPI for message passing, achieves portability
        across a wide range of modern computer architectures.}
}

@Article{alta01:mpi-eval,
  author = 	 {K. Al-Tawil and C. A. Moritz},
  title = 	 {Performance modeling and evaluation of {MPI}},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2001,
  volume =	 61,
  number =	 2,
  pages =	 {202--223},
  abstract = {Users of parallel machines need to have a good grasp for how
        different communication patterns and styles affect the
        performance of message-passing applications. LogGP is a simple
        performance model that reflects the most important parameters
        required to estimate the communication performance of parallel
        computers. The message passing interface (MPI) standard provides
        new opportunities for developing high performance parallel and
        distributed applications. In this paper, we use LogGP as a
        conceptual framework for evaluating the performance of MPI
        communications on three platforms: Cray Research T3D, Convex
        Exemplar 1600SP, and a network of workstations (NOW). We
        develop a simple set of communication benchmarks to extract the
        LogGP parameters. Our objective in this is to compare the
        performance of MPI communication on several platforms and to
        identify a performance model suitable for MPI performance
        characterization. In particular, two problems are addressed: how
        LogGP quantifies MPI performance and what extra features are
        required for modeling MPI, and how MPI performance compares on
        the three computing platforms: Cray Research T3D, Convex
        Exemplar 1600SP, and workstations clusters.}
}

@Article{grif00:mpi-app,
  author = 	 {L. W. Griffin and D. J. Dorney},
  title = 	 {Simulations of the unsteady flow through the {Fastrac}
                  supersonic turbine},
  journal = 	 {Journal of Turbomachinery-Transactions of the ASME},
  year = 	 2000,
  volume =	 122,
  number =	 2,
  pages =	 {225--233},
  month =	 APR,
  abstract = {Analysis of the unsteady aerodynamic environment in the Fastrac
        supersonic turbine is presented. Model analysis of the turbine
        blades indicated possible resonance in crucial operating ranges
        of the turbopump. Unsteady computational fluid dynamics (CFD)
        analysis was conducted to support the aerodynamic and structural
        dynamic assessments of the turbine. Before beginning the
        analysis, two major problems with current unsteady analytical
        capabilities had to be addressed: modeling a straight centerline
        nozzle with the turbine blades and exit guide vanes (EGVs), and
        reducing run times significantly while maintaining physical
        accuracy. Modifications were made to the CFD code used in this
        study to allow the coupled nozzle/blade/EGV analysis and to
        incorporate Message Passing Interface (MPI) software. Because
        unsteadiness is a key issue for the Fastrac turbine [and future
        rocket engine turbines such as the Reusable Launch Vehicle
        (RLV)], calculations were performed for two nozzle-to-blade
        axial gaps. Calculations were also performed for the nozzle
        alone, and the results were imposed as an inlet boundary
        condition for a blade/EGV calculation for the large gap case.
        These results are compared to the nozzle/blade/EGV results.}
}

@Article{des01:mpi-app,
  author = 	 {J. C. Desplat and I. Pagonabarraga and P. Bladon},
  title = 	 {{LUDWIG: A} parallel {L}attice-{B}oltzmann code for complex
                  fluids},
  journal = 	 {Computer Physics Communications},
  year = 	 2001,
  volume =	 134,
  number =	 3,
  pages =	 {273--290},
  month =	 MAR,
  Abstract = {This paper describes Ludwig, a versatile code for the simulation
        of Lattice-Boltzmann (LB) models in 3D on cubic lattices. In
        fact, Ludwig is not a single code, but a set of codes that share
        certain common routines, such as I/O and communications. If
        Ludwig is used as intended, a variety of complex fluid models
        with different equilibrium free energies are simple to code, so
        that the user may concentrate on the physics of the problem,
        rather than on parallel computing issues. Thus far, Ludwig's
        main application has been to symmetric binary fluid mixtures. We
        first explain the philosophy and structure of Ludwig which is
        argued to be a very effective way of developing large codes for
        academic consortia. Next we elaborate on some parallel
        implementation issues such as parallel I/O, and the use of MPI
        to achieve full portability and good efficiency on both MPP and
        SMP systems. Finally, we describe how to implement generic solid
        boundaries, and look in detail at the particular case of a
        symmetric binary fluid mixture near a solid wall. We present a
        novel scheme for the thermodynamically consistent simulation of
        wetting phenomena, in the presence of static and moving solid
        boundaries, and check its performance.}
}

@Article{tan00:mpi-impl,
  author = 	 {H. Tang and K. Shen and T. Yang},
  title = 	 {Program transformation and runtime support for threaded
                  {MPI} execution on shared-memory machines},
  journal = 	 {ACM Transactions on Programming Languages and Systems},
  year = 	 2000,
  volume =	 22,
  number =	 4,
  pages =	 {673--700},
  month =	 JUL,
  Abstract = {Parallel programs written in MPI have been widely used for
        developing high-performance applications on various platforms.
        Because of a restriction of the MPI computation model,
        conventional MPI implementations on shared-memory machines map
        each MPI node to an OS process, which can suffer serious
        performance degradation in the presence of multiprogramming.
        This paper studies compile-time and runtime techniques for
        enhancing performance portability of MPI code running on
        multiprogrammed shared-memory machines. The proposed techniques
        allow MPI nodes to be executed safely and efficiently as
        threads. Compile-time transformation eliminates global and
        static variables in C code using node-specific data. The runtime
        support includes an efficient and provably correct communication
        protocol that uses lock-free data structure and takes advantage
        of address space sharing among threads. The experiments on SGI
        Origin 2000 show that our MPI prototype called TMPI using the
        proposed techniques is competitive with SGI's native MPI
        implementation in a dedicated environment, and that it has
        significant performance advantages in a multiprogrammed
        environment.}
}

@Article{dim01:mpi-app,
  author = 	 {I. Dimov and V. Alexandrov and A. Karaivanova},
  title = 	 {Parallel resolvent {Monte Carlo} algorithms for linear algebra
                  problems},
  journal = 	 {Mathematics and Computers in Simulation},
  year = 	 2001,
  volume =	 55,
  number =	 {1--3},
  pages =	 {25--35},
  month =	 FEB,
  abstract = {In this paper, we consider Monte Carlo (MC) algorithms based on
        the use of the resolvent matrix for solving linear algebraic
        problems. Estimates for the speedup and efficiency of the
        algorithms are presented. Some numerical examples performed on
        cluster of workstations using MPI are given.}
}

@article{luCai01:mpi-app,
  author   = {Q. M. Lu and D. S. Cai},
  title    = {Implementation of parallel plasma particle-in-cell codes
              on {PC} cluster},
  journal  = {Computer Physics Communications},
  volume   = {135},
  number   = {1},
  pages    = {93--104},
  month    = mar,
  year     = {2001},
  abstract = {Plasma particle-in-cell (PIC) codes model the interaction
              of charged particles with the surrounding fields, and they
              have been implemented on many advanced parallel computers.
              Recently, many PC clusters which consist of inexpensive
              PCs have been developed to do parallel computing, and we
              also build such a PC cluster. In this paper, we present
              the implementation of a parallel plasma PIC code on our PC
              cluster using MPI, PGHPF and JavaMPI.}
}

@Article{yas01:mpi-app,
  author = 	 {O. Yasar},
  title = 	 {A new ignition model for spark-ignited engine simulations},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 {1--2},
  pages =	 {179--200},
  month =	 JAN,
  abstract = {The amount of spark energy deposited into the combustion chamber
        is key to an optimum ignition as one can end up with misfires
        when this energy is low or with other undesired effects on engine
        performance and byproducts when it is high. Experimentally, up
        to now, no one has been able to correlate the combustion outcome
        accurately to the spark parameters in a controllable way.
        Theoretical investigation and computer modeling is leading to a
        better understanding of how spark flames propagate. A new
        computational approach to ignition dynamics is presented here
        for spark-ignited (SI) engine combustion simulations. Our
        computational model, using the MPI communication library,
        attempts to solve temporal and spatial equations of the
        electromagnetic (EM) equations in conjunction with the well-known
        Navier-Stokes equations of the standard KIVA-3 engine code. The
        interaction between the gas and the flame (plasma) kernel in the
        spark region is computed through the momentum and energy exchange
        between these two fields. Preliminary results show a distinct
        spatial distribution of physical quantities at the flame front
        and within the inflammation zone. A slight change in the spark
        discharge current has significant impact on the combustion and
        emissions. Enhanced accuracy of spark ignition modeling might
        help us better compute the early flame propagation and its
        influence on the cyclic variability of engines, potentially
        leading to design of new spark plugs.}
}


@Article{lin01:mpi-graphics,
  author = 	 {W. S. Lin and R. W. H. Lau and K. Hwang and X. L. Lin and
                  P. Y. S. Cheung},
  title = 	 {Adaptive parallel rendering on multiprocessors and
                  workstation clusters},
  journal = 	 {IEEE Transactions on Parallel and Distributed Systems},
  year = 	 2001,
  volume =	 12,
  number =	 3,
  pages =	 {241--258},
  month =	 MAR,
  abstract = {This paper presents the design and performance of a new parallel
        graphics renderer for 3D images. This renderer is based on an
        adaptive supersampling approach that works for
        time/space-efficient execution on two classes of parallel
        computers. Our rendering scheme takes subpixel supersamples only
        along polygon edges. This leads to a significant reduction in
        rendering time and in buffer memory requirements. Furthermore,
        we offer a balanced rasterization of all transformed polygons.
        Experimental results prove these advantages on both a
        shared-memory SGI multiprocessor server and a Unix cluster of Sun
        workstations. We reveal performance effects of the new rendering
        scheme on subpixel resolution, polygon number, scene complexity,
        and memory requirements. The balanced parallel renderer
        demonstrates scalable performance with respect to increase in
        graphic complexity and in machine size. Our parallel renderer
        outperforms Crow's scheme in benchmark experiments performed.
        The improvements are made in three fronts: 1) reduction in
        rendering time, 2) higher efficiency with balanced workload, and
        3) adaptive to available buffer memory size. The balanced
        renderer can be more cost-effectively embedded within many 3D
        graphics algorithms, such as those for edge smoothing and 3D
        visualization. Our parallel renderer is MPI-coded, offering high
        portability and cross-platform performance. These advantages can
        greatly improve the QoS in 3D imaging and in real-time
        interactive graphics.}
}

@Article{got01:mpi-openmp-app,
  author = 	 {S. Gottlieb and S. Tamhankar},
  title = 	 {Benchmarking {MILC} code with {OpenMP} and {MPI}},
  journal = 	 {Nuclear Physics B-Proceedings Supplements},
  year = 	 2001,
  volume =	 94,
  pages =	 {841--845},
  month =	 MAR,
  abstract = {A trend in high performance computers that is becoming
        increasingly popular is the use of symmetric multiprocessing
        (SMP) rather than the older paradigm of MPP. MPI codes that ran
        and scaled well on MPP machines can often be run on an SMP
        machine using the vendor's version of MPI. However, this
        approach may not make optimal use of the (expensive) SMP
        hardware. More significantly, there are machines like Blue
        Horizon, an IBM SP with 8-way SMP nodes at the San Diego
        Supercomputer Center that can only support 4 MPI processes per
        node (with the current switch). On such a machine it is
        imperative to be able to use OpenMP parallelism on the node, and
        MPI between nodes. We describe the challenges of converting MILC
        MPI code to using a second level of OpenMP parallelism, and
        benchmarks on IBM and Sun computers.}
}


@article{cha00:mpi-app,
  author   = {T. Chan and V. Eijkhout},
  title    = {Design of a library of parallel preconditioners},
  journal  = {International Journal of High Performance Computing
              Applications},
  volume   = {14},
  number   = {2},
  pages    = {91--101},
  month    = {Summer},
  year     = {2000},
  abstract = {The authors outline the design principles underlying the
              ParPre library of parallel preconditioners. ParPre is a
              message-passing library of distributed preconditioners for
              linear systems, written using MPI and Petsc. It comprises
              Schwarz methods, Schur system domain decompositioning,
              various parallel incomplete factorizations, and multilevel
              methods.}
}


@Article{gro00:mpi-app,
  author = 	 {W. Gropp and D. Keyes and L. C. McInnes and M. D. Tidriri},
  title = 	 {Globalized {N}ewton-{K}rylov-{S}chwarz algorithms and
                  software for parallel implicit {CFD}},
  journal = 	 {International Journal of High Performance Computing
                  Applications},
  year = 	 2000,
  volume =	 14,
  number =	 2,
  pages =	 {102--136},
  month =	 {Summer},
  abstract = {Implicit solution methods are important in applications modeled
        by PDEs with disparate temporal and spatial scales. Because such
        applications require high resolution with reasonable turnaround,
        parallelization is essential. The pseudo-transient matrix-free
        Newton-Krylov-Schwarz (Psi NKS) algorithmic framework is
        presented as a widely applicable answer. This article shows that
        for the classical problem of three-dimensional transonic Euler
        flow about an M6 wing, Psi NKS can simultaneously deliver
        globalized, asymptotically rapid convergence through adaptive
        pseudo-transient continuation and Newton's method; reasonable
        parallelizability for an implicit method through deferred
        synchronization and favorable communication-to-computation
        scaling in the Krylov linear solver; and high per processor
        performance through attention to distributed memory and cache
        locality, especially through the Schwarz preconditioner. Two
        discouraging features of Psi NKS methods are their sensitivity
        to the coding of the underlying PDE discretization and the large
        number of parameters that must be selected to govern
        convergence. The authors therefore distill several
        recommendations from their experience and reading of the
        literature on various algorithmic components of Psi NKS, and
        they describe a freely available MPI-based portable parallel
        software implementation of the solver employed here.}
}


@Article{man01:mpi-app-perf,
  author = 	 {J. W. Manke and G. D. Kerlick and D. Levine and S. Banerjee
                  and E. Dillon},
  title = 	 {Parallel performance of two applications in the {B}oeing
                  high performance computing benchmark suite},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 4,
  pages =	 {457--475},
  month =	 MAR,
  abstract = {We describe our work to evaluate the performance of the parallel
        versions of two floating-point-intensive engineering
        applications from Boeing's high performance computing benchmark
        suite (BHPCBS) on emerging RISC parallel systems and PC
        clusters. The first application is a computational fluid
        dynamics (CFD) code, OVERFLOW, developed by NASA and used by
        Boeing for analysis and design of advanced aircraft. The second
        application is a prototype of a computational electromagnetics
        (CEM) code, developed by Boeing and used for radar cross-section
        studies. The distributed memory parallel versions of both
        applications use the message passing interface (MPI) standard
        for message passing. The goal of our work was to determine
        whether RISC parallel systems and PC clusters, which offer high
        performance at low cost, may be able to meet Boeing's computing
        requirements in the future. We describe the test environments
        for the studies, discuss parallelization issues and strategies
        and present performance data for the two applications.}
}


@Article{bag01:mpi-perf,
  author = 	 {R. Bagrodia and E. Deelman and T. Phan},
  title = 	 {Parallel simulation of large-scale parallel applications},
  journal = 	 {International Journal of High Performance Computing
                  Applications},
  year = 	 2001,
  volume =	 15,
  number =	 1,
  pages =	 {3--12},
  month =	 {Spring},
  abstract = {Accurate and efficient simulation of large parallel applications
        can be facilitated with the use of direct execution and parallel
        discrete-event simulation. This paper describes MPI-SIM, a
        direct execution-driven parallel simulator designed to predict
        the performance of existing MPI and MPI-IO applications. MPI-SIM
        can be used to predict the performance of these programs as a
        function of architectural characteristics, including number of
        processors, message communication latencies, caching algorithms,
        and alternative implementations of collective I/O operations.
        Results are presented, which show the use of MPI-SIM in
        performing a scalability study of real-world applications. The
        benchmarks chosen for the study include Sweep3D, one of the ASCI
        benchmarks, and BTIO, an I/O-intensive benchmark from the NAS
        Parallel Benchmark suite. MPI-SIM is shown to accurately and
        efficiently predict the performance of Sweep3D running on an
        Origin 2000. It is also used to demonstrate the impact of the
        number of I/O nodes on BTIO's performance.}
}


@Article{hoe01:mpi-openmp,
  author = 	 {J. Hoeflinger and P. Alavilli and T. Jackson and B. Kuhn},
  title = 	 {Producing scalable performance with {OpenMP}: {E}xperiments
                  with two {CFD} applications},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 4,
  pages =	 {391--413},
  month =	 MAR,
  abstract = {OpenMP is a relatively new programming paradigm, which can
        easily deliver good parallel performance for small numbers ($<16$)
        of processors. Success with more processors is more difficult to
        produce. MPI is a relatively mature programming paradigm, and
        there have been many reports of highly scalable MPI codes for
        large numbers (hundreds, even thousands) of processors. In this
        paper, we explore the causes of poor scalability with OpenMP
        from two points of view. First, we incrementally transform the
        loops in a combustion application until we achieve reasonably
        good parallel scalability, and chronicle the effect of each
        step. Then, we approach scalability from the other direction by
        transforming a highly scalable program simulating the core
        flow of a solid-fuel rocket engine (originally written with MPI
        calls) directly to OpenMP, and report the barriers to scalability
        that were detected. The list of incremental transformations
        includes well-known techniques such as loop interchange and loop
        fusion, plus new ones which make use of the unique features of
        OpenMP, such as barrier removal and the use of ordered
        serial loops. The list of barriers to scalability includes the
        use of the ALLOCATE statement within a parallel region, as well
        as the lack of a reduction clause for a PARALLEL region in
        OpenMP. We conclude with a list of key issues which need to be
        addressed to make OpenMP a more easily scalable paradigm. Some of
        these are OpenMP implementation issues; some are language
        issues.}
}

@Article{wal01:mpi-app,
  author = 	 {R. L. Walker},
  title = 	 {Search engine case study: searching the web using genetic
                  programming and {MPI}},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 1,
  pages =	 {71--89},
  month =	 JAN,
  abstract = {The generation of a Web page follows distinct sources for the
        incorporation of information. The earliest format of these
        sources was an organized display of known information determined
        by the page designers' interest and/or design parameters. The
        sources may have been published in books or other printed
        literature, or disseminated as general information about the
        page designer. Due to a growth in Web pages, several new search
        engines have been developed in addition to the refinement of the
        already existing ones. The use of the refined search engines,
        however, still produces an array of diverse information when the
        same set of keywords are used in a Web search. Some degree of
        consistency in the search results can be achieved over a period
        of time when the same search engine is used, yet, most initial
        Web searches on a given topic are treated as final after some
        form of refinement/adjustment of the keywords used in the search
        process. To determine the applicability of a genetic programming
        (GP) model for the diverse set of Web documents, search
        strategies behind the current search engines for the World Wide
        Web were studied. The development of a GP model resulted in a
        parallel implementation of a pseudosearch engine indexer
        simulator. The training sets used in this study provided a small
        snapshot of the computational effort required to index Web
        documents accurately and efficiently. Future results will be
        used to develop and implement Web crawler mechanisms that are
        capable of assessing the scope of this research effort. The GP
        model results were generated on a network of SUN workstations
        and an IBM SP2.}
}


@Article{dij01:mpi-app,
  author = 	 {F. Dijkstra and J. H. van Lenthe},
  title = 	 {Software news and updates - Parallel valence bond},
  journal = 	 {Journal of Computational Chemistry},
  year = 	 2001,
  volume =	 22,
  number =	 6,
  pages =	 {665--672},
  month =	 APR,
  abstract = {A parallel version of the valence bond program TURTLE has been
        developed. In this version the calculation of matrix elements is
        distributed over the processors. The implementation has been
        done using the message-passing interface (MPI), and is,
        therefore, portable. The parallel version of the program is
        shown to be quite efficient with a speed-up of 55 at 64
        processors.}
}


@article{den01:mpi-sys,
  author   = {Y. F. Deng and A. Korobka},
  title    = {The performance of a supercomputer built with commodity
              components},
  journal  = {Parallel Computing},
  year     = {2001},
  volume   = {27},
  number   = {1--2},
  pages    = {91--108},
  month    = jan,
  abstract = {We built a supercomputer called Galaxy by connecting Intel
        Pentium-based computer nodes with Fast and Gigabit Ethernet
        switches. Each node has two processors at clock speeds varying
        from 300 to 600 MHz, up to 512 MB of memory, and small 2 Gb
        local disk. All nodes run the standard RedHat Linux and
        inter-node communication is handled by a message passing
        interface called MPI. Local tools are written to visualize the
        system performance and to balance loads. We have benchmarked a
        sub-Galaxy with 72 processors by NAS and Parallel LINPACK
        benchmark suites. We achieved 16.9 Gflops in a standard single
        precision LU decomposition for 46848 x 46838 matrix parallel
        LINPACK benchmark. A Galaxy with 128 processors costs
        approximately \$250 000 and it delivers 40 Gflops of performance.
        This leads to a cost-performance ratio of 160 Kflops-per-dollar,
        which is to improve further due to increase in processor speeds
        and network bandwidth at similar cost. Our final system with
        512 processors is expected to reach several Tflops. This article
        first describes the Galaxy architectural details, and then
        present and analyze its performance in terms of floating point
        number crunching, network bandwidth, and IO throughput.}
}


@Article{cap01:mpi-smp-perf,
  author = 	 {F. Cappello and O. Richard and D. Etiemble},
  title = 	 {Understanding performance of {SMP} clusters running {MPI}
                  programs}, 
  journal = 	 {Future Generation Computer Systems},
  year = 	 2001,
  volume =	 17,
  number =	 6,
  pages =	 {711--720},
  month =	 APR,
  abstract = {Clusters of multiprocessors (CLUMPs) have an hybrid memory
        model, with message passing between nodes and shared memory
        inside nodes. We examine the performance of Myrinet clusters of
        SMP PCs when using a single memory model (SMM) based on the
        MPICH-PM/CLUMP library of the RWCP, which can directly use the
        MPI programs written for a cluster of uniprocessors. The
        specificities of the communication patterns with the SMM
        approach are detailed. PC clusters with 2-way and 4-way nodes
        are considered and compared.}
}


@Article{he01:mpi-app,
  author = 	 {Y. He and C. H. Q. Ding},
  title = 	 {Using accurate arithmetics to improve numerical
                  reproducibility and stability in parallel applications},
  journal = 	 {Journal of Supercomputing},
  year = 	 2001,
  volume =	 18,
  number =	 3,
  pages =	 {259--277},
  month =	 MAR,
  abstract = {Numerical reproducibility and stability of large scale
        scientific simulations, especially climate modeling, on
        distributed memory parallel computers are becoming critical
        issues. In particular, global summation of distributed arrays is
        most susceptible to rounding errors, and their propagation and
        accumulation cause uncertainty in final simulation results. We
        analyzed several accurate summation methods and found that two
        methods are particularly effective to improve (ensure)
        reproducibility and stability: Kahan's self-compensated
        summation and Bailey's double-double precision summation. We
        provide an MPI operator MPI\_SUMDD to work with MPI collective
        operations to ensure a scalable implementation on large number
        of processors. The final methods are particularly simple to
        adopt in practical codes: not only global summations, but also
        vector-vector dot products and matrix-vector or matrix-matrix
        operations.}
}

@Article{pro01:mpi-impl,
  author = 	 {B. V. Protopopov and A. Skjellum},
  title = 	 {A multithreaded message passing interface ({MPI})
                  architecture: Performance and program issues}, 
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2001,
  volume =	 61,
  number =	 4,
  pages =	 {449--466},
  month =	 APR,
  abstract = {This paper discusses a multithreaded software architecture for
        message-passing interface (MPI) software specification. The
        architecture is thread-safe, allows for concurrent communication
        over several communications media (multifabric communication),
        efficiently utilizes available hardware concurrency over a wide
        range of target platforms, and allows for concurrent
        communication and computation within the limits imposed by the
        hardware. The architecture is developed in the framework of the
        MPICH software architecture, a well-known MPI implementation used
        worldwide. The proposed architecture adopts wide portability of
        the MPICH design and remedies some of its deficiencies such as
        inefficient multifabric communication and non-thread-safety. The
        paper also considers the issues concerning development of
        high-performance portable message-passing systems for
        general-purpose architectures. The contributions of the paper
        are improving architecture and addressing thread safety of modern
        reliable messaging software, as well as identifying and taking
        advantage of inherent concurrency in the message-passing
        software itself.}
}


@Article{cun01:mpi-app,
  author = 	 {R. D. da Cunha and A. L. de Bortoli},
  title = 	 {A parallel {N}avier-{S}tokes solver for the rotating flow
                  problem}, 
  journal = 	 {Concurrency and Computation-Practice \& Experience},
  year = 	 2001,
  volume =	 13,
  number =	 3,
  pages =	 {163--180},
  month =	 MAR,
  abstract = {In this paper, we investigate the parallel solution of rotating
        internal flow problems, using the Navier-Stokes equations as
        proposed by Speziale and Thangam (in 1983) and Speziale (in
        1985). A Runge-Kutta time-stepping scheme was applied to the
        equations and both sequential and message-passing
        implementations were developed, the latter using MPI, and were
        tested on a four-processor SGI Origin200 distributed, global
        shared memory parallel computer and on a 32-processor IBM 9076
        SP/2 distributed memory parallel computer. The results show that
        our approach to parallelize the sequential implementation
        requires little effort whilst providing good results even for
        medium-sized problems.}
}

@Article{swan01:mpi-app,
  author = 	 {C. A. Swann},
  title = 	 {Software for parallel computing: The {LAM} implementation of
                  {MPI}}, 
  journal = 	 {Journal of Applied Econometrics},
  year = 	 2001,
  volume =	 16,
  number =	 2,
  pages =	 {185--194},
  month =	 {Mar-Apr},
  abstract = {Many econometric problems can benefit from the application of
        parallel computing techniques, and recent advances in hardware
        and software have made such application feasible. There are a
        number of freely available software libraries that make it
        possible to write message passing parallel programs using
        personal computers or Unix workstations. This review discusses
        one of these---the LAM (Local Area Multiprocessor) implementation
        of MPI (the Message Passing Interface).}
}


@Article{reis01:mpi-app,
  author = 	 {T. G. Reisin and S. C. Wurzler},
  title = 	 {Implementation of a numerical solution of the multicomponent
                  kinetic collection equation ({MKCE}) on parallel computers},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2001,
  volume =	 61,
  number =	 5,
  pages =	 {641--661},
  month =	 MAY,
  abstract = {Two different numerical solutions of the two-component kinetic
        collection equation were implemented on parallel computers. The
        parallelization approach included domain decomposition and MPI
        commands for communications. Four different parallel codes were
        tested. A dynamic decomposition based on an occupancy function
        provided the optimum balance between time performance and
        flexibility for any number of processors. The occupancy
        function was defined according to the number of calculations
        required at each grid point in the domain. Speed-up performance
        depended very much on the parallel code used and in some cases
        very good results were obtained for up to 32 processors.}
}

@article{guif01:mpi-app,
  author   = {C. Guiffaut and K. Mahdjoubi},
  title    = {A parallel {FDTD} algorithm using the {MPI} library},
  journal  = {IEEE Antennas and Propagation Magazine},
  year     = {2001},
  volume   = {43},
  number   = {2},
  pages    = {94--103},
  month    = apr,
  abstract = {In this paper, we describe the essential elements of a parallel
        algorithm for the FDTD method using the MPI (Message Passing
        Interface) library. To simplify and accelerate the algorithm, an
        MPI Cartesian 2D topology is used. The inter-process
        communications are optimized by the use of derived data types. A
        general approach is also explained for parallelizing the
        auxiliary tools, such as far-field computation, thin-wire
        treatment, etc. For PMLs, we have used a new method that makes
        it unnecessary to split the field components. This considerably
        simplifies the computer programming, and is compatible with the
        parallel algorithm.}
}

@Article{yao01:mpi-app,
  author = 	 {J. X. Yao and A. Jameson and J. J. Alonso and F. Liu},
  title = 	 {Development and validation of a massively parallel flow
                  solver for turbomachinery flows},
  journal = 	 {Journal of Propulsion and Power},
  year = 	 2001,
  volume =	 17,
  number =	 3,
  pages =	 {659--668},
  month =	 {May-June},
  abstract = {The development and validation of the unsteady,
        three-dimensional, multiblock, parallel turbomachinery flow
        solver TFLO is presented. The unsteady Reynolds-averaged
        Navier-Stokes equations are solved using a cell-centered
        discretization on arbitrary multiblock meshes. The solution
        procedure is based on efficient explicit Runge-Kutta methods
        with several convergence acceleration techniques such as
        multigrid, implicit residual smoothing, and local time stepping.
        The solver is parallelized using domain decomposition, a single
        program multiple data strategy, and the message passing
        interface standard. Details of the communication scheme and load
        balancing algorithms are discussed. A general and efficient
        procedure for parallel interblade row interfacing is developed.
        The dual-time stepping technique is used to advance unsteady
        computations in time. The focus is on improving the parallel
        efficiency and scalability of the flow solver, as well as on its
        initial validation of steady-state calculations in multiblade
        row environment. The result of this careful implementation is a
        solver with demonstrated scalability up to 1024 processors. For
        validation and verification purposes, results from TFLO are
        compared with both existing experimental data and computational
        results from other computational fluid dynamics codes used in
        aircraft engine industry.}
}
 
        
@Article{kom01:mpi-app,
  author = 	 {Y. Komeiji and M. Haraguchi and U. Nagashima},
  title = 	 {Parallel molecular dynamics simulation of a protein},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 8,
  pages =	 {977--987},
  month =	 JUL,
  abstract = {
        Program for energetic analysis of biochemical molecules (PEACH)
        is a software package for molecular dynamics (MD) simulation of
        biological molecules. The subroutines for the nonbonded
        interactions were modified to allow parallel computation by
        using the MPI library. The parallel efficiencies of the modified
        subroutines were close to 90\% or better when using 32 processors
        of an IBM SP computer. The total performance was comparable to
        that of the special-purpose computer MD-GRAPE with 8 LSI chips.
  }
}


@Article{mar01:mpi-style,
  author = 	 {B. Di Martino and A. Mazzeo and N. Mazzocca and U. Villano},
  title = 	 {Parallel program analysis and restructuring by detection of
    point-to-point interaction patterns and their transformation into
    collective communication constructs},
  journal = 	 {Science of Computer Programming},
  year = 	 2001,
  volume =	 40,
  number =	 {2--3},
  pages =	 {235--263},
  month =	 JUL,
  abstract = {
        After the presentation of the basic program analysis technique,
        several examples involving the detection of common communication
        patterns are shown. Then the structure of PPAR, a prototype tool
        that allows the analysis of parallel programs written in Fortran
        77 with calls to PVM or MPI unstructured communication
        primitives is outlined, and conclusions are drawn.}
}


@Article{lee01:mpi-app,
  author = 	 {M. Lee and W. Liu and V. K. Prasanna},
  title = 	 {Parallel implementation of a class of adaptive signal
                  processing applications},
  journal = 	 {Algorithmica},
  year = 	 2001,
  volume =	 30,
  number =	 4,
  pages =	 {645--684},
  month =	 AUG,
  abstract = {In this paper we present a methodology for mapping a class of
        adaptive signal processing applications onto HPC platforms such
        that the throughput performance is optimized. We first define a
        new task model using the salient computational characteristics
        of a class of adaptive signal processing applications. Based on
        this task model, we propose a new execution model. In the
        earlier linear pipelined execution model, the task mapping
        choices were restricted. The new model permits flexible task
        mapping choices, leading to improved throughput performance
        compared with the previous model. Using the new model, a
        three-step task mapping methodology is developed. It consists of
        (1) a data remapping step, (2) a coarse resource allocation
        step, and (3) a fine performance tuning step. The methodology is
        demonstrated by designing parallel algorithms for modern radar
        and sonar signal processing applications. These are implemented
        on IBM SP2 and Cray T3E, state-of-the-art HPC platforms, to show
        the effectiveness of our approach. Experimental results show
        significant performance improvement over those obtained by
        previous approaches. Our code is written using C and the Message
        Passing Interface (MPI). Thus, it is portable across various HPC
        platforms.}
}


@Article{take01:mpi-eval,
  author = 	 {K. Takeda and N. K. Allsopp and J. C. Hardwick and
                  P. C. Macey and D. A. Nicole and S. J. Cox and
                  D. J. Lancaster}, 
  title = 	 {An assessment of {MPI} environments for {W}indows {NT}},
  journal = 	 {Journal of Supercomputing},
  year = 	 2001,
  volume =	 19,
  number =	 3,
  pages =	 {315--323},
  abstract = {In this paper we evaluate the MPI environments currently
        available for Windows NT on the Intel IA32 and Compaq/DEC Alpha
        architectures. We present benchmark results for low-level
        communication and for the NAS Parallel Benchmarks to allow
        comparison with other systems, but our primary interest is
        determining real application performance and robustness in
        production cluster environments. For this we use PAFEC-FE, a
        large FORTRAN code for finite-element analysis. We present
        results from three MPI implementations, two architectures, and
        three networking technologies (10 and 100 Mbit/s Ethernet and 1
        Gbit/s Myrinet).}
}


@Article{chun01:mpi-app,
  author = 	 {S. H. Chung and H. C. Kwon and K. R. Ryu and Y. Chung and
                  H. Jang and C. A. Choi},  
  title = 	 {Information retrieval on an {SCI}-based {PC} cluster},
  journal = 	 {Journal of Supercomputing},
  year = 	 2001,
  volume =	 19,
  number =	 3,
  pages =	 {251--265},
  abstract = {This article presents an efficient parallel information
        retrieval (IR) system which provides fast information service
        for the Internet users on low-cost high-performance PC-NOW
        environment. The IR system is implemented on a PC cluster based
        on the scalable coherent interface (SCI), a powerful
        interconnecting mechanism for both shared memory models and
        message-passing models. In the IR system, the inverted-index
        file (IIF) is partitioned into pieces using a greedy
        declustering algorithm and distributed to the cluster nodes to
        be stored on each node's hard disk. For each incoming user's
        query with multiple terms, terms are sent to the corresponding
        nodes which contain the relevant pieces of the IIF to be
        evaluated in parallel. The IR system is developed using a
        distributed-shared memory (DSM) programming technique based on
        the SCI. According to the experiments, the IR system outperforms
        an MPI-based IR system using Fast Ethernet as an interconnect.
        Speed-up of up to 5.0 was obtained with an 8-node cluster in
        processing each query on a 500,000-document IIF.}
}


@article{pic01:mpi-app,
  author   = {S. M. Pickles and J. M. Brooke and F. C. Costen and
              E. Gabriel and M. Muller and M. Resch and S. M. Ord},
  title    = {Metacomputing across intercontinental networks},
  journal  = {Future Generation Computer Systems},
  year     = {2001},
  volume   = {17},
  number   = {8},
  pages    = {911--918},
  month    = jun,
  abstract = {An intercontinental network of supercomputers spanning more than
        10 000 miles and running challenging scientific applications was
        realized at the Supercomputing '99 (SC99) conference in
        Portland, OR using PACX-MPI and ATM PVCs. In this paper, we
        describe how we constructed the heterogeneous cluster of
        supercomputers, the problems we confronted in terms of
        multi-architecture and the way several applications handled the
        specific requirements of a metacomputer.}
}


@Article{sha01:mpi-model,
  author = 	 {H. Z. Shan and J. P. Singh},
  title = 	 {A comparison of {MPI}, {SHMEM} and cache-coherent shared
                  address space programming models on a tightly-coupled
                  multiprocessors}, 
  journal = 	 {International Journal of Parallel Programming},
  year = 	 2001,
  volume =	 29,
  number =	 3,
  pages =	 {283--318},
  month =	 JUN,
  abstract = {We compare the performance of three major programming models on
        a modern, 64-processor hardware cache-coherent machine, one of
        the two major types of platforms upon which high-performance
        computing is converging. We focus on applications that are
        either regular, predictable or at least do not require
        fine-grained dynamic replication of irregularly accessed data.
        Within this class, we use programs with a range of important
        communication patterns. We examine whether the basic parallel
        algorithm and communication structuring approaches needed for
        best performance are similar or different among the models,
        whether some models have substantial performance advantages over
        others as problem size and number of processors change, what the
        sources of these performance differences are, where the programs
        spend their time, and whether substantial improvements can be
        obtained by modifying either the application programming
        interfaces or the implementations of the programming models on
        this type of tightly-coupled multiprocessor platform.}
}


@Article{dem01:mpi-extension,
  author = 	 {E. D. Demaine and I. Foster and C. Kesselman and M. Snir},
  title = 	 {Generalized communicators in the message passing interface},
  journal = 	 {IEEE Transactions on Parallel and Distributed Systems},
  year = 	 2001,
  volume =	 12,
  number =	 6,
  pages =	 {610--616},
  month =	 JUN,
  abstract = {We propose extensions to the Message Passing Interface (MPI)
        that generalize the MPI communicator concept to allow multiple
        communication endpoints per process, dynamic creation of
        endpoints, and the transfer of endpoints between processes. The
        generalized communicator construct can be used to express a wide
        range of interesting communication structures, including
        collective communication operations involving multiple threads
        per process, communications between dynamically created threads
        or processes, and object-oriented applications in which
        communications are directed to specific objects. Furthermore,
        this enriched functionality can be provided in a manner that
        preserves backward compatibility with MPI. We describe the
        proposed extensions, illustrate their use with examples, and
        describe a prototype implementation in the popular MPI
        implementation MPICH.}
}


@Article{tro01:mpi-app,
  author = 	 {R. Trobec and M. Sterk and M. Praprotnik and D. Janezic},
  title = 	 {Implementation and evaluation of {MPI}-based parallel {MD}
                  program}, 
  journal = 	 {International Journal of Quantum Chemistry},
  year = 	 2001,
  volume =	 84,
  number =	 1,
  pages =	 {23--31},
  month =	 JUL,
  abstract = {The message-passing interface (MPI)-based object-oriented
        particle-particle interactions (PPI) library is implemented and
        evaluated. The library can be used in the n-particle simulation
        algorithm designed for a ring of p interconnected processors.
        The parallel simulation is scalable with the number of
        processors, and has the time requirement proportional to $n^2/p$
        if n/p is large enough, which guarantees optimal speedup. In a
        certain range of problem sizes, the speedup becomes superlinear
        because enough cache memory is available in the system. The
        library is used in a simple way by any potential user, even with
        no deep programming knowledge. Different simulations using
        particles can be implemented on a wide spectrum of different
        computer platforms. The main purpose of this article is to test
        the PPI library on well-known methods, e.g., the parallel
        molecular dynamics (MD) simulation of the monoatomic system by
        the second-order leapfrog Verlet algorithm. The performances of
        the parallel simulation program implemented with the proposed
        library are competitive with a custom-designed simulation code.
        Also, the implementation of the split integration symplectic
        method, based on the analytical calculation of the harmonic part
        of the particle interactions, is shown, and its expected
        performances are predicted.}
}


@Article{ahm01:mpi-alg,
  author = 	 {I. Ahmad},
  title = 	 {A distributed algorithm for finding prime compatibles on
                  network of workstations},
  journal = 	 {Microprocessors and Microsystems},
  year = 	 2001,
  volume =	 25,
  number =	 4,
  pages =	 {195--202},
  month =	 JUN,
  abstract = {State minimization of incompletely specified finite state
        machines (FSMs) is an important step of FSM synthesis. Generation
        of prime compatibles is one of the core steps in state
        minimization of incompletely specified FSMs. It is guaranteed
        that a minimal solution exists, consisting of prime compatibles
        only. But the generation of prime compatibles is both a
        compute-intensive and a memory-intensive problem. In this paper,
        we have developed and implemented a distributed algorithm,
        designated as D\_Prime, to find prime compatibles on network of
        workstations (NOWs) under message passing interface (MPI)
        environment to handle the large complexity of VLSI designs in
        future. With the advent of high-speed networks and availability
        of powerful high-performance workstations, NOW has emerged as
        the most cost-effective platform for compute-intensive and
        memory-intensive applications. Comparison of results on a number
        of MCNC benchmarks for state minimization of incompletely
        specified FSMs showed that a considerable speedup can be
        achieved by the proposed distributed algorithm as compared with
        the existing sequential counterparts.}
}


@article{ino01:mpi-model,
  author   = {F. Ino and N. Fujimoto and K. Hagihara},
  title    = {{LogGPS}: A parallel computational model for synchronization
              analysis},
  journal  = {ACM SIGPLAN Notices},
  year     = {2001},
  volume   = {36},
  number   = {7},
  pages    = {133--142},
  month    = jul,
  abstract = {We also present some experimental results using both models. The
        results include (1) a verification of the LogGPS model, (2) an
        example of synchronization analysis using an MPI program and (3)
        a comparison of the models. The results indicate that the LogGPS
        model is more accurate than the LogGP model, and analyzing
        synchronization costs is important when improving
        parallel program performance.}
}
  
      
@Article{zha01:mpi-app,
  author = 	 {W. S. Zhang and G. Q. Zhang},
  title = 	 {Prestack depth migration by hybrid method with high
                  precision and its parallel implementation},
  journal = 	 {Chinese Journal of Geophysics-Chinese Edition},
  year = 	 2001,
  volume =	 44,
  number =	 4,
  pages =	 {542--551},
  month =	 JUL,
  abstract  =    {Prestack depth migration is an important imaging method for
        complex geological structures. In this paper a generalized
        system of wavefield continuation is presented based on the
        wavefield splitting theory. The system is coupled by downgoing
        and upgoing waves, and the commonly used equation of wavefield
        continuation is only a special case of the coupled system. Based
        on the approximation of square root operator, a new hybrid
        migration method with high precision is derived. The method can
        be implemented numerically through splitting technique. Finally,
        two numerical migration examples are given, one is the poststack
        depth migration for a model with large lateral velocity
        contrasts, another is the prestack depth migration for Marmousi
        model with complex structures. The numerical results show the
        effectiveness and high precision of the method. The MPI parallel
        calculation is adopted in order to raise computational
        efficiency. The method can be used to obtain precise images for
        complex structures with large lateral velocity variations.}
}



@Article{ant01:mpi-xxx,
  author = 	 {G. Antoniu and L. Bouge and P. Hatcher and M. MacBeth and
                  K. McGuigan and R. Namyst},
  title = 	 {The {H}yperion system: Compiling multithreaded {J}ava
                  bytecode for distributed execution},
  journal = 	 {Parallel Computing},
  year = 	 2001,
  volume =	 27,
  number =	 10,
  pages =	 {1279--1297},
  month =	 SEP,
  abstract =     {Our work combines Java compilation to native code with a
        run-time library that executes Java threads in a distributed
        memory environment. This allows a Java programmer to view a
        cluster of processors as executing a single JAVA virtual
        machine. The separate processors are simply resources for
        executing Java threads with true parallelism, and the run-time
        system provides the illusion of a shared memory on top of the
        private memories of the processors. The environment we present
        is available on top of several UNIX systems and can use a large
        variety of communication interfaces thanks to the high
        portability of its run-time system. To evaluate our approach, we
        compare serial C, serial Java, and multithreaded Java
        implementations of a branch-and-bound solution to the
        minimal-cost map-coloring problem. All measurements have been
        carried out on two platforms using two different communication
        interfaces: SISCI/SCI and MPI-BIP/Myrinet.}
}

@Article{sar01:mpi-app,
  author =       {K. C. Sarma and H. Adeli},
  title =        {Bilevel parallel genetic algorithms for optimization of
                  large steel structures},
  journal =      {Computer-Aided Civil and Infrastructure Engineering},
  year =         2001,
  volume =       16,
  number =       5,
  pages =        {295--304},
  month =        SEP,
  abstract = {This article is concerned with optimization of very large steel
        structures subjected to the actual constraints of the American
        Institute of Steel Construction ASD and LRFD specifications on
        high-performance multiprocessor machines using biologically
        inspired genetic algorithms. First, parallel fuzzy genetic
        algorithms (GAs) are presented for optimization of steel
        structures using a distributed memory Message Passing Interface
        (MPI) with two different schemes: the processor farming scheme
        and the migration scheme. Next, two bilevel parallel GAs are
        presented for large-scale structural optimization through
        judicious combination of shared memory data parallel
        processing using the OpenMP Application Programming Interface
        (API) and distributed memory message passing parallel processing
        using MPI. Speedup results are presented for parallel algorithms.}
}

@Article{yil01:mpi-app,
  author =       {E. Yilmaz and M. S. Kavsaoglu and H. U. Akay and
                  I. S. Akmandor},
  title =        {Cell-vertex based parallel and adaptive explicit 3{D} flow
                  solution on unstructured grids},
  journal =      {International Journal of Computational Fluid Dynamics},
  year =         2001,
  volume =       14,
  number =       4,
  pages =        {271--286},
  abstract = {A parallel adaptive Euler flow solution algorithm is developed
        for 3D applications on distributed memory computers. Significant
        contribution of this research is the development and
        implementation of a parallel grid adaptation scheme together with
        an explicit cell vertex-based finite volume 3D flow solver on
        unstructured tetrahedral grids. Parallel adaptation of grids is
        based on grid-regeneration philosophy by using an existing
        serial grid generation program. Then, a general partitioner
        repartitions the grid. An adaptive sensor value, which is a
        measure to refine or coarsen grids, is calculated considering
        the pressure gradients in all partitioned blocks of grids. The
        parallel performance of the present study was tested. Parallel
        computations were performed on Unix workstations and a Linux
        cluster using MPI communication library. The present results
        show that overall adaptation scheme developed in this study is
        applicable to any pair of a flow solver and grid generator with
        affordable cost. It is also proved that parallel adaptation is
        necessary for accurate and efficient flow solutions.}
}


@Article{cha01:mpi-app,
  author =       {H. Y. Chang and K. C. Huang and C. Y. Shen and S. C. Tcheng
                  and C. Y. Chou},
  title =        {Parallel computation of a weather model in a cluster
                  environment},
  journal =      {Computer-Aided Civil and Infrastructure Engineering},
  year =         2001,
  volume =       16,
  number =       5,
  pages =        {365--373},
  month =        SEP,
  abstract =  {Recently, the superior and continuously improving
        cost-performance ratio of commodity hardware and software has
        made PC clustering a popular alternative for high-performance
        computing in both academic institutes and industrial
        organizations. The purpose of this work is to use PC clusters to
        solve a weather-prediction model in parallel mode, and the
        result also will be compared with those obtained on some
        conventional parallel platforms such as the Fujitsu VPP300, IBM
        SP2 (160 and 120 MHz), and HP SPP2200. Techniques of domain
        decomposition and data communication are used to exploit
        parallelism of the model. Interprocessor data communication is
        done by the Message Passing Interface communication library
        routines. Two versions of the parallel codes, one with longitude
        decomposition and the other with latitude decomposition, are
        tested and compared. Speedups of the parallel weather model
        on these machines with various numbers of processors show that
        substantial reductions in computation time can be achieved as
        compared with sequential runs.}
}

@Article{bgl00:mpi-impl,
  author =       {Ralph Butler and William Gropp and Ewing Lusk},
  title =        {Components and Interfaces of a Process Management System for
                  Parallel Programs},
  journal =      {Parallel Computing},
  month =        OCT,
  year =         2001,
  volume =       27,
  number =       11,
  pages =        {1417--1429},
  abstract =   {Parallel jobs are different from sequential jobs and require a
        different type of process management. We present here a process
        management system for parallel programs such as those written
        using MPI. A primary goal of the system, which we call MPD (for
        multipurpose daemon), is to be scalable. By this we mean that
        startup of interactive parallel jobs comprising thousands
        of processes is quick, that signals can be quickly delivered to
        processes, and that stdin, stdout, and stderr are managed
        intuitively. Our primary target is parallel machines made up of
        clusters of SMPs, but the system is also useful in more tightly
        integrated environments. We describe how MPD enables fast startup
        and convenient runtime management of parallel jobs. We show how
        close control of stdio can support the easy implementation of a
        number of convenient system utilities, even a parallel debugger.
        We describe a simple but general interface that can be used to
        separate any process manager from a parallel library, which we
        use to keep MPD separate from MPICH.}
}


@Article{myl01:mpi2-impl,
  author =       {S. Moh and C. S. Yu and B. Lee and H. Y. Youn and D. S. Han
                  and D. Lee},
  title =        {Four-ary tree-based barrier synchronization for 2{D} meshes
                  without nonmember involvement},
  journal =      {IEEE Transactions on Computers},
  year =         2001,
  volume =       50,
  number =       8,
  pages =        {811--823},
  month =        AUG,
  abstract =     {This paper proposes a Barrier Tree for Meshes (BTM) to
        minimize the barrier synchronization latency for two-dimensional
        (2D) meshes. The proposed BTM scheme has two distinguishing
        features. First, the synchronization tree is 4-ary. The
        synchronization latency of the BTM scheme is asymptotically
        $\Theta(\log_4 n)$, while that of the fastest scheme reported in
        the literature is bounded between $\Omega(\log_3 n)$ and
        $O(n^{1/2})$, where $n$ is the number of member nodes. Second,
        nonmember nodes are neither
        involved in the construction of a BTM nor actively participate
        in the synchronization operations, which avoids interference
        among different process groups during synchronization. This not
        only results in low setup overhead, but also reduces the
        synchronization latency. The low setup overhead is particularly
        effective for the dynamic process model provided in MPI-2.
        Extensive simulation study shows that, for up to 64 x 64 meshes,
        the BTM scheme results in about 40 to 70 percent shorter
        synchronization latency and is more scalable than conventional
        schemes.}
}

 
@Article{fbd01:mpi-impl,
  author =       {G. E. Fagg and A. Bukovsky and J. J. Dongarra},
  title =        {{HARNESS} and fault tolerant {MPI}},
  journal =      {Parallel Computing},
  year =         2001,
  volume =       27,
  number =       11,
  pages =        {1479--1495},
  month =        OCT,
  abstract = {Initial versions of MPI were designed to work efficiently on
        multi-processors which had very little job control and thus
        static process models. Subsequently forcing them to support a
        dynamic process model would have affected their performance. As
        current HPC systems increase in size with greater potential
        levels of individual node failure, the need arises for new fault
        tolerant systems to be developed. Here we present a new
        implementation of MPI called fault tolerant MPI (FT-MPI) that
        allows the semantics and associated modes of failures to be
        explicitly controlled by an application via a modified MPI API.
        Given is an overview of the FT-MPI semantics, design,
        example applications, debugging tools and some performance
        issues. Also discussed is the experimental HARNESS core
        (G\_HCORE) implementation that FT-MPI is built to operate upon.}
}


@Article{kbg01:mpi-impl,
  author =       {T. Kielmann and H. E. Bal and S. Gorlatch and K. Verstoep
                  and R. F. H. Hofman},
  title =        {Network performance-aware collective communication for
                  clustered wide-area systems},
  journal =      {Parallel Computing},
  year =         2001,
  volume =       27,
  number =       11,
  pages =        {1431--1456},
  month =        OCT,
  abstract = {Metacomputing infrastructures couple multiple clusters (or MPPs)
        via wide-area networks. A major problem in programming parallel
        applications for such platforms is their hierarchical network
        structure: latency and bandwidth of WANs often are orders of
        magnitude worse than those of local networks. Our goal is to
        optimize MPI's collective operations for such platforms. We use
        two techniques: selecting suitable communication graph shapes,
        and splitting messages into multiple segments that are sent in
        parallel over different WAN links. To optimize graph shape and
        segment size at runtime, we introduce a performance model called
        Parameterized Log P (P - Log P), a hierarchical extension of the
        Log P model that covers messages of arbitrary length. An
        experimental performance evaluation shows that the newly
        implemented collective operations have significantly improved
        performance for large messages, and that there is a close match
        between the theoretical model and the measured completion times.}
}


@Article{ll01:mpi-openmp,
  author =       {G. R. Luecke and W. H. Lin},
  title =        {Scalability and performance of {OpenMP} and {MPI} on a
                  128-processor {SGI} {Origin2000}},
  journal =      {Concurrency and Computation-Practice \& Experience},
  year =         2001,
  volume =       13,
  number =       10,
  pages =        {905--928},
  month =        AUG,
  abstract = {The purpose of this paper is to investigate the scalability and
        performance of seven, simple OpenMP test programs and to compare
        their performance with equivalent MPI programs on an SGI Origin
        2000. Data distribution directives were used to make sure that
        the OpenMP implementation had the same data distribution as the
        MPI implementation. For the matrix-times-vector (test 5) and the
        matrix-times-matrix (test 7) tests, the syntax allowed in OpenMP
        1.1 does not allow OpenMP compilers to be able to generate
        efficient code since the reduction clause is not currently
        allowed for arrays. (This problem is corrected in OpenMP 2.0.)
        For the remaining five tests, the OpenMP version performed and
        scaled significantly better than the corresponding MPI
        implementation, except for the right shift test (test 2) for a
        small message.}
}


@Article{pas01:mpi-app,
  author =       {G. Passoni and P. Cremonesi and G. Alfonsi},
  title =        {Analysis and implementation of a parallelization strategy on
                  a {N}avier-{S}tokes solver for shear flow simulations},
  journal =      {Parallel Computing},
  year =         2001,
  volume =       27,
  number =       13,
  pages =        {1665--1685},
  month =        DEC,
  abstract = {A parallel computational solver for the unsteady incompressible
        three-dimensional Navier-Stokes equations implemented for the
        numerical simulation of shear flow cases is presented. The
        computational algorithms include Fourier expansions in the
        streamwise and spanwise directions, second-order centered finite
        differences in the direction orthogonal to the solid walls,
        third-order Runge-Kutta procedure in time in which both
        convective and diffusive terms are treated explicitly; the
        fractional step method is used for time marching. Based on the
        numerical algorithms implemented within the computational
        solver, three different (MPI based) parallelization strategies
        are devised. The three schemes are evaluated with particular
        attention to the impact of the communications onto the whole
        computational procedure, and one of them is implemented.
        Computations are executed on two different parallel machines and
        results are shown in terms of parallel performance. Processes
        using different number of processors combined with different
        number of computational grid points are tested.}
}


@Article{ber01:mpi-openmp,
  author =       {J. Y. Berthou and E. Fayolle},
  title =        {Comparing {OpenMP}, {HPF}, and {MPI} programming: A study
                  case},
  journal =      {International Journal of High Performance Computing
                  Applications},
  year =         2001,
  volume =       15,
  number =       3,
  pages =        {297--309},
  abstract = {This paper presents a comparison of three programming
        models-OpenMP, HPF, and MPI-applied to a diphasic compressible
        fluid mechanics code. The parallelization analysis is conducted,
        and the authors also present the experimental results obtained
        on various platforms: a Compaq Proliant 6000 (4 processors), a
        Cray T3E-750 (300 processors), an HP Class V (16 processors), a
        SGI Origin 2000 (32 processors), a cluster of PCs, and a COMPAQ
        SC 232 (232 processors). These experimental results will be
        discussed according to the following criteria: efficiency,
        scalability, maintainability, developing costs, and portability.
        As a conclusion, the authors present the parallelization
        strategy recommended for codes comparable to ECOSS.}
}


@Article{ber01:mpi-alg,
  author =       {L. Bergamaschi and I. Moret and G. Zilli},
  title =        {Inexact {Q}uasi-{N}ewton methods for sparse systems of
                  nonlinear equations},
  journal =      {Future Generation Computer Systems},
  year =         2001,
  volume =       18,
  number =       1,
  pages =        {41--53},
  month =        SEP,
  abstract = {In this paper, we present the results obtained by solving
        consistent sparse systems of n nonlinear equations F(x) = 0, by a
        Quasi-Newton method combined with a p block iterative
        row-projection linear solver of Cimmino type,
        $1 \leq p \ll n$. Under weak regularity conditions for F, it is
        proved that this Inexact Quasi-Newton method has a local, linear
        convergence in the energy norm induced by the preconditioned
        matrix HA, where A is an initial guess of the Jacobian matrix,
        and it may converge too superlinearly. The matrix H =
        [A(1)(+),...,A(i)(+),...,A(p)(+)], where A(i)(+) =
        A(i)(T)(A(i)A(i)(T))(-1) is the Moore-Penrose pseudo-inverse of
        the mi x n block A(i), the preconditioner. A simple partitioning
        of the Jacobian matrix was used for solving a set of nonlinear
        test problems with sizes ranging from 1024 to 131 072 on the CRAY
        T3E under the MPI environment.}
}



@Article{neo01:mpi-tool,
  author =       {N. Neophytou and P. Evripidou},
  title =        {{Net-dbx}: A web-based debugger of {MPI} programs over
                  low-bandwidth lines},
  journal =      {IEEE Transactions on Parallel and Distributed Systems},
  year =         2001,
  volume =       12,
  number =       9,
  pages =        {986--995},
  month =        SEP,
  abstract = {This paper describes Net-dbx, a tool that utilizes Java and
        other World Wide Web tools for the debugging of MPI programs
        from anywhere in the Internet. Net-dbx is a source-level
        interactive debugger with the full power of gdb (the GNU
        Debugger) augmented with the debug functionality of the
        public-domain MPI implementation environments. The main effort
        was on a low overhead, yet powerful, graphical interface
        supported by low-bandwidth connections. The portability of the
        tool is of great importance as well because it enables the tool
        to be used on heterogeneous nodes that participate in an MPI
        multicomputer. Both needs are satisfied a great deal by the use
        of WWW browsing tools and the Java programming language. The
        user of our system simply points his/her browser to the Net-dbx
        page, logs in to the destination system, and starts debugging by
        interacting with the tool, just as with any GUI environment. The
        user can dynamically select which MPI processes to view/debug. A
        special WWW-based environment has been designed and implemented
        to host the system prototype.}
}


@Article{ree01:mpi-alg,
  author =       {J. S. Reeve and A. D. Scurr and J. H. Merlin},
  title =        {Parallel versions of {S}tone's strongly implicit algorithm},
  journal =      {Concurrency and Computation-Practice \& Experience},
  year =         2001,
  volume =       13,
  number =       12,
  pages =        {1049--1062},
  month =        OCT,
  abstract = {In this paper, we describe various methods of deriving a
        parallel version of Stone's Strongly Implicit Procedure (SIP)
        for solving sparse linear equations arising from finite
        difference approximation to partial differential equations
        (PDEs). Sequential versions of this algorithm have been very
        successful in solving semi-conductor, heat conduction and flow
        simulation problems and an efficient parallel version would
        enable much larger simulations to be run. An initial
        investigation of various parallelizing strategies was undertaken
        using a version of high performance Fortran (HPF) and the best
        methods were reprogrammed using the MPI message passing
        libraries for increased efficiency. Early attempts concentrated
        on developing a parallel version of the characteristic wavefront
        computation pattern of the existing sequential SIP code.
        However, a red-black ordering of grid points, similar to that
        used in parallel versions of the Gauss-Seidel algorithm, is
        shown to be far more efficient. The results of both the
        wavefront and red-black MPI based algorithms are reported for
        various size problems and number of processors on a sixteen node
        IBM SP2.}
}


@Article{kre01:mpi-app,
  author =       {H. Kremer and F. May and S. Wirtz},
  title =        {The influence of furnace design on the {NO} formation in
                  high temperature processes},
  journal =      {Energy Conversion and Management},
  year =         2001,
  volume =       42,
  number =       {15--17},
  pages =        {1937--1952},
  month =        {Oct-Nov},
  abstract = {High temperature processes produce high NO$_x$ emissions due to
        their elevated working temperatures. Strong regulations for
        emissions of pollutants [1] from industrial plants lead the
        operators to optimize their furnaces. In this paper a
        three-dimensional mathematical model for turbulent flow and
        combustion on the basis of turbulence-chemistry interactions and
        radiative heat transfer taking into account spectral effects of
        surrounding walls and combustion gases is described. The
        transport equation for radiative intensity was split into
        different wavelength ranges. A block-structured finite
        volume grid with local refinements was used to solve the
        governing equations. The calculation domain is subdivided into a
        number of subdomains which are linked within the solver based on
        the message passing interface (MPI) library. Computed
        distributions of velocity, temperature, species distribution and
        heat fluxes are given. Results of a parametric study in a
        producing horseshoe furnace by increasing the height of the
        furnace with regard to NO$_x$ concentration distributions are
        presented.}
}


@Article{he01:mpi-alg,
  author =       {X. He and C. H. Huang},
  title =        {Communication efficient {BSP} algorithm for all nearest
                  smaller values problem},
  journal =      {Journal of Parallel and Distributed Computing},
  year =         2001,
  volume =       61,
  number =       10,
  pages =        {1425--1438},
  month =        OCT,
  abstract = {We present a BSP (Bulk Synchronous Parallel) algorithm for
        solving the All Nearest Smaller Values Problem (ANSVP), a
        fundamental problem in both graph theory and computational
        geometry. Our algorithm achieves optimal sequential computation
        time and uses only three communication supersteps. In the worst
        case, each communication phase takes no more than an (n/p +
        p)-relation, where p is the number of the processors. In
        addition, our average-case analysis shows that, on random
        inputs, the expected communication requirements for all three
        steps are bounded above by a p-relation, which is independent of
        the problem size n. Experiments have been carried out on an SGI
        Origin 2000 with 32 R10000 processors and a SUN Enterprise 4000
        multiprocessing server supporting 8 UltraSPARC processors, using
        the MPI libraries. The results clearly demonstrate the
        communication efficiency and load balancing for computation.}
}


@Article{bea01:mpi-app,
  author =       {O. Beaumont and V. Boudet and F. Rastello and Y. Robert},
  title =        {Matrix multiplication on heterogeneous platforms},
  journal =      {IEEE Transactions on Parallel and Distributed Systems},
  year =         2001,
  volume =       12,
  number =       10,
  pages =        {1033--1051},
  month =        OCT,
  abstract =   {In this paper, we address the issue of implementing matrix
        multiplication on heterogeneous platforms. We target two
        different classes of heterogeneous computing resources:
        heterogeneous networks of workstations and collections of
        heterogeneous clusters. Intuitively, the problem is to load
        balance the work with different speed resources while minimizing
        the communication volume. We formally state this problem in a
        geometric framework and prove its NP-completeness. Next, we
        introduce a (polynomial) column-based heuristic, which turns out
        to be very satisfactory: We derive a theoretical performance
        guarantee for the heuristic and we assess its practical
        usefulness through MPI experiments.}
}


@Article{ban01:mpi-impl,
  author =       {M. Banikazemi and R. K. Govindaraju and R. Blackmore and
                  D. K. Panda},
  title =        {{MPI-LAPI}: An efficient implementation of {MPI} for
                  {IBM} {RS/6000} {SP} systems},
  journal =      {IEEE Transactions on Parallel and Distributed Systems},
  year =         2001,
  volume =       12,
  number =       10,
  pages =        {1081--1093},
  month =        OCT,
  abstract =     {The IBM RS/6000 SP system is one of the most cost-effective
        commercially available high performance machines. IBM RS/6000 SP
        systems support the Message Passing Interface standard (MPI) and
        LAPI. LAPI is a low level, reliable, and efficient one-sided
        communication API library implemented on IBM RS/6000 SP
        systems. This paper explains how the high performance of the
        LAPI library has been exploited in order to implement the MPI
        standard more efficiently than the existing MPI. It describes
        how to avoid unnecessary data copies at both the sending and
        receiving sides for such an implementation. The resolution of
        problems arising from the mismatches between the requirements of
        the MPI standard and the features of LAPI is discussed. As a
        result of this exercise, certain enhancements to LAPI are
        identified to enable an efficient implementation of MPI on LAPI.
        The performance of the new implementation of MPI is compared
        with that of the underlying LAPI itself. The latency (in polling
        and interrupt modes) and bandwidth of our new implementation is
        compared with that of the native MPI implementation on RS/6000
        SP systems. The results indicate that the MPI implementation on
        LAPI performs comparably to or better than the original MPI
        implementation in most cases. Improvements of up to 17.3 percent
        in polling mode latency, 35.8 percent in interrupt mode latency,
        and 20.9 percent in bandwidth are obtained for certain message
        sizes. The implementation of MPI on top of LAPI also outperforms
        the native MPI implementation for the NAS Parallel Benchmarks.}
}


@Article{liRa01:mpi-app,
  author =       {M. Z. Li and O. F. Rana and D. W. Walker},
  title =        {Wrapping {MPI}-based legacy codes as {Java/CORBA}
                  components},
  journal =      {Future Generation Computer Systems},
  year =         2001,
  volume =       18,
  number =       2,
  pages =        {213--223},
  month =        OCT,
  abstract = {Techniques for wrapping an MPI-based molecular dynamics (MD)
        simulation code as Java/CORBA components, for use within a
        distributed component based problem solving environment
        (CB-PSE), is presented. A legacy code for simulating a
        Lennard-Jones fluid is first wrapped as a single CORBA object,
        followed by division of the code into computational sub-units,
        where each sub-unit is wrapped as a CORBA object containing MPI
        calls, and run on a cluster of workstations - enabling different
        MPI implementations to inter-operate. Using a Java
        implementation, users can submit simulation tasks through a
        Web-based interface, without needing to know implementation
        details of the legacy code, or the exact interaction between
        sub-units within the code. We provide performance comparisons of
        wrapping the entire MD code as a single object versus wrapping
        sub-units within it, and offer a simple performance model to
        explain our findings.}
}

@Article{beau01:mpi-app,
  author =       {O. Beaumont and V. Boudet and A. Petitet and
                  F. Rastello and Y. Robert},
  title =        {A proposal for a heterogeneous cluster {ScaLAPACK}
                  (dense linear solvers)},
  journal =      {IEEE Transactions on Computers},
  year =         2001,
  volume =       50,
  number =       10,
  pages =        {1052--1070},
  month =        OCT,
  abstract = {In this paper, we study the implementation of dense linear
        algebra kernels, such as matrix multiplication or linear system
        solvers, on heterogeneous networks of workstations. The uniform
        block-cyclic data distribution scheme commonly used for
        homogeneous collections of processors limits the performance of
        these linear algebra kernels on heterogeneous grids to the speed
        of the slowest processor. We present and study more
        sophisticated data allocation strategies that balance the load
        on heterogeneous platforms with respect to the performance of
        the processors. When targeting unidimensional grids, the
        load-balancing problem can be solved rather easily. When
        targeting two-dimensional grids, which are the key to
        scalability and efficiency for numerical kernels, the problem
        turns out to be surprisingly difficult. We formally state the 2D
        load-balancing problem and prove its NP-completeness. Next, we
        introduce a data allocation heuristic, which turns out to be
        very satisfactory: Its practical usefulness is demonstrated by
        MPI experiments conducted with a heterogeneous network of
        workstations.}
}

@Article{corn01:mpi-app,
  author =       {C. F. Cornwell and L. T. Wille and Y. G. Wu and
                  F. H. Sklar},
  title =        {Parallelization of an ecological landscape model by
                  functional decomposition},
  journal =      {Ecological Modelling},
  year =         2001,
  volume =       144,
  pages =        {13--20},
  month =        OCT,
  abstract = {A functional scheme is described to parallelize computer
        simulations of grid-based ecological landscape models. The
        method is implemented using the Message Passing Interface
        protocol and is applied to the Everglades Landscape Vegetation
        Model. On a two-processor system, the speed-up is satisfactory
        and the overall performance of the program is competitive with
        traditional parallelization techniques such as geometrical
        decomposition. The method is discussed, timing information is
        provided for three different parallel machines, and some further
        developments are indicated.}
}


@Article{sama01:mpi-app,
  author =       {M. Y. Saman and D. J. Evans},
  title =        {Distributed computing on cluster systems},
  journal =      {International Journal of Computer Mathematics},
  year =         2001,
  volume =       78,
  number =       3,
  pages =        {383--397},
  abstract = {Message Passing Interface (MPI) allows a group of computers in a
        network to be specified as a cluster system. It provides the
        routines for task activation and communication. Writing programs
        for a cluster system is a difficult job. In this paper, the
        Message Passing Interface is presented. Parallel programs using
        the WMPI, a version of MPI, to solve the pi ($\pi$) calculation,
        the quick sort algorithm and the Torsion problem are presented.
        The programs are written and compiled in Microsoft Visual C++.}
}


@Article{raas01:mpi-app,
  author =       {S. Raasch and M. Schr{\"o}ter},
  title =        {{PALM}---{A} large-eddy simulation model performing
                  on massively parallel computers},
  journal =      {Meteorologische Zeitschrift},
  year =         2001,
  volume =       10,
  number =       5,
  pages =        {363--372},
  abstract = {An existing code of a large-eddy simulation (LES) model for the
        study of turbulent processes in the atmospheric and oceanic
        boundary layer has been completely recoded for use on massively
        parallel systems with distributed memory. Parallelization is
        achieved by two-dimensional domain decomposition and
        communication is realized by the message passing interface
        (MPI). Periodic boundary conditions, which are used in both
        horizontal directions, helped to minimize the parallelization
        effort. The performance of the new PArallelized LES Model (PALM)
        is excellent on SGI/Cray-T3E systems and an almost linear
        speed-up is achieved up to very large numbers of processors.
        Parallelization strategy and model performance is discussed and
        validation experiments as well as future applications are
        presented.}
}

@Article{lu01:mpi-app,
  author = 	 {P. Lu},
  title = 	 {Integrating bulk-data transfer into the {A}urora
                  distributed shared data system},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2001,
  volume =	 61,
  number =	 11,
  pages =	 {1609--1632},
  month =	 NOV, 
  abstract = {The Aurora distributed shared data system implements a
        shared-data abstraction on distributed-memory platforms, such as
        clusters, using abstract data types. Aurora programs are written
        in C++ and instantiate shared-data objects whose data-sharing
        behaviour can be optimized using a novel technique called scoped
        behaviour. Each object and each phase of the computation
        (i.e., use-context) can be independently optimized with
        per-object and per-context flexibility. Within the scoped
        behaviour framework, optimizations such as bulk-data transfer can
        be implemented and made available to the application programmer.
        Scoped behaviour carries semantic information regarding the
        specific data-sharing pattern through various layers of
        software. We describe how the optimizations are integrated from
        the uppermost application-programmer layers down to the lowest
        UDP-based layers of the Aurora system. A bulk-data transfer
        network protocol bypasses some bottlenecks associated with TCP/IP
        and achieves higher performance on an ATM network than either
        TreadMarks (distributed shared memory) or MPICH (message
        passing) for matrix multiplication and parallel sorting.}
}

@Article{brig01:mpi-impl,
  author = 	 {R. Brightwell and S. Plimpton},
  title = 	 {Scalability and performance of two large {L}inux clusters},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2001,
  volume =	 61,
  number =	 11,
  pages =	 {1546--1569},
  month =	 NOV, 
  abstract = {In this paper, we present performance results from several
        parallel benchmarks and applications on two large Linux clusters
        at Sandia National Laboratories. We compare the results on the
        Linux clusters to performance obtained on a traditional
        distributed-memory massively parallel processing machine, the
        Intel TeraFLOPS. We discuss the characteristics of these
        machines that influence the performance results and identify the
        key components of the system that are important to allow for
        scalability of commodity-based PC clusters to hundreds and
        possibly thousands of processors.}
}


@Article{diPi01:mpi-app,
  author = 	 {M. Di Pierro},
  title = 	 {Matrix distributed processing: a set of {C++} tools
                  for implementing generic lattice computations on
                  parallel systems}, 
  journal = 	 {Computer Physics Communications},
  year = 	 2001,
  volume =	 141,
  number =	 1,
  pages =	 {98--148},
  month =	 NOV,
  abstract = {We present a set of programming tools (classes and functions
        written in C++ and based on Message Passing Interface) for fast
        development of generic parallel (and non-parallel) lattice
        simulations. They are collectively called MDP 1.2. These
        programming tools include classes and algorithms for matrices,
        random number generators, distributed lattices (with arbitrary
        topology), fields and parallel iterations. No previous knowledge
        of MPI is required in order to use them. Some applications in
        electromagnetism, electronics, condensed matter and lattice QCD
        are presented.}
}

% NOTE(review): the key below reads "ahan01" although the first author is
% Zhang; presumably a typo for "zhan01" -- confirm before renaming, since
% existing documents may already cite the current key.
@Article{ahan01:mpi-app,
  author   = {X. Zhang and B. Wang and Z. Z. Ji},
  title    = {Performance of a parallel finite difference atmospheric
              general circulation model},
  journal  = {Advances in Atmospheric Sciences},
  year     = {2001},
  volume   = {18},
  number   = {6},
  pages    = {1175--1184},
  abstract = {A new version of the Institute of Atmospheric Physics
              (IAP) 9-Layer (9L) atmospheric general circulation model
              (AGCM) suitable for Massively Parallel Processor (MPP)
              has been developed. This paper presents the principles
              of the parallel code design and examines its performance
              on a variety of state-of-the-art parallel computers in
              China. Domain decomposition strategy is used to achieve
              parallelism that is implemented by Message Passing
              Interface (MPI). Only the one dimensional domain
              decomposition algorithm is shown to scale favorably as
              the number of processors is increased.}
}

@Article{boul01:mpi-app,
  author = 	 {C. Bouldin and J. Sims and H. Hung and J. J. Rehr
                  and A. L. Ankudinov},
  title = 	 {Rapid calculation of x-ray absorption near edge
                  structure using parallel computation},
  journal = 	 {X-Ray Spectrometry},
  year = 	 2001,
  volume =	 30,
  number =	 6,
  pages =	 {431--434},
  month =	 nov-dec,
  abstract = {Modeling x-ray absorption near edge structure (XANES) requires
        computationally intensive calculations. We show that parallel
        processing can reduce the time required for XANES calculations by
        a factor of up to 50 over standard desktop computers. Parallel
        processing is implemented in our codes using the Message Passing
        Interface (MPI) and is portable across most hardware
        and operating systems. We demonstrate the inverse scaling of the
        parallel algorithm with the number of processors, and discuss
        how this approach to parallel processing could be implemented in
        other multiple-scattering calculations. Faster calculations
        should improve the applicability of ab initio XANES studies to
        many materials science problems.}
}

@Article{behr01:mpi-app,
  author = 	 {M. Behr},
  title = 	 {Stabilized space-time finite element formulations
                  for free-surface flows}, 
  journal = 	 {Communications in Numerical Methods in Engineering},
  year = 	 2001,
  volume =	 17,
  number =	 11,
  pages =	 {813--819},
  month =	 NOV,
  abstract = {Aspects of a method for 3D finite element computation of
        unsteady, incompressible free-surface flow are presented. The
        approach is based on the deformable-spatial-domain/stabilized
        space-time (DSD/SST) finite element formulation, which takes
        automatically into account the deformation of the elements in
        response to the motion of the free surface. The free-surface
        elevation is governed by a kinematic free-surface condition,
        which is also solved with a stabilized formulation. A new
        governing equation and stabilized formulation is derived for
        cases where the channel walls are not vertical. The parallel
        implementation based on MPI message-passing standard is fully
        portable, and have been demonstrated to be scalable on a range
        of architectures. A 3D computation of a flow past a spillway of a
        dam is shown as an example application.}
}


@Article{he02:mpi-app,
  author = 	 {F. S. He and H. Wu},
  title = 	 {An efficient parallel implementation of the
                  {E}verglades {L}andscape {F}ire {M}odel using checkpointing},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 1,
  pages =	 {65--82},
  month =	 JAN,
  abstract = {This paper presents a low-communication overhead and
        high-performance data parallelism implementation of the
        Everglades Landscape Fire Model (ELFM) in a network of
        workstations (NOWs). ELFM is parallelized under Message Passing
        Interface (MPI). Checkpointing and rollback technologies are
        used to handle the spread of fire which is a dynamic and
        irregular component of the model. A parallel application model
        with the mixture of a variety of asynchronous and synchronous
        computation is developed. In this model, the asynchronous
        computation is dominant and synchronous computation is
        intermittent. The length of each synchronous computation also
        varies. Based on the developed model, a synchronous
        check-pointing mechanism is used in the parallel ELFM code under
        MPI. A simulation is conducted and results show that the
        performance of the ELFM under MPI is significantly enhanced by
        the application of checkpointing and rollback. }
}

@Article{soda02:mpi-app,
  author = 	 {A. C. Sodan},
  title = 	 {Applications on a multithreaded architecture: A case
                  study with {EARTH-MANNA}}, 
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 1,
  pages =	 {3--33},
  month =	 JAN, 
  abstract = {Multithreading offers benefits with respect to the formulation
        of irregular dynamic programs and their dynamic scheduling, load
        balancing and interaction. Furthermore, low-cost communication
        on distributed-memory machines by remote-memory access is
        provided by some systems for efficient communication. EARTH is
        one of the few systems which combines both, while most other
        systems either focus on communication or provide multithreading
        in shared-memory environments. Dynamic irregular applications
        are often awkward to parallelize on distributed memory when
        using SPMD style programming via MPI and show different
        requirements for formulation. In addition, dynamic
        irregular applications also may show a fairly tight data
        coupling. Systems like EARTH are beneficial then, because they
        specifically support large number of small data exchanges by
        providing short startup times and the tolerance of even small
        latencies (offering very fine-grain threads). However, static
        regular applications with tight data coupling are supported too.
        On the example of EARTH, this paper investigates the benefits of
        low-cost communication and multithreading, parallelizing three
        AI applications with medium to high communication intensity. We
        present experimental results obtained on the MANNA machine.}
}

@Article{wang02:mpi-app,
  author = 	 {P. Wang and K. Y. Liu and T. Cwik and R. Green},
  title = 	 {{MODTRAN} on supercomputers and parallel computers},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 1,
  pages =	 {53--64},
  month =	 JAN,
  abstract = {To enable efficient reduction of large data sets such as is done
        in the Airborne Visible/Infrared Imaging Spectrometer (AVIRIS)
        project at the Jet Propulsion Laboratory (JPL), a high
        performance version of MODTRAN is essential. One means to
        accomplish this is to apply the computational resources of
        parallel computer systems. In our present work, a flexible,
        parallel version of MODTRAN has been implemented on the Cray
        T3E, the HP SPP2000, and a Beowulf-class cluster computer using
        domain decomposition techniques and the Message Passing
        Interface (MPI) library. In this paper, porting the sequential
        MODTRAN to various platforms is discussed; strategies of
        designing a parallel version of MODTRAN are developed; detailed
        implementation for a parallel MODTRAN is reported, and
        performance data of the parallel code on various computers are
        presented. Near linear scaling performance of parallel MODTRAN
        has been obtained, and comparisons of wallclock time are made
        among various supercomputers and parallel computers. The
        parallel version of MODTRAN gives excellent speedup, which
        dramatically reduces total data processing time for many
        applications such as the AVIRIS project at JPL.}
}

@Article{acac02:mpi-impl,
  author = 	 {M. Acacio and O. Canovas and J. M. Garcia and
                  P. E. Lopez-de-Teruel}, 
  title = 	 {{MPI-Delphi}: an {MPI} implementation for visual
                  programming environments and heterogeneous computing},
  journal = 	 {Future Generation Computer Systems},
  year = 	 2002,
  volume =	 18,
  number =	 3,
  pages =	 {317--333},
  month =	 JAN, 
  abstract = {The goal of a parallel program is to reduce the execution time,
        compared to the fastest sequential program solving the same
        problem. Parallel programming is growing due to the widespread
        use of network of workstations (NOWs) or powerful PCs in
        high-performance computing. Because the hardware components are
        all commodity devices, NOWs are much more cost-effective than
        custom machines with similar technology. In this environment,
        the typical programming model used has been message-passing and
        the MPI library has become the standard in the
        distributed-memory computing model. On the other hand, visual
        programming environments try to simplify the task of developing
        applications. They provide programmers with several standard
        components for creating programs. Delphi constitutes one of the
        most popular visual programming environments nowadays in the
        Windows market place. In this paper, we present MPI-Delphi, an
        implementation of MPI for writing parallel applications using
        Delphi visual programming environment. We show how MPI-Delphi
        has been developed, and how it makes possible to manage a
        cluster of homogeneous/heterogeneous PCs. Two examples of use of
        MPI-Delphi in a heterogeneous cluster of workstations with a
        mixture of Windows and Linux operating systems are also
        included. The MPI-Delphi interface is suitable for some specific
        kinds of problems, such as monitoring parallel programs of long
        execution time, or computationally intensive graphical
        simulations. In addition, MPI-Delphi has proven to be a good tool
        for research, as the development of new algorithms can be
        carried out quickly and, therefore, time spent on the debugging
        of such algorithms is reduced. Finally, we conclude by
        explaining some of the tasks we think MPI-Delphi is suitable
                  for.}
}

@Article{thak02:mpi-impl,
  author = 	 {R. Thakur and W. Gropp and E. Lusk},
  title = 	 {Optimizing noncontiguous accesses in {MPI-IO}},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 1,
  pages =	 {83--105},
  month =	 JAN,
  abstract = {The I/O access patterns of many parallel applications consist of
        accesses to a large number of small, noncontiguous pieces of
        data. If an application's I/O needs are met by making many
        small, distinct I/O requests, however, the I/O performance
        degrades drastically. To avoid this problem, MPI-IO allows users
        to access noncontiguous data with a single I/O function call,
        unlike in Unix I/O. In this paper, we explain how critical this
        feature of MPI-IO is for high performance and how it enables
        implementations to perform optimizations. We first provide a
        classification of the different ways of expressing an
        application's I/O needs in MPI-IO - we classify them into four
        levels, called levels 0-3. We demonstrate that, for applications
        with noncontiguous access patterns, the I/O performance improves
        dramatically if users write their applications to make level-3
        requests (noncontiguous, collective) rather than level-0
        requests (Unix style). We then describe how our MPI-IO
        implementation, ROMIO, delivers high performance for
        noncontiguous requests. We explain in detail the two key
        optimizations ROMIO performs: data sieving for noncontiguous
        requests from one process and collective I/O for noncontiguous
        requests from multiple processes. We describe how we have
        implemented these optimizations portably on multiple machines
        and file systems, controlled their memory requirements, and also
        achieved high performance. We demonstrate the performance and
        portability with performance results for three applications - an
        astrophysics-application template (DIST3D), the NAS BTIO
        benchmark, and an unstructured code (UNSTRUC) - on five
        different parallel machines: HP Exemplar, IBM SP, Intel Paragon,
        NEC SX-4, and SGI Origin2000. }
}

@Article{hell02:mpi-impl,
  author = 	 {H. Hellwagner and M. Ohlenroth},
  title = 	 {{VI} architecture communication features and
                  performance on the {G}iganet cluster {LAN}},
  journal = 	 {Future Generation Computer Systems},
  year = 	 2002,
  volume =	 18,
  number =	 3,
  pages =	 {421--433},
  month =	 JAN,
  abstract = {The virtual interface (VI) architecture standard was developed
        to satisfy the need for a high throughput, low latency
        communication system required for cluster computing. VI
        architecture aims to close the performance gap between the
        bandwidths and latencies provided by the communication hardware
        and visible to the application, respectively, by minimizing the
        software overhead on the critical path of the communication.
        This paper presents the results of a performance study of one VI
        architecture hardware implementation, the Giganet cLAN (cluster
        LAN). The focus of the study is to assess and compare the
        performance of different VI architecture data transfer modes and
        specific features that are available to higher-level
        communication software like MPI in order to aid the implementor
        to decide which VI architecture options to employ for various
        communication scenarios. Examples of such options include the
        use of send/receive vs. RDMA data transfers, polling vs.
        blocking to check completion of communication operations,
        multiple VIs, completion queues and scatter capabilities of VI
        architecture. }
}

@Article{liLi01:mpi-app,
  author = 	 {Y. M. Li and J. L. Liu and T. S. Chao and S. M. Sze},
  title = 	 {A new parallel adaptive finite volume method for the
                  numerical simulation of semiconductor devices},
  journal = 	 {Computer Physics Communications},
  year = 	 2001,
  volume =	 142,
  number =	 {1--3},
  pages =	 {285--289},
  month =	 DEC,
  abstract = {Based on adaptive finite volume approximation, a posteriori
        error estimation, and monotone iteration, a novel system is
        proposed for parallel simulations of semiconductor devices. The
        system has two distinct parallel algorithms to perform a
        complete set of I-V simulations for any specific device model.
        The first algorithm is a domain decomposition on 1-irregular
        unstructured meshes whereas the second is a parallelization of
        multiple I-V points. Implemented on a Linux cluster using
        message passing interface libraries, both algorithms are shown
        to have excellent balances on dynamic loading and hence result
        in efficient speedup. Compared with measurement data,
        computational results of sub-micron MOSFET devices are given to
        demonstrate the accuracy and efficiency of the system. }
}

@Article{iovi01:mpi-app,
  author = 	 {M. Iovieno and C. Cavazzoni and D. Tordella},
  title = 	 {A new technique for a parallel dealiased
                  pseudospectral {N}avier-{S}tokes code},
  journal = 	 {Computer Physics Communications},
  year = 	 2001,
  volume =	 141,
  number =	 3,
  pages =	 {365--374},
  month =	 DEC,
  abstract = {A novel aspect of a parallel procedure for the numerical
        simulation of the solution of the Navier-Stokes equations
        through the Fourier-Galerkin pseudospectral method is presented.
        It consists of a dealiased ("3/2" rule) transposition of the
        data that organizes the computations in the distributed
        direction in such a way that whenever a Fast Fourier Transform
        must be calculated, the algorithm will employ data stored solely
        on the proper memory of the processor which is computing it.
        This provide for the employment of standard routines for the
        computations of the Fourier transform. The aliasing removal
        procedure has been directly inserted into the transposition
        algorithm. The code is written for distributed memory computers,
        but not specifically for a peculiar architecture. The use on a
        variety of machines is allowed by the adoption of the Message
        Passing Interface library. The portability of the code is
        demonstrated by the similar performances, in particular the high
        efficiency, that all the machines tested show up to a number of
        parallel processors equal to 1/2 the truncation parameter N/2.
        Explicit time integration is used. The present code organization
        is relevant to physical and mathematical problems which require
        a three dimensional spectral treatment.}
}

@Article{kepk01:mpi-app,
  author = 	 {A. Kepkep and U. Ravaioli and B. Winstead},
  title = 	 {Cluster-based parallel 3-{D} {M}onte {C}arlo device
                  simulation}, 
  journal = 	 {VLSI Design},
  year = 	 2001,
  volume =	 13,
  number =	 {1--4},
  pages =	 {51--56},
  abstract = {The recent improvements in the performance of commodity computer
        have created very favorable conditions for building high
        performance parallel machines from computer clusters. These are
        very attractive for 3-D device simulation, necessary to model
        properly carrier-carrier interaction and granular doping effects
        in deeply scaled silicon devices. We have developed a
        parallel 3-D Monte Carlo simulation environment customized for
        clusters using the Message Passing Library (MPI). The code has
        been tested on the supercluster of NCSA at the University of
        Illinois. We present here test results for an n-i-n diode
        structure, along with an analysis of performance for two
        different domain decomposition schemes.}
}

@Article{beck02:mpi-app,
  author = 	 {M. Be{\v{c}}ka and G. Ok{\v{s}}a and M. Vajter{\v{s}}ic},
  title = 	 {Dynamic ordering for a parallel block-{J}acobi {SVD} algorithm},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 2,
  pages =	 {243--262},
  month =	 FEB,
  abstract = {A new approach for the parallel computation of singular value
        decomposition (SVD) of matrix $A \in C^{m \times n}$ is
        proposed. Contrary to the known algorithms that use a static
        cyclic ordering of subproblems simultaneously solved in one
        iteration step, the proposed implementation of the two-sided
        block-Jacobi method uses a dynamic ordering of subproblems. The
        dynamic ordering takes into account the actual status of matrix
        A. In each iteration step, a set of the off-diagonal blocks is
        determined that reduces the Frobenius norm of the off-diagonal
        elements of A as much as possible and, at the same time, can be
        annihilated concurrently. The solution of this task is
        equivalent to the solution of the maximum-weight perfect
        matching problem. The greedy algorithm for the efficient
        solution of this problem is presented. The computational
        experiments with both types of ordering, incorporated into the
        two-sided block-Jacobi method, were performed on an SGI - Cray
        Origin 2000 parallel computer using the Message Passing
        Interface (MPI). The results confirm that the dynamic ordering
        is much more efficient with regard to the amount of work
        required for the computation of SVD of a given accuracy than the
        static cyclic ordering. }
}

@Article{lian02:mpi-app,
  author = 	 {Y. Liang and J. Weston and M. Szularz},
  title = 	 {Generalized least-squares polynomial preconditioners
                  for symmetric indefinite linear equations},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 2,
  pages =	 {323--341},
  month =	 FEB, 
  abstract = {Polynomial preconditioners are frequently used in a parallel
        environment for the computation of the solution of large-scale
        sparse linear equations (Ax = b) because of their easy
        implementation and trivial parallelization. With respect to
        symmetrical indefinite (SID) linear systems, the use of
        generalized least-squares (GLS) polynomial preconditioning is
        preferable to other polynomial preconditioning methods because
        of the ability to use a three-term recurrence relationship and
        the low implementation costs. The GLS preconditioning polynomial
        and its influence on the flexible generalized minimized residual
        (FGMRES) solver are discussed in this paper. The orthogonal
        polynomials required in the solution of the least-squares
        approximation problem are constructed using the Stieltjes
        procedure in multiple disjoint intervals which exclude the
        origin. The time-consuming numerical integration associated with
        this procedure is computed efficiently using Chebyshev
        polynomials of the first kind and the GLS polynomial
        preconditioned FGMRES algorithm is implemented using MPI in a
        highly parallel IBM SP2 environment. Experimental results using
        classical benchmark systems are presented and compared with
        those obtained using the recently developed SPAI preconditioned
        Bi-CGSTAB iterative method. The performance of the GLS
        preconditioned FGMRES solver is critically assessed.}
}

@Article{beka02:mpi-app,
  author = 	 {C. Bekas and E. Gallopoulos},
  title = 	 {Parallel computation of pseudospectra by fast descent},
  journal = 	 {Parallel Computing},
  year = 	 2002,
  volume =	 28,
  number =	 2,
  pages =	 {223--242},
  month =	 FEB,
  abstract = {The pseudospectrum descent method (PsDM) is proposed, a new
        parallel method for the computation of pseudospectra. The idea
        behind the method is to use points from an already existing
        pseudospectrum level curve partial derivative A(epsilon), to
        generate in parallel the points of a new level curve partial
        derivative A(delta) such that delta $<$ epsilon. This process can
        be continued for several steps to approximate several
        pseudospectrum level curves lying inside the original curve. It
        is showed via theoretical analysis and experimental evidence
        that PsDM is embarrassingly parallel, like GRID, and that it
        adjusts to the geometric characteristics of the pseudospectrum;
        in particular it captures disconnected components. Results
        obtained on a parallel system using MPI validate the theoretical
        analysis and demonstrate interesting load-balancing issues. }
}

@Article{jian02:mpi-app,
  author = 	 {D. Jiang and W. Meleis and M. El-Shenawee and
                  E. Mizan and A. Ashouei and C. Rappaport},
  title = 	 {Parallel implementation of the steepest descent fast
                  multipole method ({SDFMM}) on a {B}eowulf cluster
                  for subsurface sensing applications}, 
  journal = 	 {IEEE Microwave and Wireless Components Letters},
  year = 	 2002,
  volume =	 12,
  number =	 1,
  pages =	 {24--26},
  month =	 JAN,
  abstract = {We present the parallel, MPI-based implementation of the SDFMM
        computer code using a thirty two-node Intel Pentium-based
        Beowulf cluster. The SDFMM is a fast algorithm that is a
        hybridization of the method of moments (MoMs), the fast
        multipole method (FMM), and the steepest descent integration
        path (SDP), which is used to solve large-scale linear systems of
        equations produced in electromagnetic scattering problems. An
        overall speedup of 7.2 has been achieved on the 32-processor
        Beowulf cluster and a significant reduced runtime is achieved on
        the 4-processor 667 MHz Alpha workstation.}
}

 
% Dehne et al., parallel data cube construction (MPI + LEDA, C++).
@Article{dehn02:mpi-app,
  author   = {F. Dehne and T. Eavis and S. Hambrusch
              and A. Rau-Chaplin},
  title    = {Parallelizing the data cube},
  journal  = {Distributed and Parallel Databases},
  year     = {2002},
  volume   = {11},
  number   = {2},
  pages    = {181--201},
  month    = MAR,
  abstract = {We have implemented our parallel top-down data cube
              construction method in C++ with the MPI message passing
              library for communication and the LEDA library for the
              required graph algorithms. We tested our code on an
              eight processor cluster, using a variety of different
              data sets with a range of sizes, dimensions, density,
              and skew. Comparison tests were performed on a SunFire
              6800. The tests show that our partitioning strategies
              generate a close to optimal load balance between
              processors. The actual run times observed show an
              optimal speedup of p.}
}

@Article{dewa02:mpi-app,
  author = 	 {Y. K. Dewaraja and M. Ljungberg and A. Majumdar and
                  A. Bose and K. F. Koral},
  title = 	 {A parallel {M}onte {C}arlo code for planar and
                  {SPECT} imaging: implementation, verification and
                  applications in {I-131 SPECT}}, 
  journal = 	 {Computer Methods and Programs in Biomedicine},
  year = 	 2002,
  volume =	 67,
  number =	 2,
  pages =	 {115--124},
  month =	 FEB,
  abstract = {This paper reports the implementation of the SIMIND Monte Carlo
        code on an IBM SP2 distributed memory parallel computer. Basic
        aspects of running Monte Carlo particle transport calculations
        on parallel architectures are described. Our parallelization is
        based on equally partitioning photons among the processors and
        uses the Message Passing Interface (MPI) library for
        interprocessor communication and the Scalable Parallel Random
        Number Generator (SPRNG) to generate uncorrelated random number
        streams. These parallelization techniques are also applicable to
        other distributed memory architectures. A linear increase in
        computing speed with the number of processors is demonstrated
        for up to 32 processors. This speed-up is especially
        significant in Single Photon Emission Computed Tomography
        (SPECT) simulations involving higher energy photon emitters,
        where explicit modeling of the phantom and collimator is
        required. For I-131, the accuracy of the parallel code is
        demonstrated by comparing simulated and experimental SPECT
        images from a heart/thorax phantom. Clinically realistic SPECT
        simulations using the voxel-man phantom are carried out to assess
        scatter and attenuation correction. }
}

@Article{slot02:mpi-app,
  author = 	 {J. Slottow and A. Shahriari and M. Stein and X. Chen
                  and C. Thomas and P. B. Ender},
  title = 	 {Instrumenting and tuning {dataView}---a networked
                  application for navigating through large scientific
                  datasets}, 
  journal = 	 {Software---Practice \& Experience},
  year = 	 2002,
  volume =	 32,
  number =	 2,
  pages =	 {165--190},
  month =	 FEB,
  abstract = {This paper describes how we instrumented and tuned the code for
        improved performance in a networked environment. We report on
        how we measured network performance, first by inducing network
        delay and then by running the dataView client component in
        Washington DC and the compute components in Los Angeles. We
        report on the effect that tile size, level of detail, and client
        CPU speed have on performance. We analyze what happens when the
        geometry computation is performed in parallel using MPI (Message
        Passing Interface) vs. in serial, and discuss the effect on
        performance of adding additional computational nodes. }
}

% Compares message passing (MPI), SHMEM and CC-SAS programming models for
% adaptive applications on an SGI Origin2000.
@Article{shan02:mpi-openmp,
  author   = {H. Z. Shan and J. P. Singh and L. Oliker and R. Biswas},
  title    = {A comparison of three programming models for
              adaptive applications on the {Origin2000}},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 2002,
  volume   = 62,
  number   = 2,
  pages    = {241--266},
  month    = feb,
  abstract = {Adaptive applications have computational workloads and
        communication patterns that change unpredictably at runtime,
        requiring dynamic load balancing to achieve scalable performance
        on parallel machines. Efficient parallel implementations of such
        adaptive applications is therefore a challenging task. In this
        paper, we compare the performance of and the programming effort
        required for two major classes of adaptive applications under
        three leading parallel programming models on an SGI Origin2000
        system, a machine that supports all three models efficiently.
        Results indicate that the three models deliver comparable
        performance; however, the implementations differ significantly
        beyond merely using explicit messages versus implicit
        loads/stores even though the basic parallel algorithms are
        similar. Compared with the message-passing (using MPI) and SHMEM
        programming models, the cache-coherent shared address space
        (CC-SAS) model provides substantial ease of programming at both
        the conceptual and program orchestration levels, often
        accompanied by performance gains. However, CC-SAS currently has
        portability limitations and may suffer from poor spatial locality
        of physically distributed shared data on large numbers of
        processors.}
}

% Parallel Monte Carlo solver with relaxation for linear systems, implemented
% with MPI on a cluster of workstations.
@Article{tan02:mpi-app,
  author   = {C. J. K. Tan},
  title    = {Solving systems of linear equations with relaxed
              {Monte} {Carlo} method},
  journal  = {Journal of Supercomputing},
  year     = 2002,
  volume   = 22,
  number   = 1,
  pages    = {111--123},
  month    = may,
  abstract = {The problem of solving systems of linear algebraic equations by
        parallel Monte Carlo numerical methods is considered. A parallel
        Monte Carlo method with relaxation is presented. This is a
        report of a research in progress, showing the effectiveness of
        this algorithm. Theoretical justification of this algorithm and
        numerical experiments are presented. The algorithms were
        implemented on a cluster of workstations using MPI.}
}
 
% Distributed evolutionary graph generation (DEGG) built on MPI, applied to
% the design of constant-coefficient multipliers on a PC cluster.
@Article{chen01:mpi-app,
  author   = {D. Chen and T. Aoki and N. Homma and T. Higuchi},
  title    = {Pragmatic method for the design of fast
              constant-coefficient combinational multipliers},
  journal  = {IEE Proceedings-Computers and Digital Techniques},
  year     = 2001,
  volume   = 148,
  number   = 6,
  pages    = {196--206},
  month    = nov,
  abstract = {To characterise and analyse the performance of evolutionary
        graph generation (EGG) on a cluster of PCs, a parallel version
        of the EGG system, called the distributed EGG (DEGG) system, has
        been developed using a message-passing interface (MPI). To
        demonstrate the capability of DEGG, it is applied to find the
        optimal design of various multipliers. Experimental results
        substantially clarify that the DEGG system consistently performs
        better than the EGG system. Moreover, the ability and solution
        quality of the DEGG system's search can be further enhanced by
        the use of the self-adaptation mechanism of operator
        probabilities.}
}

% Particle-mesh simulation of protoplanetary disks on an Origin 2000;
% compares PVM and MPI message-passing performance.
@Article{marc02:mpi-app,
  author   = {de la Fuente Marcos, C. and Barge, P. and
              de la Fuente Marcos, R.},
  title    = {Dust dynamics in protoplanetary disks: Parallel
              computing with {PVM}},
  journal  = {Journal of Computational Physics},
  year     = 2002,
  volume   = 176,
  number   = 2,
  pages    = {276--294},
  month    = mar,
  abstract = {We describe a parallel version of our high-order-accuracy
        particle-mesh code for the simulation of collisionless
        protoplanetary disks. We use this code to carry out a massively
        parallel, two-dimensional, time-dependent, numerical simulation,
        which includes dust particles, to study the potential role of
        large-scale, gaseous vortices in protoplanetary disks. This
        noncollisional problem is easy to parallelize on
        message-passing multicomputer architectures. We performed the
        simulations on a cache-coherent nonuniform memory access Origin
        2000 machine, using both the parallel virtual machine (PVM) and
        message-passing interface (MPI) message-passing libraries. Our
        performance analysis suggests that, for our problem, PVM is
        about 25\% faster than MPI. Using PVM and MPI made it possible
        to reduce CPU time and increase code performance. This allows
        for simulations with a large number of particles ($N \sim
        10^{5}$--$10^{6}$) in reasonable CPU times. The performances of our
        implementation of the parallel code on an Origin 2000
        supercomputer are presented and discussed. They exhibit very
        good speedup behavior and low load unbalancing. Our results
        confirm that giant gaseous vortices can play a dominant role in
        giant planet formation.}
}

% Parallel Euler-equation computation of ducted-fan noise using domain
% decomposition and the message passing interface library.
@Article{ozyo02:mpi-app,
  author   = {Y. {\"O}zy{\"o}r{\"u}k},
  title    = {Parallel computation of forward radiated noise of
              ducted fans with acoustic treatment},
  journal  = {AIAA Journal},
  year     = 2002,
  volume   = 40,
  number   = 3,
  pages    = {450--455},
  month    = mar,
  abstract = {Forward radiated noise of ducted fans is computed numerically on
        parallel processors solving the three-dimensional,
        time-dependent Euler equations in body-conformed coordinates
        with a fourth-order-accurate, finite-difference, Runge-Kutta
        time-integration scheme. Sound attenuation effects of inlet wall
        acoustic treatment are included in computations employing a
        time-discrete form of the standard impedance condition. A
        distributed computing approach with domain decomposition is used
        for integrating the equations in parallel using the message
        passing interface library routines. The abilities of the method
        are demonstrated with hard- and soft-wall simulations of the
        JT15D inlet, including flow effects.}
}

% Low-latency MPI for Meiko CS/2 and ATM clusters (IPPS'97); the annotation
% notes an overview of existing MPI implementations and a DMA-based design.
@InProceedings{Jones97,
  author    = {Chris R. Jones and Ambuj K. Singh and Divyakant Agrawal},
  title     = {Low Latency {MPI} for Meiko {CS}/2 and {ATM} Clusters},
  booktitle = {Proceedings of the 11th International Parallel
               Processing Symposium (IPPS'97)},
  publisher = {The Institute of Electrical and Electronics Engineers},
  address   = {Geneva, Switzerland},
  year      = {1997},
  month     = apr,
  keywords  = {CD-ROM, I/O and Message Passing,},
  abstract  = {Contains a good overview of existing MPI
               implementations. Uses a Direct Memory Access method. In
               order to minimize latency: overlap the transfer of data
               and send envelope. And this only if the message size is
               above a certain threshold. First, sending and match
               envelopes, then DMA.},
}

% Native ATM API testbed for cluster-based computing (IPPS'96).
@InProceedings{Dowd96,
  author =       "P. W. Dowd and T. M. Carrozzi and F. A. Pellegrino and
                 A. X. Chen",
  title =        "Native {ATM} Application Programmer Interface Testbed
                 for Cluster-Based Computing",
  booktitle =    "Proc. 10th Int. Parallel Processing Symp. (IPPS'96)
                 CD-ROM",
  publisher =    "IEEE",
  address =      "Honolulu, HI",
  month =        apr,
  year =         "1996",
  keywords =     "Clusters and Domain Decomposition,",
}

% Message-passing application development on MPICH under Ensemble.
% NOTE(review): the ending page is recorded as unknown ("145--??"); confirm
% against the published volume.
@Article{Cotronis:1998:DMA,
  author          = {Y. Cotronis},
  title           = {Developing Message-Passing Applications on {MPICH}
                     under Ensemble},
  journal         = lncs,
  year            = {1998},
  volume          = {1497},
  pages           = {145--??},
  ISSN            = {0302-9743},
  coden           = {LNCSD9},
  bibdate         = {Tue Jan 5 08:21:58 MST 1999},
  acknowledgement = ack-nhfb,
}

% MPICH-GQ: quality-of-service support for message-passing programs (SC2000).
@InProceedings{Roy:2000:MGQ,
  author =       "Alain J. Roy and Ian Foster and William Gropp and
                 Nicholas Karonis and Volker Sander and Brian Toonen",
  title =        "{MPICH-GQ}: Quality-of-Service for Message Passing
                 Programs",
  editor =       "{ACM}",
  booktitle =    "{SC2000}: High Performance Networking and Computing.
                 Dallas Convention Center, Dallas, {TX}, {USA}, November
                 4--10, 2000",
  publisher =    "ACM Press and IEEE Computer Society Press",
  address =      "New York, NY 10036, USA and 1109 Spring Street, Suite
                 300, Silver Spring, MD 20910, USA",
  pages =        "54",
  year =         "2000",
  bibdate =      "Mon Feb 12 11:57:43 2001",
  url =          "http://www.sc2000.org/proceedings/techpapr/papers/pap234.pdf",
  acknowledgement = ack-nhfb,
}


% MPICH/Madeleine: a multi-protocol MPI for high-performance networks (IPDPS-01).
@InProceedings{IPDPS01*51,
  author =       "Olivier Aumage and Guillaume Mercier and Raymond
                 Namyst",
  title =        "{MPICH/Madeleine}: a True {Multi-Protocol} {MPI} for
                 High Performance Networks",
  pages =        "51",
  booktitle =    "Proceedings of the 15th International Parallel \&
                 Distributed Processing Symposium ({IPDPS}-01)",
  month =        apr # "~23--27",
  publisher =    "IEEE Computer Society",
  address =      "Los Alamitos, CA",
  year =         "2001",
}

% Performance comparison of LAM/MPI, MPICH and MVICH on a Gigabit Ethernet
% Linux cluster.
@InProceedings{LINUX-00*353,
  author =       "Hong Ong and Paul A. Farrell",
  title =        "Performance Comparison of {LAM/MPI}, {MPICH}, and
                 {MVICH} on a Linux Cluster Connected by a Gigabit
                 Ethernet Network",
  pages =        "353--362",
  booktitle =    "Proceedings of the 4th Annual Showcase \& Conference
                 ({LINUX}-00)",
  month =        oct # "~10--14",
  publisher =    "The USENIX Association",
  address =      "Berkeley, CA",
  year =         "2000",
}

% Case study on disseminating MPICH as a portable parallel computing environment.
@Article{Gropp:1997:SMC,
  author          = {W. Gropp and E. Lusk},
  title           = {Sowing {MPICH}: {A} Case Study in the Dissemination of
                     a Portable Environment for Parallel Scientific
                     Computing},
  journal         = {The International Journal of Supercomputer
                     Applications and High Performance Computing},
  year            = {1997},
  month           = {Summer},
  volume          = {11},
  number          = {2},
  pages           = {103--114},
  ISSN            = {1078-3482},
  coden           = {IJSCFG},
  bibdate         = {Thu Jun 26 18:17:48 1997},
  acknowledgement = ack-nhfb,
}

% MPI implementation on a shared-memory vector supercomputer (keywords cite
% the NEC SX-4 and MPICH).
@Article{Gropp97,
  author   = {William Gropp and Ewing Lusk},
  title    = {A high-performance {MPI} implementation on a
              shared-memory vector supercomputer},
  journal  = {Parallel Computing},
  year     = {1997},
  month    = jan,
  volume   = {22},
  number   = {11},
  pages    = {1513--1526},
  keywords = {practical aspects/experiences; message-passing
              interface; shared memory multiprocessor; NEC SX-4;
              MPICH; performance; implementation;},
}

% MPICH-G: a grid-enabled MPI built on MPICH and the Globus toolkit (SC'98).
@InProceedings{Foster98a,
  author    = {Ian Foster and Nicholas T. Karonis},
  title     = {A Grid-Enabled {MPI}: Message Passing in Heterogeneous
               Distributed Computing Systems},
  booktitle = {Proceedings of Supercomputing'98 (CD-ROM)},
  publisher = {ACM SIGARCH and IEEE},
  address   = {Orlando, FL},
  year      = {1998},
  month     = nov,
  note      = {Argonne National Laboratory},
  keywords  = {metacomputing, Message Passing Interface, MPI, Globus,
               computational grids, metacomputing, MPICH,},
  abstract  = {Application development for high-performance
               distributed computing systems, or computational grids
               as they are sometimes called, requires ``grid-enabled''
               tools that hide mundane aspects of the heterogeneous
               grid environment without compromising performance. As
               part of an investigation of these issues, we have
               developed MPICH-G, a grid-enabled implementation of the
               Message Passing Interface (MPI) that allows a user to
               run MPI programs across multiple computers at different
               sites using the same commands that would be used on a
               parallel computer. This library extends the Argonne
               MPICH implementation of MPI to use services provided by
               the Globus grid toolkit. In this paper, we describe the
               MPICH-G implementation and present preliminary
               performance results.},
}

% Benchmarks TCP/IP against MPI (MPICH) on FDDI, Fast Ethernet and Ethernet.
@TechReport{Nog96,
  author      = {Saurab Nog and David Kotz},
  title       = {A Performance Comparison of {TCP}/{IP} and {MPI} on
                 {FDDI}, Fast Ethernet, and Ethernet},
  institution = {Dartmouth},
  number      = {PCS-TR95-273},
  year        = {1996},
  month       = jan,
  url         = {http://www.cs.dartmouth.edu/reports/abstracts/PCS-TR95-273.html},
  keywords    = {latency / bandwidth measurements for MPI/Ethernet,},
  abstract    = {Communication is a very important factor affecting
                 distributed applications. Getting a close handle on
                 network performance (both bandwidth and latency) is
                 thus crucial to understanding overall application
                 performance. We benchmarked some of the metrics of
                 network performance using two sets of experiments,
                 namely roundtrip and datahose. The tests were designed
                 to measure a combination of network latency, bandwidth,
                 and contention. We repeated the tests for two protocols
                 (TCP/IP and MPI) and three networks (100 Mbit FDDI
                 (Fiber Distributed Data Interface), 100 Mbit Fast
                 Ethernet, and 10 Mbit Ethernet). The performance
                 results provided interesting insights into the behavior
                 of these networks under different load conditions and
                 the software overheads associated with an MPI
                 implementation (MPICH). This document presents details
                 about the experiments, their results, and our analysis
                 of the performance.\par Revised on 1/8/96 to emphasize
                 our use of a particular MPI implementation, MPICH.},
}

% MultiMATLAB: running MATLAB on multiple processors on top of MPICH.
@TechReport{ncstrl.cornell.tc//96-239,
  type =         "Technical Report",
  number =       "96-239",
  title =        "Multi{MATLAB}: {MATLAB} on Multiple Processors",
  language =     "English",
  month =        may,
  notes =        "PostScript",
  pages =        "16",
  year =         "1996",
  bibdate =      "May 30, 1996",
  author =       "Anne E. Trefethen and Vijay S. Menon and Chi-Chao
                 Chang and Grzegorz J. Czajkowski and Chris Myers and
                 Lloyd N. Trefethen",
  abstract =     "MATLAB(R), a commercial product of The MathWorks,
                 Inc., has become one of the principal languages of
                 desktop scientific computing. A system is described
                 that enables one to run MATLAB conveniently on multiple
                 processors. Using short, MATLAB-style commands like
                 Eval, Send, Recv, Bcast, Min, and Sum, the user
                 operating within one MATLAB session can start various
                 processes in a fashion that maintains MATLAB's
                 traditional user-friendliness. Multi-processor graphics
                 is also supported. The system currently runs under
                 MPICH on an IBM SP2 or a network of Unix workstations,
                 and extensions are planned to networks of PCs.
                 MultiMATLAB is potentially useful for education in
                 parallel programming, for prototyping parallel
                 algorithms, and for fast and convenient execution of
                 easily parallelizable numerical computations on
                 multiple processors.",
  keywords =     "MATLAB, MultiMATLAB, SP2, message passing, MPI, MPICH",
  institution =  "Cornell Theory Center",
}

% MICE: a prototype MPI implementation in the Converse environment
% (Second MPI Developer's Conference, 1996).
@InProceedings{Bhandarkar:1996:MPM,
  author          = {M. A. Bhandarkar and L. V. Kale},
  title           = {{MICE}: a prototype {MPI} implementation in {Converse}
                     environment},
  editor          = {{IEEE}},
  booktitle       = {Proceedings. Second {MPI} Developer's Conference:
                     Notre Dame, {IN}, {USA}, 1--2 July 1996},
  publisher       = {IEEE Computer Society Press},
  address         = {1109 Spring Street, Suite 300, Silver Spring, MD
                     20910, USA},
  year            = {1996},
  pages           = {26--31},
  ISBN            = {0-8186-7533-0},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  acknowledgement = ack-nhfb,
  classification  = {C6110P (Parallel programming); C6115 (Programming
                     support); C6150E (General utility programs); C6150N
                     (Distributed systems software)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
                     USA},
  keywords        = {Abstract Device Interface; application program
                     interfaces; communication; computations; Converse
                     interoperable parallel programming environment; message
                     managers; message passing; MICE; MPI modules; MPICH;
                     multi-threaded MPI programs; open systems; parallel
                     programming; programming environments; prototype MPI
                     implementation; public-domain MPI implementation; PVM
                     interoperation; thread objects; utility programs},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed
                     Process},
  treatment       = {P Practical},
}

% iCC collective communication library on the Intel Paragon, compared against
% Intel NX, MPICH and a BLACS implementation.
@TechReport{UTEXAS_CS//CS-TR-95-22,
  author      = {Prasenjit Mitra and David Payne and Lance Shuler and
                 Robert van de Geijn and Jerrell Watts},
  title       = {Fast Collective Communication Libraries, Please},
  institution = {University of Texas, Austin},
  type        = {Technical Report},
  number      = {CS-TR-95-22},
  year        = {1995},
  month       = jun # " 1,",
  url         = {ftp://ftp.cs.utexas.edu/pub/techreports/tr95-22.ps.Z},
  bibdate     = {November 24, 98},
  abstract    = {It has been recognized that many parallel numerical
                 algorithms can be effectively implemented by
                 formulating the required communication as collective
                 communications. Nonetheless, the efficiency of such
                 communications has been suboptimal in many
                 communication library implementations. In this paper,
                 we give a brief overview of techniques that can be used
                 to implement a high performance collective
                 communication library, the iCC library, developed for
                 the Intel family of parallel supercomputers as part of
                 the InterCom project at the University of Texas at
                 Austin. We compare the achieved performance on the
                 Intel Paragon to those of three widely available
                 libraries: Intel's NX collective communication library,
                 the MPICH Message Passing Interface (MPI)
                 implementation developed at Argonne and Mississippi
                 State University and a Basic Linear Algebra
                 Communication Subprograms (BLACS) implementation,
                 developed at the University of Tennessee.},
}

% Thread taxonomy for MPI (Second MPI Developer's Conference, 1996).
@InProceedings{Skjellum:1996:TTM,
  author          = {A. Skjellum and B. Protopopov and S. Hebert},
  title           = {A thread taxonomy for {MPI}},
  editor          = {{IEEE}},
  booktitle       = {Proceedings. Second {MPI} Developer's Conference:
                     Notre Dame, {IN}, {USA}, 1--2 July 1996},
  publisher       = {IEEE Computer Society Press},
  address         = {1109 Spring Street, Suite 300, Silver Spring, MD
                     20910, USA},
  year            = {1996},
  pages           = {50--57},
  ISBN            = {0-8186-7533-0},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  acknowledgement = ack-nhfb,
  classification  = {C6110B (Software engineering techniques); C6110F
                     (Formal methods); C6150E (General utility programs);
                     C6150J (Operating systems); C6150N (Distributed systems
                     software)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Dept. of Comput. Sci., Mississippi State Univ., MS,
                     USA},
  keywords        = {API extensions; application program interfaces;
                     Channel Device; computational unit; fine-grain
                     concurrency; formal specification; message passing;
                     minimal portable thread management; MPI; MPICH;
                     multi-threaded thread-safe ADI; non-thread-safe MPI
                     call semantics; resource container; software
                     portability; synchronisation; synchronization
                     mechanisms; thread models; thread safety; thread
                     taxonomy; user-level mechanism; utility programs;
                     Windows NT version},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed
                     Process},
  treatment       = {P Practical},
}

% Multimethod communication in networked computing systems; keywords cite a
% Nexus-based MPI implementation and the Argonne MPICH library.
@Article{Foster:1997:MMC,
  author          = {Ian Foster and Jonathan Geisler and Carl Kesselman and
                     Steven Tuecke},
  title           = {Managing Multiple Communication Methods in
                     High-Performance Networked Computing Systems},
  journal         = {Journal of Parallel and Distributed Computing},
  year            = {1997},
  month           = jan,
  day             = {10},
  volume          = {40},
  number          = {1},
  pages           = {35--48},
  doi             = {10.1006/jpdc.1996.1266},
  ISSN            = {0743-7315},
  coden           = {JPDCER},
  url             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  acknowledgement = ack-nhfb,
  classification  = {B6150M (Protocols); B6210L (Computer
                     communications); C5440 (Multiprocessing systems); C5470
                     (Performance evaluation and testing); C5640
                     (Protocols); C5670 (Network performance)},
  corpsource      = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                     USA},
  keywords        = {Argonne MPICH library; computer networks; computing
                     systems; criteria; heterogeneous networked environment;
                     high-performance networked; message passing; message
                     passing interface; multimethod communication; multiple
                     communication methods; multithreaded runtime system;
                     networked computing environments; Nexus; Nexus-based
                     MPI implementation; performance characteristics;
                     performance evaluation; protocols; remote service
                     request mechanisms; transport mechanisms;
                     user-specified selection},
  treatment       = {P Practical},
}

% Wide-area, multimethod MPI implementation used on the I-WAY
% (Second MPI Developer's Conference, 1996).
@InProceedings{Foster:1996:MIW,
  author =       "I. Foster and J. Geisler and S. Tuecke",
  title =        "{MPI} on the {I-WAY}: a wide-area, multimethod
                 implementation of the {Message Passing Interface}",
  editor =       "{IEEE}",
  booktitle =    "Proceedings. Second {MPI} Developer's Conference:
                 Notre Dame, {IN}, {USA}, 1--2 July 1996",
  publisher =    "IEEE Computer Society Press",
  address =      "1109 Spring Street, Suite 300, Silver Spring, MD
                 20910, USA",
  year =         "1996",
  ISBN =         "0-8186-7533-0",
  pages =        "10--17",
  bibdate =      "Sat Apr 19 16:34:54 MDT 1997",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6110B (Software
                 engineering techniques); C6115 (Programming support);
                 C6130S (Data security); C6150E (General utility
                 programs); C6150N (Distributed systems software)",
  conftitle =    "Proceedings. Second MPI Developer's Conference",
  corpsource =   "Argonne Nat. Lab., IL, USA",
  keywords =     "application program interfaces; authentication;
                 automatic configuration mechanisms; communication
                 mechanisms; geographically distributed computing
                 resources; geographically distributed database
                 resources; geographically distributed graphics
                 resources; geographically distributed networking;
                 heterogeneous systems; high-speed wide-area networks;
                 I-WAY distributed-computing experiment; message
                 authentication; message passing; Message Passing
                 Interface; MPICH; Nexus multithreaded runtime system;
                 parallel programming; portable high-performance
                 programming model; process creation; programming
                 environments; software environment; software libraries;
                 utility programs; wide area networks",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed
                 Process",
  treatment =    "P Practical",
}

% Cactus numerical-relativity simulations distributed across supercomputers
% using MPICH-G and the Globus toolkit.
@InProceedings{EVL-1999-99,
  year =         "1999",
  title =        "Numerical Relativity in a Distributed Environment",
  author =       "W. Benger and I. Foster and J. Novotny and E. Seidel
                 and J. Shalf and W. Smith and P. Walker",
  url =          "http://visinfo.zib.de/EVlib/Show?EVL-1999-99",
  abstract =     "The Cactus parallel simulation framework provides a
                 modular and extensible set of components for solving
                 relativity problems on parallel computers. In recent
                 work, we have investigated techniques that would enable
                 the execution of Cactus applications in wide area
                 ``computational grid'' environments. In a first
                 study, we investigated the feasibility of distributing
                 a single simulation across multiple supercomputers,
                 while in a second we studied techniques for reducing
                 communication costs associated with remote
                 visualization and steering. Distributed simulation was
                 achieved by using MPICH-G, an implementation of the
                 Message Passing Interface standard that uses mechanisms
                 provided by the Globus grid toolkit to enable wide area
                 execution. Experiments were performed across SGI
                 Origins and Cray T3Es with geographical separations
                 ranging from hundreds of thousands of kilometres. Total
                 execution time when distributed increased by between
                 18\% and 133\%, depending on configuration. We view these
                 results as encouraging as they were obtained with
                 essentially no specialized algorithmic structures in
                 the Cactus application. Work on remote visualization
                 focused on the development of a Cactus module that
                 computes isosurfaces inline with numerical relativity
                 calculations. Experiments demonstrated that this
                 technique can reduce network bandwidth requirements by
                 a factor ranging from 2.5 to 114, depending on the
                 nature of the problem.",
  month =        mar,
  booktitle =    "Proceedings of the Ninth SIAM Conference on Parallel
                 Processing for Scientific Computing",
}

% Journal article on the high-performance, portable MPICH implementation
% of the MPI standard.
@Article{Gropp:1996:HPP,
  author =       "William Gropp and Ewing Lusk and Nathan Doss and
                 Anthony Skjellum",
  title =        "High-performance, portable implementation of the {MPI}
                 {Message Passing Interface Standard}",
  journal =      "Parallel Computing",
  volume =       "22",
  number =       "6",
  pages =        "789--828",
  day =          "20",
  month =        sep,
  year =         "1996",
  coden =        "PACOEJ",
  ISSN =         "0167-8191",
  bibdate =      "Fri Aug 6 10:15:01 MDT 1999",
  url =          "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=6&aid=1075",
  acknowledgement = ack-nhfb,
  affiliation =  "Argonne Natl Lab",
  affiliationaddress = "Argonne, IL, USA",
  classification = "722.2; 722.4; 723; 723.1; 723.2; 902.2; C6110B
                 (Software engineering techniques); C6110P (Parallel
                 programming); C6115 (Programming support); C6150N
                 (Distributed systems software)",
  corpsource =   "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                 USA",
  journalabr =   "Parallel Comput",
  keywords =     "applications; Computer programming; Computer software
                 portability; Data communication systems; design goal;
                 distribution; environments; free; future developments;
                 high-performance portable implementation; Interfaces
                 (computer); library writers; message passing; Message
                 passing interface; MPI message; MPI-2; MPICH; parallel
                 computer vendors; Parallel processing systems; parallel
                 programming; Parallel programming environment; passing
                 interface standard; portable parallel programming
                 environment; programming; project management; software
                 libraries; software performance evaluation; software
                 portability; software standards; software tools;
                 specialists; specification; standard library;
                 Standards",
  treatment =    "P Practical",
}

% Husbands and Hoe (Supercomputing'98): MPI-StarT, an MPICH-derived MPI
% implementation for clusters of SMPs connected through StarT-X.
% The product name is braced as a whole word so that sentence-casing styles
% cannot lowercase the "S" in "StarT".
@InProceedings{Husbands98,
  author =       "Parry J. Husbands and James C. Hoe",
  title =        "{MPI-StarT}: Delivering Network Performance to
                 Numerical Applications",
  booktitle =    "Proceedings of Supercomputing'98 (CD-ROM)",
  publisher =    "ACM SIGARCH and IEEE",
  address =      "Orlando, FL",
  month =        nov,
  year =         "1998",
  keywords =     "networks, MPI, MPICH, MITMatlab, StarT-X, performance,
                 clustering, SMP",
  abstract =     "We describe an MPI implementation for a cluster of
                 SMPs interconnected by a high-performance interconnect.
                 This work is a collaboration between a numerical
                 applications programmer and a cluster interconnect
                 architect. The collaboration started with the modest
                 goal of satisfying the communication needs of a
                 specific numerical application, MITMatlab. However, by
                 supporting the MPI standard MPI-StarT readily extends
                 support to a host of applications. MPI-StarT is derived
                 from MPICH by developing a custom implementation of the
                 Channel Interface. Some changes in MPICH's ADI and
                 Protocol Layers are also necessary for correct and
                 optimal operation.\par MPI-StarT relies on the host
                 SMPs' shared memory mechanism for intra-SMP
                 communication. Inter-SMP communication is supported
                 through StarT-X. The StarT-X NIU allows a cluster of
                 PCI-equipped host platforms to communicate over the
                 Arctic Switch Fabric. Currently, StarT-X is utilized by
                 a cluster of SUN E5000 SMPs as well as a cluster of
                 Intel Pentium-II workstations. On a SUN E5000 with
                 StarT-X, a processor can send and receive a 64-byte
                 message in less than 0.4 and 3.5 usec respectively and
                 incur less than 5.6 usec user-to-user one-way latency.
                 StarT-X's remote memory-to-memory DMA mechanism can
                 transfer large data blocks at 60 MByte/sec between SUN
                 E5000s.\par This paper outlines our effort to preserve
                 and deliver this level of communication performance
                 through MPI-StarT to user applications. We have studied
                 the requirements of MITMatlab and the capabilities of
                 StarT-X and have formulated an implementation strategy
                 for the Channel Interface. In this paper, we discuss
                 some performance and correctness issues and their
                 resolutions in MPI-StarT. The correctness issues range
                 from the handling of arbitrarily large message sizes to
                 deadlock-free support of nonblocking MPI operations.
                 Performance optimizations include a shared-memory-based
                 transport mechanism for intra-SMP communication and a
                 broadcast mechanism that is aware of the performance
                 difference between intra-SMP and the slower inter-SMP
                 communication.\par We characterize the performance of
                 MPI-StarT on a cluster of SUN E5000s. On SUN E5000s,
                 MPI processes within the same SMP can communicate at
                 over 150 MByte/sec using shared memory. When
                 communicating between SMPs over StarT-X, MPI-StarT has
                 a peak bandwidth of 56 MByte/sec. While fine-tuning of
                 MPI-StarT is ongoing, we demonstrate that MPI-StarT is
                 effective in enabling the speedup of MITMatlab on a
                 cluster of SMPs by reporting on the performance of some
                 representative numerical operations.",
  note =         "Massachusetts Institute of Technology",
}

% Adamo (Cornell Theory Center TR 95-228): ARCH, a C++ library for
% asynchronous and loosely synchronous programming that uses MPI as its
% communication interface.
% The abstract's HTML list/paragraph markup is expressed as LaTeX (itemize,
% \par) so that it typesets instead of printing raw angle-bracket tags.
@TechReport{ncstrl.cornell.tc//95-228,
  type =         "Technical Report",
  number =       "95-228",
  title =        "{ARCH}, An Object-Oriented Library for Asynchronous
                 and Loosely Synchronous System Programming",
  language =     "English",
  month =        dec,
  notes =        "PostScript",
  pages =        "144",
  year =         "1995",
  bibdate =      "December 28, 1995",
  author =       "Jean-Marc Adamo",
  abstract =     "ARCH is a C++-based library for asynchronous and
                 loosely synchronous system programming. The current
                 version offers a set of programming constructs that are
                 outlined below: \begin{itemize} \item Threads. The
                 construct is
                 presented as a class from which the user can derive his
                 own classes. The class encapsulates a small set of
                 status variables and offers a set of functions for
                 declaration, initialization, scheduling, priority
                 setting, yielding and stopping. \item Processes. A
                 process is a more regular and structured programming
                 construct whose scheduling and termination obey
                 additional synchronization rules. Together with the
                 synchronous point-to-point communication system offered
                 in the library (see below), processes favor a parallel
                 programming style similar to OCCAM's (actually, an
                 extension of it that removes most static features and
                 allows processes to share data). The semantics of this
                 model is well understood and will undoubtedly
                 facilitate the development of correct large
                 asynchronous code. The library has been designed so
                 that the C++ compiler is able to check the static
                 semantics of programs (complete type checking,
                 send-recv correct matching, ...). \item Synchronous
                 communication. Threads and processes synchronize and
                 communicate via communication channels. There are four
                 types of communication channels for local or remote
                 synchronization or synchronous point-to-point
                 communication. Inter-processor channels are essentially
                 tools for building virtual topologies. The channel
                 classes offer functions to send to or receive from a
                 channel and get the size of the latest received
                 message. More specialized synchronization-communication
                 tools can be derived from channels. \item Global data and
                 pointers. Beside threads, the library offers basic
                 tools for developing distributed data abstractions.
                 Global data are data that can be defined at given
                 locations in the distributed memory but are visible
                 from all processors. Global pointers are a
                 generalization of C++ pointers that allow for
                 addressing global data at any place over the
                 distributed memory. As usual pointers, global pointers
                 are subjected to arithmetic and logic manipulations
                 (incrementation, dereferencing, indexing,
                 comparison...). The library provides basic operators
                 for global data and pointer definition. \item Global
                 read/write functions. Global pointer expressions
                 provide global references over the distributed memory
                 that can subsequently be used as arguments to global
                 read/write functions. These functions allow the
                 processors to get access to all global data regardless
                 of their locations over the distributed memory. In
                 their most complete form, the read/write functions
                 operate as remote procedure calls. At the programmer's
                 level, global read/write functions appear as
                 {"}one-sided{"}: a read/write operation is executed on
                 the processor that needs to read/write global data but
                 need not be explicitly handled by the processor
                 associated to the memory holding the data. \item Spread
                 and remote Arrays. Two basic distributed data
                 structures have been built in the library. Spread
                 arrays are arrays that have some of their dimensions
                 spread over the distributed memory according to a given
                 policy. Remote arrays are arrays that are defined at a
                 given place in the distributed memory but can be
                 accessed from any other. The spread and remote array
                 classes (SpreadArray and RemoteArray) provide functions
                 for global reference calculation. Global references can
                 subsequently be used as arguments to global read/write
                 functions. One can specialize global pointers to
                 operate on spread or remote arrays. The global pointer
                 class (Star class) offers distinct arithmetic and logic
                 operator sets for unassigned, spread and remote global
                 pointers. \end{itemize} The library encourages parallel
                 code
                 writing in a style that relies on the object-oriented
                 approach: first, build the abstractions that the
                 application at hand relies on; next, make an efficient
                 implementation of the abstraction; and finally, develop
                 the application on top of them. The abstractions can be
                 distributed data types derived from those built in the
                 library (spread and remote arrays: see code of the
                 segmentation algorithm provided with the library) or
                 new distributed types built in the same way or types
                 reused from other applications. This approach should
                 favor parallel code production with many desirable
                 properties such as efficiency, portability,
                 reusability, ... . \par The library uses MPI as a
                 communication interface. The current implementation
                 runs on the IBM-SP2. Two versions of the library have
                 currently been released. The first one is based on the
                 IBM C++ compiler and MPI library. The second one makes
                 use of the GNU g++ compiler and the MPICH public domain
                 version of MPI. Porting the latter to any parallel
                 machine supporting these two software systems should be
                 straightforward.",
  institution =  "Cornell Theory Center",
}

% Thakur, Gropp, and Lusk (Frontiers '99): data sieving and collective I/O
% in ROMIO.
@inproceedings{ThakurGroLus99,
  author       = {Rajeev Thakur and William Gropp and Ewing Lusk},
  title        = {Data Sieving and Collective {I/O} in {ROMIO}},
  booktitle    = {Proceedings of Frontiers '99: The 7th Symposium on the
                  Frontiers of Massively Parallel Computation},
  pages        = {182--189},
  organization = {IEEE Computer Society},
  address      = {Annapolis, Maryland},
  month        = feb # " 21--25,",
  year         = {1999},
}

% Thakur, Gropp, and Lusk (Supercomputing'98): why MPI-IO users should
% describe noncontiguous accesses with derived datatypes, with ROMIO's
% data sieving and collective I/O as the enabled optimizations.
@InProceedings{Thakur98,
  author =       "Rajeev S. Thakur and William Gropp and Ewing Lusk",
  title =        "A Case for Using {MPI}'s Derived Datatypes to Improve
                 {I/O} Performance",
  booktitle =    "Proceedings of Supercomputing'98 (CD-ROM)",
  publisher =    "ACM SIGARCH and IEEE",
  address =      "Orlando, FL",
  month =        nov,
  year =         "1998",
  keywords =     "parallel I/O, MPI-IO",
  abstract =     "MPI-IO, the I/O part of the MPI-2 standard, is a
                 promising new interface for parallel I/O. A key feature
                 of MPI-IO is that it allows users to access several
                 noncontiguous pieces of data from a file with a single
                 I/O function call by defining file views with derived
                 datatypes. We explain how critical this feature is for
                 high performance, why users must create and use derived
                 datatypes whenever possible, and how it enables
                 implementations to perform optimizations. In
                 particular, we describe two optimizations our MPI-IO
                 implementation, ROMIO, performs: data sieving and
                 collective I/O. We demonstrate the performance and
                 portability of the approach with performance results on
                 five different parallel machines: HP Exemplar, IBM SP,
                 Intel Paragon, NEC SX-4, and SGI Origin2000.",
  note =         "Argonne National Laboratory",
}

% Thakur, Gropp, and Lusk (IOPADS 1999): implementing MPI-IO portably and
% with high performance; describes the ROMIO implementation approach.
% The keyword list uses the field name "keywords", matching the other
% entries in this file.
@InProceedings{thakur:mpi-io-implement,
  author =       "Rajeev Thakur and William Gropp and Ewing Lusk",
  title =        "On Implementing {MPI-IO} Portably and with High
                 Performance",
  booktitle =    "Proceedings of the Sixth Workshop on Input/Output in
                 Parallel and Distributed Systems",
  year =         "1999",
  month =        may,
  pages =        "23--32",
  earlier =      "thakur:mpi-io-implement-tr",
  url =          "http://www.mcs.anl.gov/~thakur/papers/mpio-impl.ps",
  keywords =     "parallel I/O, multiprocessor file system interface,
                 pario-bib",
  abstract =     "We discuss the issues involved in implementing MPI-IO
                 portably on multiple machines and file systems and also
                 achieving high performance. One way to implement MPI-IO
                 portably is to implement it on top of the basic Unix
                 I/O functions ({\tt open}, {\tt lseek}, {\tt read},
                 {\tt write}, and {\tt close}), which are themselves
                 portable. We argue that this approach has limitations
                 in both functionality and performance. We instead
                 advocate an implementation approach that combines a
                 large portion of portable code and a small portion of
                 code that is optimized separately for different
                 machines and file systems. We have used such an
                 approach to develop a high-performance, portable MPI-IO
                 implementation, called ROMIO. \par In addition to basic
                 I/O functionality, we consider the issues of supporting
                 other MPI-IO features, such as 64-bit file sizes,
                 noncontiguous accesses, collective I/O, asynchronous
                 I/O, consistency and atomicity semantics, user-supplied
                 hints, shared file pointers, portable data
                 representation, and file preallocation. We describe how
                 we implemented each of these features on various
                 machines and file systems. The machines we consider are
                 the HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, SGI
                 Origin2000, and networks of workstations; and the file
                 systems we consider are HP HFS, IBM PIOFS, Intel PFS,
                 NEC SFS, SGI XFS, NFS, and any general Unix file system
                 (UFS). \par We also present our thoughts on how a file
                 system can be designed to better support MPI-IO. We
                 provide a list of features desired from a file system
                 that would help in implementing MPI-IO correctly and
                 with high performance.",
}

% Russell (Inria RR-3461): three application interfaces to the BPFS parallel
% file system (API0, CLI, and a ROMIO-based MPI-IO).
% The abstract is bilingual (French, then English). Accented letters are
% written as BibTeX special characters and quotation marks as LaTeX quotes,
% replacing HTML entities that would otherwise print literally.
@TechReport{ercim.inria.publications//RR-3461,
  pages =        "36",
  type =         "Technical Report",
  number =       "RR-3461",
  institution =  "Inria, Institut National de Recherche en Informatique
                 et en Automatique",
  title =        "Application Interfaces to {BPFS}: a Basic Parallel
                 File System",
  bibdate =      "July 1, 1998",
  author =       "Robert D. Russell",
  language =     "A",
  abstract =     "Ce rapport d{\'e}crit trois interfaces de
                 programmation de BPFS, un syst{\`e}me de fichiers
                 distribu{\'e} modulaire con{\c{c}}u pour des
                 grappes de stations de travail. Ces interfaces se
                 nomment respectivement API0, CLI et MPI-IO. API0 est la
                 premi{\`e}re d'une s{\'e}rie
                 d'interfaces d'acc{\`e}s {\`a} BPFS
                 de bas niveau. Cette interface est originale {\`a}
                 plusieurs titres : elle n'ob{\'e}it pas
                 {\`a} la philosophie classique des fichiers sous
                 UNIX, elle op{\`e}re en mode bloc et non en mode
                 caract{\`e}re, elle permet la
                 lecture/{\'e}criture de tampons
                 ``syst{\`e}mes'' et de donn{\'e}es
                 utilisateurs et enfin elle est asynchrone. De plus, des
                 op{\'e}rations de flux de donn{\'e}e periodique
                 ansi que le param{\'e}trage des tampons du
                 cot{\'e} serveurs par les clients sont disponibles.
                 Bien que l'interface API0 puisse {\^e}tre
                 utilis{\'e}e directement par n'importe quelle
                 application, deux interfaces de niveau
                 sup{\'e}rieur ont {\'e}t{\'e}
                 d{\'e}finies pour une utilisation plus
                 ais{\'e}e. CLI est une interface s'appuyant
                 sur API0 qui fournit les primitives standards
                 d'entr{\'e}e/sortie de la
                 ``libc''. Ces primitives acc{\`e}dent
                 aux fichiers parall{\`e}les g{\'e}r{\'e}s
                 par BPFS et non aux fichiers s{\'e}quentiels UNIX
                 traditionnels. La troiseme interface est une version de
                 l'interface ROMIO (elle m{\^e}me sous-ensemble
                 de l'interface standard MPI-IO) implant{\'e}e
                 au-dessus d'API0. Cette interface permet donc aux
                 applications d{\'e}velopp{\'e}es au-dessus de
                 MPI de s'executer sans modification au-dessus de
                 BPFS. \par This report describes three application
                 program
                 interfaces to BPFS, a distributed, modular parallel
                 file system designed for use on clusters of
                 workstations. These interfaces are called API0, CLI,
                 and MPI-IO. API0 is the first of an anticipated series
                 of low-level, experimental client interfaces to BPFS.
                 It is an ``unconventional'' interface in
                 many respects: it is not particularly
                 ``UNIX-like'', it is block-oriented rather than
                 byte-oriented, it reads and writes system buffers as
                 well as
                 user-defined data areas, and it is asynchronous. It
                 also provides time-regulated ``data
                 streaming'' operations and user-level control of
                 both server-side caching and per-file striping onto
                 disks. Although API0 can be used directly from a user
                 application program, it can also be used
                 ``under'' a more conventional interface, as
                 has been done for the next two interfaces. CLI is a
                 ``C Library Interface'' implemented on top
                 of API0 that exactly mimics the Standard C I/O
                 library interface, but accesses parallel files stored
                 by BPFS rather than sequential files stored by the host
                 file system. The third interface is the ROMIO version
                 of the standard MPI-IO interface which has been
                 implemented on top of API0 to support access to BPFS
                 files from parallel programs that use the Message
                 Passing Interface (MPI).",
}


% Schmuck and Haskin (FAST 2002): GPFS, a shared-disk file system for large
% computing clusters.
@InProceedings{schmuck02:IO-GPFS,
  author =       {Frank Schmuck and Roger Haskin},
  title =        {{GPFS}: A Shared-Disk File System for Large
                  Computing Clusters},
  booktitle =    {First {USENIX} Conference on File and Storage
                  Technologies (FAST)},
  year =         2002,
  month =        JAN,
  annote =       {Meeting held in Monterey, CA, January 28--30}
}


% Adve, Bagrodia, Deelman, and Sakellariou (JPDC 2002): compiler analysis
% used to speed up simulation of large message-passing programs; integrates
% the MPI-Sim simulator with the dHPF compiler.
@Article{adve02:mpi-impl,
  author =       {V. S. Adve and R. Bagrodia and E. Deelman and R. Sakellariou},
  title =        {Compiler-optimized simulation of large-scale
                  applications on high performance architectures},
  journal =      {Journal of Parallel and Distributed Computing},
  year =         2002,
  volume =       62,
  number =       3,
  pages =        {393--426},
  month =        MAR,
  abstract = {In this paper, we propose and evaluate practical, automatic
        techniques that exploit compiler analysis to facilitate
        simulation of very large message-passing systems. We use
        compiler techniques and a compiler-synthesized static task graph
        model to identify the subset of the computations whose values
        have no significant effect on the performance of the program,
        and to generate symbolic estimates of the execution times of
        these computations. For programs with regular computation and
        communication patterns, this information allows us to avoid
        executing or simulating large portions of the computational code
        during the simulation. It also allows us to avoid performing
        some of the message data transfers, while still simulating the
        message performance in detail. We have used these techniques to
        integrate the MPI-Sim parallel simulator at UCLA with the Rice
        dHPF compiler infrastructure. We evaluate the accuracy and
        benefits of these techniques for three standard message-passing
        benchmarks on a wide range of problem and system sizes. The
        optimized simulator has errors of less than 16\% compared with
        direct program measurement in all the cases we studied, and
        typically much smaller errors. Furthermore, it requires factors
        of 5 to 2000 less memory and up to a factor of 10 less time to
        execute than the original simulator. These dramatic savings
        allow us to simulate regular message-passing programs on systems
        and problem sizes 10 to 100 times larger than is possible with
        the original simulator, or other current state-of-the-art
        simulators.}
}


% Tanaka et al. (J. Appl. Cryst. 2002): ENIGMA, a maximum-entropy method
% package written in Fortran 90 and MPI.
@Article{tana02:mpi-app,
  author =       {H. Tanaka and M. Takata and E. Nishibori and K. Kato
                  and T. Iishi and M. Sakata},
  title =        {{ENIGMA}: maximum-entropy method program package for
                  huge systems},
  journal =      {Journal of Applied Crystallography},
  year =         2002,
  volume =       35,
  pages =        {282--286},
  month =        APR,
  abstract = {ENIGMA (Electron and Nuclear Image Generator by Max-ent
        Analysis) is a program package to evaluate three-dimensional
        electron and nuclear density from X-ray and neutron diffraction
        data by using the maximum-entropy method (MEM). Compared with
        the previous program package MEED, ENIGMA saves computing time
        and frees memory space at the same time by employing parallel
        data processing. The fast Fourier transformation (FFT) technique
        is also implemented. As a consequence of these improvements, the
        MEM analysis by ENIGMA becomes applicable to huge systems, such
        as proteins and polymers, when the phased structure factors are
        provided. The package is transferable to a wide variety of
        parallel computers, because it is written in Fortran 90 and a
        standard message-passing interface (MPI).}
}

% Bosilca, Fedak, and Cappello (FGCS 2002): OVM, an RPC-based client-server
% execution model; performance is compared against MPI on NAS kernels.
@Article{bosi02:mpi-impl,
  author =       {G. Bosilca and G. Fedak and F. Cappello},
  title =        {{OVM}: Out-of-order execution parallel virtual machine},
  journal =      {Future Generation Computer Systems},
  year =         2002,
  volume =       18,
  number =       4,
  pages =        {525--537},
  month =        MAR,
  abstract = {High performance computing on parallel architectures currently
        uses different approaches depending on the hardware memory model
        of the architecture, the abstraction level of the programming
        environment and the nature of the application. In this article,
        we introduce an original client-server execution model based on
        RPCs called out-of-order parallel virtual machine (OVM). OVM
        aims to provide three main features: portability through a
        unique memory model, load-balancing using a plug-in support and
        high performance provided by several optimizations. The main
        optimizations are: non-blocking RPCs, data-flow management,
        persistent and non-persistent data, static data set
        distribution, dynamic scheduling and asynchronous global
        operations. We present OVM general architecture and demonstrate
        high performance for regular parallel applications, a parallel
        application with load balancing needs and a parallel application
        with real-time constraints. We firstly compare the performance
        of OVM and MPI for three kernels of the NAS 2.3. Then we
        illustrate the performance capability of OVM for a large
        real-life application that needs a load balancing support called
        AIRES. Finally, we present the performance of a real-time
        version of the PovRay ray-tracer demonstrating the reactiveness
        of OVM.}
}

% Larson and Nasstrom (Atmospheric Environment 2002): OpenMP and MPI
% parallelization of a Lagrangian stochastic atmospheric dispersion model.
@Article{lars02:mpi-app,
  author =       {D. J. Larson and J. S. Nasstrom},
  title =        {Shared- and distributed-memory parallelization of a
                  {Lagrangian} atmospheric dispersion model},
  journal =      {Atmospheric Environment},
  year =         2002,
  volume =       36,
  number =       9,
  pages =        {1559--1564},
  month =        MAR,
  abstract = {This paper describes parallelization of a 3-D Lagrangian
        stochastic atmospheric dispersion model using both distributed-
        and shared-memory methods. Shared-memory parallelism is
        implemented through the use of OpenMP compiler directives.
        Distributed-memory parallelism relies on the MPI
        message-passing library. One or both (using MPI for inter-node
        and OpenMP for intra-node communication) of the parallel modes
        can be used depending upon the requirements of the problem and
        the computational platform available. The distributed-memory
        version achieves a nearly linear decrease in execution time as
        the number of processors is increased. As the number of
        particles per processor is lowered, performance is limited by
        the decrease in work per processor and by the need to produce
        one set of output files. The shared-memory version achieves a
        speedup factor of $\sim$1.4 running on machines with four
        processors.}
}

% Choe et al. (Nucl. Phys. B Proc. Suppl. 2002): free Fortran 90 source codes
% for lattice QCD, parallelized with MPI.
@article{choe02:mpi-app,
  author   = "S. Choe and S. Muroya and A. Nakamura and C. Nonaka and
              T. Saito and R. Shoji",
  title    = "Lattice tool kit in {Fortran90}",
  journal  = "Nuclear Physics B-Proceedings Supplements",
  volume   = "106",
  pages    = "1037--1039",
  month    = mar,
  year     = "2002",
  abstract = "We report a project to provide a set of free source codes for
              lattice QCD. The programs may be used as fundamental blocks when
              one wants to construct his/her own QCD codes. They are written
              in Fortran 90 with use of MODULE, so that algorithms can
              transparently be seen. MPI is used for parallelization. We are
              also constructing a proto-type of QCD-GRID where one can try to
              run the code.",
}

% Bach et al. (Modelling Simul. Mater. Sci. Eng. 2002): Monte-Carlo
% simulation of dislocation networks, parallelized for multiprocessor
% machines.
@article{bach02:mpi-app,
  author   = "F. W. Bach and H. Haferkamp and A. Kuhlmeyer and M. Niemeyer",
  title    = "Monte-Carlo simulation of dislocation networks using the
              message passing interface",
  journal  = "Modelling and Simulation in Materials Science and Engineering",
  volume   = "10",
  number   = "2",
  pages    = "215--225",
  month    = mar,
  year     = "2002",
  abstract = "The dynamic behaviour of one-dimensional lattice defects in
              metallic materials is responsible for most of the metals'
              mechanical properties. This behaviour is determined by all the
              interaction mechanisms of dislocations with other lattice
              defects such as further dislocations, precipitations and grain
              boundaries. A two-dimensional simulation model implementing
              these mechanisms based on a Monte-Carlo method for an N-body
              dislocation system is presented. The influences of temperature
              and external stress can also be taken into account. In this
              model, the different defects will be reduced to a polymorphic
              structure according to the object-oriented paradigm. To improve
              computing performance, the model was parallelized for
              multiprocessor machines in collaboration with the Edinburgh
              Parallel Computing Centre (EPCC) under the European Union TRACS
              programme (Training and Research on Advanced Computing Systems).
              Results are discussed under consideration of different
              parameters.",
}

% Barker et al. (Concurrency and Computation 2002): DMCS, a one-sided
% communication library layered on LAPI or on message-passing libraries
% such as MPI.
@Article{bark02:mpi-app,
  author =       {K. Barker and N. Chrisochoides and J. Dobbelaere and
                  D. Nave and K. Pingali},
  title =        {Data Movement and Control Substrate for parallel
                  adaptive applications},
  journal =      {Concurrency and Computation-Practice \& Experience},
  year =         2002,
  volume =       14,
  number =       2,
  pages =        {77--101},
  month =        FEB,
  abstract = {In this paper, we present the Data Movement and Control
        Substrate (DMCS), a library which implements low-latency
        one-sided communication primitives for use in parallel adaptive
        and irregular applications. DMCS is built on top of low-level,
        vendor-specific communication subsystems such as LAPI (Low-level
        Application Programme Interface) for IBM SP machines, as well as
        on widely available message-passing libraries like MPI for
        clusters of workstations and PCs. DMCS adds a small overhead to
        the communication operations provided by the lower communication
        system. In return, DMCS provides a flexible and easy to
        understand application program interface for one-sided
        communication operations. Furthermore, DMCS is designed so that
        it can be easily ported and maintained by non-experts.}
}

% Lei, Yin, Wang, and Tong (Nucl. Instrum. Methods A 2002): optimization of
% a charged particle optical system using MPI on networks of workstations.
@Article{lei02:mpi-app,
  author =       {W. Lei and H. C. Yin and B. P. Wang and L. S. Tong},
  title =        {Optimization of a particle optical system in a
                  multiprocessor environment},
  journal =      {Nuclear Instruments \& Methods in Physics Research Section
      A-Accelerators Spectrometers Detectors and Associated Equipment},
  year =         2002,
  volume =       479,
  number =       {2--3},
  pages =        {611--617},
  month =        MAR,
  abstract = {In the design of a charged particle optical system, many
        geometrical and electric parameters have to be optimized to
        improve the performance characteristics. In every optimization
        cycle, the electromagnetic field and particle trajectories have
        to be calculated. Therefore, the optimization of a charged
        particle optical system is limited by the computer resources
        seriously. Apart from this, numerical errors of calculation may
        also influence the convergence of merit function. This article
        studies how to improve the optimization of charged particle
        optical systems. A new method is used to determine the gradient
        matrix. With this method, the accuracy of the Jacobian matrix
        can be improved. In this paper, the charged particle optical
        system is optimized with a Message Passing Interface (MPI). The
        electromagnetic field, particle trajectories and gradients of
        optimization variables are calculated on networks of
        workstations. Therefore, the speed of optimization has been
        increased largely. It is possible to design a complicated
        charged particle optical system with optimum quality on a MPI
        environment. Finally, an electron gun for a cathode ray tube has
        been optimized on a MPI environment to verify the method proposed
        in this paper.}
}

@Article{khas02:mpi-app,
  author = 	 {S. A. Khashan and D. O. Ogbe and T. M. Jiang},
  title = 	 {Development and optimization of parallel code for
                  large-scale petroleum reservoir simulation},
  journal = 	 {Journal of Canadian Petroleum Technology},
  year = 	 2002,
  volume =	 41,
  number =	 4,
  pages =	 {33--38},
  month =	 APR,
  abstract = {This paper discusses the use of large field-scale reservoir
        simulation to model multiphase fluid flow processes that occur
        in giant oil reservoirs. The goal of large-scale studies is to
        model fluid flow with sufficient details to account for
        reservoir heterogeneity, various reservoir-wellbore
        configurations, and complex fluid-rock interactions. In this
        paper, we developed a black-oil reservoir simulator for
        distributed-memory parallel environment. We ported serial code
        of a black-oil model to the CRAY T3E and IBM SP2 systems. We
        analysed the code and benchmarked the performance. To
        parallelize the code, we used a domain decomposition algorithm,
        whereby the reservoir is divided into several subdomains, with
        each subdomain assigned to a separate processor element (PE).
        The message-passing interface (MPI) is used to exchange
        information across subdomains. We validated the parallel
        simulator using data from the Society of Petroleum Engineers
        comparative solution projects. Because the linear equation
        solver accounts for over 90\% of the CPU time in a typical
        reservoir simulation run, we evaluated the performance of
        several parallel algorithms in the project, including LSOR,
        Gauss-Siedel (GS) and strongly implicit procedure (SIP). We
        found that the convergence behaviour of the solver depends on
        the number of processors and on the permeability anisotropy of
        the reservoir. For the problems we tested, the SIP algorithms
        provided the best performance. We compared the computational
        efficiency of the parallel code against the serial code using
        models containing up to 350,000 grid blocks in 4-, 8-, 16- 32-,
        64-, and 80-PE environments. The paper discusses programming and
        computational performance issues in large-scale reservoir
        simulation for parallel systems.}
}

 
@Article{alia02a:mpi-app,
  author = 	 {S. Aliabadi and A. Johnson and B. Zellars and
                  A. Abatan and C. Berger},
  title = 	 {Parallel simulation of flows in open channels},
  journal = 	 {Future Generation Computer Systems},
  year = 	 2002,
  volume =	 18,
  number =	 5,
  pages =	 {627--637},
  month =	 APR,
  abstract = {In this project, we apply our advanced free-surface flow solver
        to simulate flow in open channels at supercritical conditions.
        The finite element method is used to discretize the governing
        equations over fixed meshes. The finite element formulations
        have been implemented in parallel using message passing
        interface (MPI) libraries. Linear speed up performance is
        achieved. The computations are carried out for a case study
        involving flow in contraction channel at supercritical
        condition. The numerical results compare very well with
        experimental data.}
}


@Article{berz02:mpi-app,
  author = 	 {P. K. Berzigiyarov and V. A. Zayets and I. Y. Ginzburg and
                  V. F. Razumov and E. F. Sheka},
  title = 	 {{NANOPACK}: Parallel codes for semiempirical quantum
                  chemical calculations of large systems in the sp- and
                  spd-basis},
  journal = 	 {International Journal of Quantum Chemistry},
  year = 	 2002,
  volume =	 88,
  number =	 4,
  pages =	 {449--462},
  month =	 JUN,
  abstract = {A parallel implementation of the conventionally used NDDO (MNDO,
        AM1, PM3, CLUSTER-Z1) and modified NDDO-WF (CLUSTER-Z2)
        techniques for semiempirical quantum chemical calculations of
        large molecular systems in the sp- and spd-basis, respectively,
        is described. The atom-pair distribution of data over processors
        forms the basis of the parallelization. The technological
        aspects of designing scalable parallel calculations on
        supercomputers (using ScaLAPACK and MPI libraries) are
        discussed. The scaling of individual algorithms and the entire
        package was carried out for model systems with 894, 1920, and
        2014 atomic orbitals. The package speed-up provided by different
        multiprocessor systems involving a cluster of Intel PIII
        processors, Alpha-21264-processor-built machine MBC-1000M, and
        Cray-T3E is analyzed. The effect of computer characteristics on
        the package performance is discussed.}
}

@Article{gosw02:mpi-use,
  author = 	 {D. Goswami and A. Singh and B. R. Preiss},
  title = 	 {From design patterns to parallel architectural skeletons},
  journal = 	 {Journal of Parallel and Distributed Computing},
  year = 	 2002,
  volume =	 62,
  number =	 4,
  pages =	 {669--695},
  month =	 APR,
  abstract = {The concept of design patterns has been extensively studied and
        applied in the context of object-oriented software design.
        Similar ideas are being explored in other areas of computing as
        well. Over the past several years, researchers have been
        experimenting with the feasibility of employing design-patterns
        related concepts in the parallel computing domain. In the past,
        several pattern-based systems have been developed with the
        intention to facilitate faster parallel application development
        through the use of preimplemented and reusable components that
        are based on frequently used parallel computing design patterns.
        However, most of these systems face several serious limitations
        such as limited flexibility, zero extensibility, and the ad
        hoc nature of their components. Lack of flexibility in a parallel
        programming system limits a programmer to using only the
        high-level components provided by the system. Lack of
        extensibility here refers to the fact that most of the existing
        pattern-based parallel programming systems come with a set of
        prebuilt patterns integrated into the system. However, the
        system provides no obvious way of increasing the repertoire of
        patterns when need arises. Also, most of these systems do not
        offer any generic view of a parallel computing pattern, a fact
        which may be at the root of several of their shortcomings. This
        research proposes a generic (i.e., pattern- and
        application-independent) model for realizing and using parallel
        design patterns. The term ``parallel architectural skeleton'' is
        used to represent the set of generic attributes associated with
        a pattern. The Parallel Architectural Skeleton Model (PASM) is
        based on the message-passing paradigm, which makes it
        suitable for a LAN of workstations and PCs. The model is flexible
        as it allows the intermixing of high-level patterns with
        low-level message-passing primitives. An object-oriented and
        library-based implementation of the model has been completed
        using C++ and MPI, without necessitating any language extension.
        The generic model and the library-based implementation allow new
        patterns to be defined and included into the system. The
        skeleton-library serves as a framework for the systematic,
        hierarchical development of network-oriented parallel
        applications.}
}


@Article{anku02:mpi-app,
  author = 	 {A. L. Ankudinov and C. E. Bouldin and J. J. Rehr and J. Sims
                  and H. Hung},
  title = 	 {Parallel calculation of electron multiple scattering using
                  {L}anczos algorithms},
  journal = 	 {Physical Review B},
  year = 	 2002,
  volume =	 65,
  number =	 10,
  pages =	 {104107},
  month =	 MAR,
  abstract = {Real space multiple scattering calculations of the electronic
        density of states and x-ray spectra in solids typically scale as
        the cube of the system and basis set size, and hence are highly
        demanding computationally. For example, such x-ray absorption
        near edge structure (XANES) calculations typically require
        clusters of order $N_R$ atoms and s, p, and d states for
        convergence, with $N_R$ between about $10^2$--$10^3$; for this
        case about $10^2$ inversions of $9N_R \times 9N_R$ matrices are
        needed, one for each energy point. We discuss here two ways to
        speed up these calculations: (1) message passing interface (MPI)
        parallel processing and (2) fast, Lanczos multiple scattering
        algorithms. Together these algorithms can reduce computation
        times typically by two orders of magnitude. These are both
        implemented in a generalization of the ab initio self-consistent
        FEFF8 code, which thus makes practical XANES calculations in
        complex systems with of order $10^3$ atoms. The Lanczos
        algorithm also yields a natural crossover between full and
        finite-order multiple scattering with increasing energy, thus
        differentiating the extended and near-edge regimes.}
}

@Article{gall02:mpi-app,
  author = 	 {J. A. Gallud and J. Garcia-Consuegra and A. Martinez},
  title = 	 {Distributed processing of remotely sensed {Landsat-TM}
                  imagery using {MPI}},
  journal = 	 {Cluster Computing},
  year = 	 2002,
  pages =	 {15--22}
}

@Article{alia02:mpi-app,
  author = 	 {S. Aliabadi and A. Abatan and A. Johnson and J. Abedi and
                  Y. Yeboah and K. Bota},
  title = 	 {Stabilized finite element formulation of buoyancy driven
                  incompressible flows},
  journal = 	 {Communications in Numerical Methods in Engineering},
  year = 	 2002,
  volume =	 18,
  number =	 5,
  pages =	 {315--324},
  month =	 MAY,
  abstract = {Streamline-upwind/Petrov-Galerkin finite element method is
        developed for buoyancy-driven incompressible flows with heat and
        mass transfer. The stabilized finite element formulations are
        implemented in parallel using message passing interface
        libraries. To measure the accuracy of the method, we solve a 2D
        numerical example of natural convection flows at moderate to
        high Rayleigh numbers. The 3D applications include the
        dispersion of smoke from a chimney and within a stadium.}
}

@Article{mo02:mpi-app,
  author = 	 {Z. Y. Mo and J. L. Zhang and Q. D. Cai},
  title = 	 {Dynamic load balancing for short-range parallel molecular
                  dynamics simulations},
  journal = 	 {International Journal of Computer Mathematics},
  year = 	 2002,
  volume =	 79,
  number =	 2,
  pages =	 {165--177},
  month =	 FEB,
  abstract = {The iterative Multilevel Averaging Weight (MAW) algorithm
        presented in paper [1] is modified to solve the dynamic load
        imbalance problems arising from the two-dimensional short-range
        parallel molecular dynamics simulations in this paper. Firstly,
        five types of load balancing models are given which allows
        detailed studies of the algorithm. In particular, it shows that
        for strip decomposition, the number of iteration needs for the
        system to converge from an initially unbalanced state to a well
        balanced state is bounded by 2logP, where P is the number of
        processors. This result can permit the algorithm to efficiently
        track fluctuations in the molecular density as the simulation
        progresses, and is much better than that of the Cellular
        Automaton Diffusion (CAD) scheme presented in paper [2].
        Secondly, we apply MAW algorithm to solve the load imbalance
        problem in the parallel molecular dynamics simulation for higher
        speed wall collisions. At last, the numerical experimental
        results and parallel computing performance with MPI-1.2 under a
        PC-Cluster consists of 64 Pentium-III 500 MHz nodes connected by
        100 Mbps switches are given in this paper.}
}
 

@Article{deit02:parallel-lang,
  author = 	 {S. J. Deitz and B. L. Chamberlain and L. Snyder},
  title = 	 {High-level language support for user-defined reductions},
  journal = 	 {Journal of Supercomputing},
  year = 	 2002,
  volume =	 23,
  number =	 1,
  pages =	 {23--37},
  month =	 AUG,
  abstract = {The optimized handling of reductions on parallel supercomputers
        or clusters of workstations is critical to high performance
        because reductions are common in scientific codes and a
        potential source of bottlenecks. Yet in many high-level
        languages, a mechanism for writing efficient reductions remains
        surprisingly absent. Further, when such mechanisms do exist,
        they often do not provide the flexibility a programmer needs to
        achieve a desirable level of performance. In this paper, we
        present a new language construct for arbitrary reductions that
        lets a programmer achieve a level of performance equal to that
        achievable with the highly flexible, but low-level combination
        of Fortran and MPI. We have implemented this construct in the
        ZPL language and evaluate it in the context of the
        initialization of the NAS MG benchmark. We show a 45 times
        speedup over the same code written in ZPL without this construct.
        In addition, performance on a large number of processors
        surpasses that achieved in the NAS implementation showing that
        our mechanism provides programmers with the needed flexibility.}
}


@Article{mohr02:mpi-openmp,
  author = 	 {B. Mohr and A. D. Malony and S. Shende and F. Wolf},
  title = 	 {Design and prototype of a performance tool interface for
                  {OpenMP}},
  journal = 	 {Journal of Supercomputing},
  year = 	 2002,
  volume =	 23,
  number =	 1,
  pages =	 {105--128},
  month =	 AUG,
  abstract = {This paper proposes a performance tools interface for OpenMP,
        similar in spirit to the MPI profiling interface in its intent
        to define a clear and portable API that makes OpenMP execution
        events visible to runtime performance tools. We present our
        design using a source-level instrumentation approach based on
        OpenMP directive rewriting. Rules to instrument each directive
        and their combination are applied to generate calls to the
        interface consistent with directive semantics and to pass
        context information (e.g., source code locations) in a portable
        and efficient way. Our proposed OpenMP performance API further
        allows user functions and arbitrary code regions to be marked
        and performance measurement to be controlled using new OpenMP
        directives. To prototype the proposed OpenMP performance
        interface, we have developed compatible performance libraries
        for the Expert automatic event trace analyzer [17, 18] and the
        TAU performance analysis framework [13]. The directive
        instrumentation transformations we define are implemented in a
        source-to-source translation tool called OPARI. Application
        examples are presented for both Expert and TAU to show the
        OpenMP performance interface and OPARI instrumentation tool in
        operation. When used together with the MPI profiling interface
        (as the examples also demonstrate), our proposed approach
        provides a portable and robust solution to performance analysis
        of OpenMP and mixed-mode (OpenMP+MPI) applications.}
}
 
@Article{nak02:mpi-app,
  author = 	 {K. Nakajima and H. Okuda},
  title = 	 {Parallel iterative solvers for unstructured grids using a
                  directive/{MPI} hybrid programming model for the {GeoFEM}
                  platform on {SMP} cluster},
  journal = 	 {Concurrency and Computation-Practice \& Experience},
  year = 	 2002,
  volume =	 14,
  number =	 {6--7},
  pages =	 {411--429},
  month =	 may-jun,
  abstract = {In this paper, an efficient parallel iterative method for
        unstructured grids developed by the authors for shared memory
        symmetric multiprocessor (SMP) cluster architectures on the
        GeoFEM platform is presented. The method is based on a
        three-level hybrid parallel programming model, including message
        passing for inter-SMP node communication, loop directives for
        intra-SMP node parallelization and vectorization for each
        processing element (PE). Simple 3D elastic linear problems with
        more than $10^8$ degrees of freedom have been solved by 3 x 3 block
        ICCG(0) with additive Schwarz domain decomposition and
        PDJDS/CM-RCM reordering on 16 SMP nodes of a Hitachi SR8000
        parallel computer, achieving a performance of 20 Gflops. The
        PDJDS/CM-RCM reordering method provides excellent vector and
        parallel performance in SMP nodes, and is essential for
        parallelization of forward/backward substitution in
        IC/ILU factorization with global data dependency. The method
        developed was also tested on an NEC SX-4 and attained 969 Mflops
        (48.5\% of peak performance) using a single processor. The
        additive Schwarz domain decomposition method provides robustness
        for the GeoFEM parallel iterative solvers with localized
        preconditioning.}
}


@Article{tri02:mpi-app,
  author = 	 {N. Trivedi and J. Bischof and S. Davis and K. Pedretti and
                  T. E. Scheetz and T. A. Braun and C. A. Roberts and
                  N. L. Robinson and V. C. Sheffield and A. B. Soares and
                  T. L. Casavant},
  title = 	 {Parallel creation of non-redundant gene indices from partial
                  {mRNA} transcripts},
  journal = 	 {Future Generation Computer Systems},
  year = 	 2002,
  volume =	 18,
  number =	 6,
  pages =	 {863--870},
  month =	 MAY,
  abstract = {This paper describes the UIcluster software tool, which
        partitions expressed sequence tag (EST) sequences and other
        genetic sequences into ``clusters'' based on sequence similarity.
        Ideally, each cluster will contain sequences that all represent
        the same gene. UIcluster has been developed over the course of 4
        years to solve this problem efficiently and accurately for large
        data sets consisting of tens or hundreds of thousands of EST
        sequences. The latest version of the application has been
        parallelized using the MPI standard. Both the computation and
        memory requirements of the program can be distributed among
        multiple (possibly distributed) UNIX processes.}
}


@Article{stan02:mpi-model,
  author = 	 {N. Stankovic and K. Zhang},
  title = 	 {A distributed parallel programming framework},
  journal = 	 {IEEE Transactions on Software Engineering},
  year = 	 2002,
  volume =	 28,
  number =	 5,
  pages =	 {478--493},
  month =	 MAY,
  abstract = {This paper presents Visper, a novel object-oriented framework
        that identifies and enhances common services and programming
        primitives, and implements a generic set of classes applicable
        to multiple programming models in a distributed environment.
        Groups of objects, which can be programmed in a uniform and
        transparent manner, and agent-based distributed system
        management, are also featured in Visper. A prototype system is
        designed and implemented in Java, with a number of visual
        utilities that facilitate program development and portability.
        As a use case, Visper integrates parallel programming in an
        MPI-like message-passing paradigm at a high level with services
        such as checkpointing and fault tolerance at a lower level. The
        paper reports a range of performance evaluation on the prototype
        and compares it to related works.}
}

@Article{ong02:mpi-impl,
  author = 	 {E. Ong},
  title = 	 {{MPI} {Ruby}: Scripting in a parallel environment},
  journal = 	 {Computing in Science \& Engineering},
  year = 	 2002,
  volume =	 4,
  number =	 4,
  pages =	 {78--82},
  month =	 jul-aug
}

@Article{tad02:mpi-app,
  author = 	 {M. Tadjfar and R. Himeno},
  title = 	 {Time-accurate, parallel, multi-zone, multi-block solver to
                  study the human cardio-vascular system},
  journal = 	 {Biorheology},
  year = 	 2002,
  volume =	 39,
  number =	 {3--4},
  pages =	 {379--384},
  abstract = {A parallel, time-accurate flow solver is devised to study the
        human cardio-vascular system. The solver is capable of dealing
        with moving boundaries and moving grids. It is designed to
        handle complex, three-dimensional vascular systems. The
        computational domain is divided into multiple block subdomains.
        At each cross section the plane is divided into twelve sub-zones
        to allow flexibility for handling complex geometries and, if
        needed, appropriate parallel data partitioning. The unsteady,
        three-dimensional, incompressible Navier-Stokes equations are
        solved numerically. A second-order in time and third-order upwind
        finite volume method for solving time-accurate incompressible
        flows based on pseudo-compressibility and dual time-stepping
        technique is used. For parallel execution, the flow domain is
        partitioned. Communication between the subdomains of the flow on
        Riken's VPP/700E supercomputer is implemented using MPI
        message-passing library. A series of numerical simulations of
        biologically relevant flows is used to validate this code.}
}


@Article{anon02:mpi-models,
  author  = {Anonymous},
  title   = {Message passing},
  journal = {Parallel and Distributed Computing: A Survey of Models,
             Paradigms and Approaches},
  pages   = {95--109},
  year    = 2002
}

 
@Article{sei02:mpi-tools,
  author = 	 {F. J. Seinstra and D. Koelma},
  title = 	 {{P-3PC}: {A} point-to-point communication model for
                  automatic and optimal decomposition of regular domain
                  problems},
  journal = 	 {IEEE Transactions on Parallel and Distributed Systems},
  year = 	 2002,
  volume =	 13,
  number =	 7,
  pages =	 {758--768},
  month =	 JUL,
  abstract = {One of the most fundamental problems automatic parallelization
        tools are confronted with is to find an optimal domain
        decomposition for a given application. For regular domain
        problems (such as simple matrix manipulations), this task may
        seem trivial. However, communication costs in message passing
        programs often significantly depend on the memory layout of data
        blocks to be transmitted. As a consequence, straightforward
        domain decompositions may be nonoptimal. In this paper, we
        introduce a new point-to-point communication model (called
        P-3PC, or the ``Parameterized model based on the Three Paths of
        Communication'') that is specifically designed to overcome this
        problem. In comparison with related models (e.g., LogGP) P-3PC
        is similar in complexity, but more accurate in many situations.
        Although the model is aimed at MPI's standard point-to-point
        operations, it is applicable to similar message passing
        definitions as well. The effectiveness of the model is tested in
        a framework for automatic parallelization of low level image
        processing applications. Experiments are performed on two
        Beowulf-type systems, each having a different interconnection
        network, and a different MPI implementation. Results show that,
        where other models frequently fail, P-3PC correctly predicts the
        communication costs related to any type of domain decomposition.}
}

@Article{vit02:mpi-app,
  author = 	 {J. E. Vitela and U. R. Hanebutte and J. L. Gordillo and
                  L. M. Cortina},
  title = 	 {Comparative performance study of parallel programming models
                  in a neural network training code},
  journal = 	 {International Journal of Modern Physics C},
  year = 	 2002,
  volume =	 13,
  number =	 4,
  pages =	 {429--452},
  month =	 MAY,
  abstract = {This paper discusses the performance studies of a coarse grained
        parallel neural network training code for control of nonlinear
        dynamical systems, implemented in the shared memory and message
        passing parallel programming environments OpenMP and MPI,
        respectively. In addition, these codes are compared to an
        implementation utilizing SHMEM the native data passing SGI/Cray
        environment for parallel programming. The multiprocessor
        platform used in the study is a SGI/Cray Origin 2000 with up to
        32 processors, which supports all these programming models
        efficiently. The dynamical system used in this study is a
        nonlinear OD model of a thermonuclear fusion reactor with the
        EDA-ITER design parameters. The results show that OpenMP
        outperforms the other two environments when large number of
        processors are involved, while yielding a similar or a slightly
        poorer behavior for small number of processors. As expected the
        native SGI/Cray environment outperforms MPI for the entire range
        of processors used. Reasons for the observed performance are
        given. The parallel efficiency of the code is always greater
        than 60\% regardless of the parallel environment for the range of
        processors used in this study.}
}

@Article{wan02:mpi-app,
  author = 	 {Y. Wang and A. M. Cuitino},
  title = 	 {Full-field measurements of heterogeneous deformation
                  patterns on polymeric foams using digital image
                  correlation},
  journal = 	 {International Journal of Solids and Structures},
  year = 	 2002,
  volume =	 39,
  number =	 {13--14},
  pages =	 {3777--3796},
  month =	 jun-jul,
  abstract = {The ability of a digital image correlation technique to capture
        the heterogeneous deformation fields appearing during
        compression of ultra-light open-cell foams is presented in this
        article. Quantitative characterization of these fields is of
        importance to understand the mechanical properties of
        the collapse process and the energy dissipation patterns in this
        type of materials. The present algorithm is formulated in the
        context of multi-variable non-linear optimization where a merit
        function based on a local average of the deformation mapping is
        minimized implicitly. A parallel implementation utilizing
        message passing interface for distributed-memory architectures
        is also discussed. Estimates for optimal size of the correlation
        window based on measurement accuracy and spatial resolution are
        provided. This technique is employed to reveal the evolution of
        the deformation texture on the surface of open-cell polyurethane
        foam samples of different relative densities. Histograms of the
        evolution of surface deformation are extracted, showing the
        transition from unimodal to bimodal and back to unimodal. These
        results support the interpretation that the collapse of light
        open-cell foams occurs as a phase transition phenomenon.}
}

@Article{mor02:mpi-app,
  author = 	 {H. Moritsch and S. Benkner},
  title = 	 {High-performance numerical pricing methods},
  journal = 	 {Concurrency and Computation-Practice \& Experience},
  year = 	 2002,
  volume =	 14,
  number =	 {8--9},
  pages =	 {665--678},
  month =	 jul-aug,
  abstract = {The pricing of financial derivatives is an important field in
        finance and constitutes a major component of financial
        management applications. The uncertainty of future events often
        makes analytic approaches infeasible and, hence, time-consuming
        numerical simulations are required. In the Aurora Financial
        Management System, pricing is performed on the basis of lattice
        representations of stochastic multidimensional scenario
        processes using the Monte Carlo simulation and Backward Induction
        methods, the latter allowing for the exploitation of
        shared-memory parallelism. We present the parallelization of a
        Backward Induction numerical pricing kernel on a cluster of SMPs
        using HPF+, an extended version of High-Performance Fortran.
        Based on language extensions for specifying a hierarchical
        mapping of data onto an SMP cluster, the compiler generates a
        hybrid-parallel program combining distributed-memory and
        shared-memory parallelism. We outline the parallelization
        strategy adopted by the VFC compiler and present an experimental
        evaluation of the pricing kernel on an NEC SX-5 vector
        supercomputer and a Linux SMP cluster, comparing a pure MPI
        version to a hybrid-parallel MPI/OpenMP version.}
}

@Article{deK02:mpi-app,
  author = 	 {J. de Kloe and A. van der Steen and H. Oksuzoglu and
                  H. Dijkstra},
  title = 	 {A fully implicit parallel ocean model using {MUMPS}},
  journal = 	 {Journal of Supercomputing},
  year = 	 2002,
  volume =	 23,
  number =	 2,
  pages =	 {167--183},
  month =	 SEP,
  abstract = {The formulation, implementation and performance of a new fully
        implicit parallel model of the ocean circulation is presented.
        Within this model, steady states can be traced in one of the
        control parameters. In addition, transient flows can be computed
        using relatively (compared to traditional ocean models) large
        time steps such that long integration times can be reached. The
        discretized equations of the ocean model are solved by the
        Newton-Raphson technique and the emerging linear systems are
        solved by a (MPI) version of the MUltifrontal Massively Parallel
        Solver. The performance of the code on an SGI Origin 2000
        platform is presented here using typical results for a sector
        ocean flow.}
}

 
@Article{Oku:mpi-hpf-app,
  author = 	 {H. Okuda and N. Anan},
  title = 	 {Optimization of element-by-element {FEM} in {HPF} 1.1},
  journal = 	 {Concurrency and Computation-Practice \& Experience},
  year = 	 2002,
  volume =	 14,
  number =	 {8--9},
  pages =	 {647--663},
  month =	 jul-aug,
  abstract = {In this study, Poisson's equation is numerically evaluated by
        the element-by-element (EBE) finite-element method in a parallel
        environment using HPF 1.1 (High-Performance Fortran). In order
        to achieve high parallel efficiency, the data structures have
        been altered to node-based data instead of mixtures of node- and
        element-based data, representing a node-based EBE finite-element
        scheme (nEBE). The parallel machine used in this study was the
        NEC SX-4, and experiments were performed on a single node having
        32 processors sharing common memory. The HPF compiler used in
        the experiments is HPF/SX Rev 2.0 released in 1997 (unofficial),
        which supports HPF 1.1. Models containing approximately 200000
        and 1500000 degrees of freedom were analyzed in order to
        evaluate the method. The calculation time, parallel efficiency,
        and memory used were compared. The performance of HPF in the
        conjugate gradient solver for the large model, using the NEC SX-4
        compiler option -noshrunk, was about 85\% that of the message
        passing interface.}
}

@Article{eke02:mpi-app,
  author = 	 {T. Ekevid and N. E. Wiberg},
  title = 	 {A comparison of parallel implementation of explicit {DG} and
                  central difference method},
  journal = 	 {Communications in Numerical Methods in Engineering},
  year = 	 2002,
  volume =	 18,
  number =	 8,
  pages =	 {585--597},
  month =	 AUG,
  abstract = {Massive parallel computers have become more attractive for
        advanced numerical simulations since standard libraries for
        communication and synchronization; for example MPI have
        facilitated program development. The present paper discusses two
        parallel explicit time integration methods for wave propagation
        problems; the central difference method and the explicit version
        of the PI-PI discontinuous Galerkin (DG) method. Based on the
        MIMD model, where data decomposition is accomplished by
        element-based grid partitioning, parallel versions of both
        algorithms have been implemented, using the same paradigm for
        inter-process communication. Numerical examples are illustrated
        and the achieved performance of the algorithms is discussed.}
}

@Article{LOlik2002,
       author   = "L. Oliker and X. Y. Li and P. Husbands and R. Biswas",
       title    = "Effects of ordering strategies and programming paradigms on sparse matrix computations",
       journal  = "SIAM Review",
       volume   = "44",
       number   = "3",
       pages    = "373--393",
       month    = SEP,
       year     = "2002",
       abstract = "The conjugate gradient (CG) algorithm is perhaps the best-known
        iterative technique for solving sparse linear systems that are
        symmetric and positive definite. For systems that are ill
        conditioned, it is often necessary to use a preconditioning
        technique. In this paper, we investigate the effects of various
        ordering and partitioning strategies on the performance of
        parallel CG and ILU(0) preconditioned CG (PCG) using different
        programming paradigms and architectures. Results show that for
        this class of applications, ordering significantly improves
        overall performance on both distributed and distributed
        shared-memory systems, cache reuse may be more important than
        reducing communication, it is possible to achieve
        message-passing performance using shared-memory constructs
        through careful data ordering and distribution, and a hybrid
        MPI+OpenMP paradigm increases programming complexity with little
        performance gain. A multithreaded implementation of CG on the
        Cray MTA does not require special ordering or partitioning to
        obtain high efficiency and scalability, giving it a distinct
        advantage for adaptive applications; however, it shows limited
        scalability for PCG due to a lack of thread-level parallelism."
}
@Article{SEMin2002,
       author   = "S. E. Minkoff",
       title    = "Spatial parallelism of a 3{D} finite difference velocity-stress elastic wave propagation code",
       journal  = "SIAM Journal on Scientific Computing",
       volume   = "24",
       number   = "1",
       pages    = "1--19",
       month    = AUG,
       year     = "2002",
       abstract = "In three-dimensional isotropic elastic earth, the wave equation
        solution consists of three velocity components and six stresses.
        We discretize the partial derivatives using second order in time
        and fourth order in space staggered finite difference operators.
        The parallel implementation uses the message passing interface
        library for platform portability and spatial decomposition for
        efficiency. Most of the communication in the code consists of
        passing subdomain face information to neighboring processors.
        When the parallel communication is balanced against computation
        by allocating subdomains of reasonable size, we observe excellent
        scaled speedup. Allocating subdomains of size 25 x 25 x 25 on
        each node, we achieve efficiencies of 94\% on 128 processors of
        an Intel Paragon."
}
@Article{JKlei2002,
       author   = "J. Kleinjung and N. Douglas and J. Heringa",
       title    = "Parallelized multiple alignment",
       journal  = "Bioinformatics",
       volume   = "18",
       number   = "9",
       pages    = "1270--1271",
       month    = SEP,
       year     = "2002",
       abstract = "Multiple sequence alignment is a frequently used technique for
        analyzing sequence relationships. Compilation of large
        alignments is computationally expensive, but processing time can
        be considerably reduced when the computational load is
        distributed over many processors. Parallel processing
        functionality in the form of single-instruction multiple-data
        (SIMD) technology was implemented into the multiple alignment
        program Praline by using 'message passing interface' (MPI)
        routines. Over the alignments tested here, the parallelized
        program performed up to ten times faster on 25 processors
        compared to the single processor version."
}
@Article{AAfsa2002,
       author   = "A. Afsahi and N. J. Dimopoulos",
       title    = "Efficient communication using message prediction for clusters of multiprocessors",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "14",
       number   = "10",
       pages    = "859--883",
       month    = AUG,
       year     = "2002",
       abstract = "With the increasing uniprocessor and symmetric multiprocessor
        computational power available today, interprocessor communication
        has become an important factor that limits the performance of
        clusters of workstations/multiprocessors. Many factors including
        communication hardware overhead, communication software
        overhead, and the user environment overhead (multithreading,
        multiuser) affect the performance of the communication
        subsystems in such systems. A significant portion of the
        software communication overhead belongs to a number of message
        copying operations. Ideally, it is desirable to have a true
        zero-copy protocol where the message is moved directly from the
        send buffer in its user space to the receive buffer in the
        destination without any intermediate buffering. However, due to
        the fact that message-passing applications at the send side do
        not know the final receive buffer addresses, early arrival
        messages have to be buffered at a temporary area. In this paper,
        we show that there is a message reception communication locality
        in message-passing applications. We have utilized this
        communication locality and devised different message predictors
        at the receiver sides of communications. In essence, these
        message predictors can be efficiently used to drain the network
        and cache the incoming messages even if the corresponding
        receive calls have not yet been posted. The performance of these
        predictors, in terms of hit ratio, on some parallel applications
        are quite promising and suggest that prediction has the
        potential to eliminate most of the remaining message copies. We
        also show that the proposed predictors do not have sensitivity
        to the starting message reception call, and that they perform
        better than (or at least equal to) our previously proposed
        predictors."
}
@article{TBohl2002,
  author   = {T. Bohlen},
  title    = {Parallel 3-{D} viscoelastic finite difference seismic modelling},
  journal  = {Computers \& Geosciences},
  year     = 2002,
  month    = oct,
  volume   = 28,
  number   = 8,
  pages    = {887--899},
  abstract = {Computational power has advanced to a state where we can begin
        to perform wavefield simulations for realistic (complex) 3-D
        earth models at frequencies of interest to both seismologists
        and engineers. On serial platforms however, 3-D calculations are
        still limited to small grid sizes and short seismic wave
        traveltimes. To make use of the efficiency of network computers
        a parallel 3-D viscoelastic finite difference (FD) code is
        implemented which allows to distribute the work on several PCs
        or workstations connected via standard ethernet in an in-house
        network. By using the portable message passing interface
        standard (MPI) for the communication between processors, running
        times can be reduced and grid sizes can be increased
        significantly. Furthermore, the code shows good performance on
        massive parallel supercomputers which makes the computation of
        very large grids feasible. This implementation greatly expands
        the applicability of the 3-D elastic/viscoelastic
        finite-difference modelling technique by providing an efficient,
        portable and practical C-program. }
}
@Article{GRLue2002,
       author   = "G. R. Luecke and Y. Zou and J. Coyle and J. Hoekstra and M. Kraeva",
       title    = "Deadlock detection in {MPI} programs",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "14",
       number   = "11",
       pages    = "911--932",
       month    = SEP,
       year     = "2002",
       abstract = "The Message-Passing Interface (MPI) is commonly used to write
        parallel programs for distributed memory parallel computers.
        MPI-CHECK is a tool developed to aid in the debugging of MPI
        programs that are written in free or fixed format Fortran 90 and
        Fortran 77. This paper presents the methods used in MPI-CHECK 2.0
        to detect many situations where actual and potential deadlocks
        occur when using blocking and non-blocking point-to-point
        routines as well as when using collective routines."
}
@Article{ARMRa2002,
       author   = "A. R. M. Rao",
       title    = "A parallel mixed time integration algorithm for nonlinear dynamic analysis",
       journal  = "Advances in Engineering Software",
       volume   = "33",
       number   = "5",
       pages    = "261--271",
       month    = MAY,
       year     = "2002",
       abstract = "This paper presents a parallel mixed time integration algorithm
        formulated by synthesising the implicit and explicit time
        integration techniques. The proposed algorithm is an extension
        of the mixed time integration algorithms [Comput. Meth. Appl.
        Mech. Engng 17/18 (1979) 259; Int. J. Numer. Meth. Engng 12
        (1978) 1575] being successfully employed for solving
        media-structure interaction problems. The parallel algorithm for
        nonlinear dynamic response of structures employing mixed time
        integration technique has been devised within the broad framework
        of domain decomposition. Concurrency is introduced into this
        algorithm, by integrating interface nodes with explicit time
        integration technique and later solving the local submeshes with
        implicit algorithm. A flexible parallel data structure has been
        devised to implement the parallel mixed time integration
        algorithm. Parallel finite element code has been developed using
        portable Message Passing Interface software development
        environment. Numerical studies have been conducted on
        PARAM-10000 (Indian parallel supercomputer) to test the accuracy
        and also the performance of the proposed algorithm. Numerical
        studies indicate that the proposed algorithm is highly adaptive
        for parallel processing."
}
@Article{PMieh2002,
       author   = "P. Miehe and A. Sandu and G. R. Carmichael and Y. H. Tang and D. Daescu",
       title    = "A communication library for the parallelization of air quality models on structured grids",
       journal  = "Atmospheric Environment",
       volume   = "36",
       number   = "24",
       pages    = "3917--3930",
       month    = AUG,
       year     = "2002",
       abstract = "PAQMSG is an MPI-based, Fortran 90 communication library for the
        parallelization of air quality models (AQMs) on structured
        grids. It consists of distribution, gathering and repartitioning
        routines for different domain decompositions implementing a
        master-worker strategy. The library is architecture and
        application independent and includes optimization strategies for
        different architectures. This paper presents the library from a
        user perspective. Results are shown from the parallelization of
        STEM-III on Beowulf clusters. The PAQMSG library is available on
        the web. The communication routines are easy to use, and should
        allow for an immediate parallelization of existing AQMs. PAQMSG
        can also be used for constructing new models."
}
@Article{YLian2002,
       author   = "Y. Li and S. M. Sze and T. S. Chao",
       title    = "A practical implementation of parallel dynamic load balancing for adaptive computing in {VLSI} device simulation",
       journal  = "Engineering with Computers",
       volume   = "18",
       number   = "2",
       pages    = "124--137",
       year     = "2002",
       abstract = "We present a new parallel semiconductor device simulation using
        the dynamic load balancing approach. This semiconductor device
        simulation based on the adaptive finite volume method with a
        posteriori error estimation has been developed and successfully
        implemented on a 16-PC Linux cluster with a message passing
        interface library. A constructive monotone iterative technique
        is also applied for solution of the system of nonlinear
        algebraic equations. Two different parallel versions of the
        algorithm to perform a complete device simulation are proposed.
        The first is a dynamic parallel domain decomposition approach,
        and the second is a parallel current-voltage characteristic
        points simulation. This implementation shows that a
        well-designed load balancing simulation can significantly reduce
        the execution time up to an order of magnitude. Compared with
        the measured data, numerical results on various submicron VLSI
        devices are presented, to show the accuracy and efficiency of
        the method."
}
@Article{WHLiu2002,
       author   = "W. H. Liu and C. L. Wang and V. K. Prasanna",
       title    = "Portable and scalable algorithm for irregular all-to-all communication",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "62",
       number   = "10",
       pages    = "1493--1526",
       month    = OCT,
       year     = "2002",
       abstract = "In irregular all-to-all communication, messages are exchanged
        between every pair of processors. The message sizes vary from
        processor to processor and are known only at run time. This is a
        fundamental communication primitive in parallelizing irregularly
        structured scientific computations. Our algorithm reduces the
        total number of message start-ups. It also reduces node
        contention by smoothing out the lengths of the messages
        communicated. As compared to the earlier approaches, our
        algorithm provides deterministic performance and also reduces
        the buffer space at the nodes during message passing. The
        performance of the algorithm is characterised using a simple
        communication model of high-performance computing (HPC)
        platforms. We show the implementation on T3D and SP2 using C and
        the message passing interface standard. These can be easily
        ported to other HPC platforms. The results show the
        effectiveness of the proposed technique as well as the interplay
        among the machine size, the variance in message length, and the
        network interface."
}
@Article{CRDow2002,
       author   = "C. R. Dow and J. S. Chen and M. C. Hsieh",
       title    = "Checkpointing {MPI} applications on symmetric multi-processor machines using {SMPCkpt}",
       journal  = "Journal of Systems and Software",
       volume   = "63",
       number   = "2",
       pages    = "137--150",
       month    = AUG,
       year     = "2002",
       abstract = "Researchers from many different areas have requirements for
        computational power to solve their specific problems. Symmetric
        multi-processor (SMP) machines are also widely available and
        their processing capacity is in demand particularly for
        applications in areas such as virtual reality and multimedia.
        Checkpointing provides the backbone for rollback recovery
        (fault-tolerance), playback debugging, process migration and job
        swapping. Numerous checkpointing tools have been designed and
        implemented but few are based on SMP machines for MPI
        applications. This work designs, develops, and implements
        SMPCkpt, a checkpointing system for symmetric multi-processor
        environments. SMPCkpt supports a range of facilities, including
        transparent checkpointing, fault detection, and rollback
        recovery. Two coordinated checkpointing algorithms, barrier and
        non-barrier, are developed and implemented in SMPCkpt that can be
        used to reduce the execution down time in the presence of
        failures."
}
@Article{GEFag2002,
       author   = "G. E. Fagg and J. J. Dongarra",
       title    = "{HARNESS} fault tolerant {MPI} design, usage and performance issues",
       journal  = "Future Generation Computer Systems",
       volume   = "18",
       number   = "8",
       pages    = "1127--1142",
       month    = OCT,
       year     = "2002",
       abstract = "Initial versions of MPI were designed to work efficiently on
        multi-processors which had very little job control and thus
        static process models. Subsequently forcing them to support a
        dynamic process model suitable for use on clusters or
        distributed systems would have reduced their performance. As
        current HPC collaborative applications increase in size and
        distribution the potential levels of node and network failures
        increase. This is especially true when MPI implementations are
        used as the communication media for GRID applications where the
        GRID architectures themselves are inherently unreliable thus
        requiring new fault tolerant MPI systems to be developed. Here
        we present a new implementation of MPI called FT-MPI that allows
        the semantics and associated modes of failures to be explicitly
        controlled by an application via a modified MPI API. Given is an
        overview of the FT-MPI semantics, design, example applications
        and some performance issues such as efficient group
        communications and complex data handling. Also briefly described
        is the HARNESS g-hcore system that handles low-level system
        operations on behalf of the MPI implementation. This includes
        details of plug-in services developed and their interaction with
        the FT-MPI runtime library."
}
@Article{SBolu2002,
       author   = "S. Boluriaan and P. J. Morris",
       title    = "Two-dimensional simulations of wake vortex detection using radio acoustic sounding systems",
       journal  = "AIAA Journal",
       volume   = "40",
       number   = "11",
       pages    = "2247--2256",
       month    = NOV,
       year     = "2002",
       abstract = "A parallel code is developed to simulate numerically wake vortex
        detection using a radio acoustic sounding system (RASS). The
        code is written in FORTRAN 90 with the message passing interface
        for parallel implementation. The numerical simulation solves
        simultaneously the linearized Euler equations for a nonuniform
        mean flow and the Maxwell equations for a nonhomogeneous medium.
        The radar transmitter and receiver antennas are modeled using an
        array of point sources and a beam-forming technique,
        respectively. Many features of the RASS are explored using the
        numerical simulation. First, a uniform mean flow is considered,
        and the RASS simulation is performed for two different types of
        incident acoustic field: a short single-frequency acoustic pulse
        and a continuous broadband acoustic source. Both monostatic and
        bistatic configurations are examined, and their results are
        compared. Taylor and Oseen vortex velocity profiles are used as
        sample models, and their mean flowfields are reconstructed from
        the backscattered electromagnetic signal using the Abel
        transform. The effect of radar beam width is also considered, as
        are the issues of nonaxisymmetric and interacting vortices."
}
@article{KEkic2002,
  author   = {K. Ekici and A. S. Lyrintzis},
  title    = {Parallelization of rotorcraft aerodynamics  {N}avier-{S}tokes codes},
  journal  = {AIAA Journal},
  year     = 2002,
  month    = may,
  volume   = 40,
  number   = 5,
  pages    = {887--896},
  abstract = {The modification of unsteady three-dimensional Navier-Stokes
        codes for application on massively parallel and distributed
        computing environments is investigated. Previously, the Euler
        mode of the Navier-Stokes code TURNS has been parallelized. For
        the efficient implementation of the Navier-Stokes mode of TURNS
        on parallel computing systems, several algorithmic changes
        should be made. The main modification is done on the implicit
        operator, lower-upper symmetric Gauss-Seidel. Two new implicit
        operators are used because of convergence problems of
        traditional operators with high cell aspect ratio grids needed
        for viscous calculations. Results for Navier-Stokes cases are
        presented for various operators. The message passing interface
        protocol is used because of its portability to various parallel
        architectures.}
}
@Article{PHave2002,
       author   = "P. Have",
       title    = "Easy{MSG}: {T}ools and techniques for an adaptive overlapping in {SPMD} programming",
       journal  = "Esaim-Mathematical Modelling and Numerical Analysis-Modelisation Mathematique Et Analyse Numerique",
       volume   = "36",
       number   = "5",
       pages    = "863--882",
       month    = SEP-OCT,
       year     = "2002",
       abstract = "During the development of a parallel solver for Maxwell
        equations by integral formulations and Fast Multipole Method
        (FMM), we needed to optimize a critical part including a lot of
        communications and computations. Generally, many parallel
        programs need to communicate, but choosing explicitly the way and
        the instant may decrease the efficiency of the overall program.
        So, the overlapping of computations and communications may be a
        way to reduce this drawback. We will see a implementation of
        this techniques using dynamic and adaptive overlapping based on
        the EasyMSG high level C++ library over MPI, a case of SPMD
        programming."
}
@Article{VESon2002,
       author   = "V. E. Sonzogni and A. M. Yommi and N. M. Nigro and M. A. Storti",
       title    = "A parallel finite element program on a {B}eowulf cluster",
       journal  = "Advances in Engineering Software",
       volume   = "33",
       number   = "7--10",
       pages    = "427--443",
       month    = JUL-OCT,
       year     = "2002",
       abstract = "Some experiences on writing a parallel finite element code on a
        Beowulf cluster are shown. This cluster is made up of seven
        Pentium III processors connected by Fast Ethernet. The code was
        written in C++ making use of MPI as message passing library and
        parallel extensible toolkit for scientific computations. The
        code presented here is a general framework where specific
        applications may be written. In particular CFD applications
        regarding Laplace equations, Navier-Stokes and shallow water
        flows have been implemented. The parallel performance of this
        application code is assessed and several numerical results are
        presented."
}
@Article{GEise2002,
       author   = "G. Eisenhauer and F. E. Bustamante and K. Schwan",
       title    = "Native data representation: {A}n efficient wire format for high-performance distributed computing",
       journal  = "IEEE Transactions on Parallel and Distributed Systems",
       volume   = "13",
       number   = "12",
       pages    = "1234--1246",
       month    = DEC,
       year     = "2002",
       abstract = "New trends in high-performance software development such as tool-
        and component-based approaches have increased the need for
        flexible and high-performance communication systems. When trying
        to reap the well-known benefits of these approaches, the
        question of what communication infrastructure should be used to
        link the various components arises. In this context, flexibility
        and high-performance seem to be incompatible goals. Traditional
        HPC-style communication libraries, such as MPI, offer good
        performance, but are not intended for loosely-coupled systems.
        Object- and metadata-based approaches like XML offer the needed
        plug-and-play flexibility, but with significantly lower
        performance. We observe that the flexibility and baseline
        performance of data exchange systems are strongly determined by
        their wire formats, or by how they represent data for
        transmission in heterogeneous environments. After examining the
        performance implications of using a number of different wire
        formats, we propose an alternative approach for flexible
        high-performance data exchange, Native Data Representation, and
        evaluate its current implementation in the Portable Binary I/O
        library."
}
@Article{MFeil2002,
       author   = "M. Feil and A. Uhl",
       title    = "Wavelet packet image decomposition on {MIMD} architectures",
       journal  = "Real-Time Imaging",
       volume   = "8",
       number   = "5",
       pages    = "399--412",
       month    = OCT,
       year     = "2002",
       abstract = "In this work, we describe and analyze algorithms for 2D wavelet
        packet (WP) decomposition for multicomputers and multiprocessors.
        In the case of multicomputers, the main goal is the
        generalization of former parallel WP algorithms which are
        constrained to a number of processor elements equal to a power
        of 4. For multiprocessors, we discuss several optimizations of
        shared-memory algorithms and finally we compare the results
        obtained on multicomputers and multi-processors employing the
        message passing (MPI) and shared-memory programming (OpenMP)
        paradigm, respectively."
}
@Article{MSala2002,
       author   = "M. Sala",
       title    = "An algebraic 2-level domain decomposition preconditioner with applications to the compressible {E}uler equations",
       journal  = "International Journal for Numerical Methods in Fluids",
       volume   = "40",
       number   = "12",
       pages    = "1551--1560",
       month    = DEC,
       year     = "2002",
       abstract = "Two possible schemes to introduce the coarse grid operator will
        be described. Both cases have been implemented and tested in a
        distributed parallel environment, using the MPI library. It will
        be shown that for suitable values of the rank of the coarse grid
        operator it is possible to obtain a considerable reduction in
        the number of iterations compared to the Schwarz preconditioner
        without coarse operator."
}
@Article{SBenk2003,
       author   = "S. Benkner and V. Sipkova",
       title    = "Exploiting distributed-memory and shared-memory parallelism on clusters of {SMP}s with data parallel programs",
       journal  = "International Journal of Parallel Programming",
       volume   = "31",
       number   = "1",
       pages    = "3--19",
       month    = FEB,
       year     = "2003",
       abstract = "Clusters of SMPs are hybrid-parallel architectures that combine
        the main concepts of distributed-memory and shared-memory
        parallel machines. Although SMP clusters are widely used in the
        high performance computing community, there exists no single
        programming paradigm that allows exploiting the hierarchical
        structure of these machines. Most parallel applications deployed
        on SMP clusters are based on MPI, the standard API for
        distributed-memory parallel programming, and thus may miss a
        number of optimization opportunities offered by the shared memory
        available within SMP nodes. In this paper we present extensions
        to the data parallel programming language HPF and associated
        compilation techniques for optimizing HPF programs on clusters
        of SMPs. The proposed extensions enable programmers to control
        key aspects of distributed-memory and shared-memory
        parallelization at a high-level of abstraction. Based on these
        language extensions, a compiler can adopt a hybrid
        parallelization strategy which closely reflects the hierarchical
        structure of SMP clusters by automatically exploiting
        shared-memory parallelism based on OpenMP within cluster nodes
        and distributed-memory parallelism utilizing MPI across nodes.
        We describe the implementation of these features in the VFC
        compiler and present experimental results which show the
        effectiveness of these techniques."
}
@Article{MSMul2002,
       author   = "M. S. Muller and E. Gabriel and M. M. Resch",
       title    = "A software development environment for {G}rid computing",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "14",
       number   = "13--15",
       pages    = "1543--1551",
       month    = NOV-DEC,
       year     = "2002",
       abstract = "Grid computing has become a popular concept in the last few
        years. While in the beginning the driving force was
        metacomputing, the focus has now shifted towards resource
        management issues and concepts like ubiquitous computing. For
        the High-Performance Computing Center Stuttgart (HLRS) the key
        challenges of Grid computing have come from the demands of its
        users and customers. With high-speed networks in place,
        programmers expect to be able to exploit the overall performance
        of several instruments and highspeed systems for their
        applications. In order to meet these demands, HLRS has set out a
        research effort to provide these users with the necessary tools
        to develop and run their codes on clusters of supercomputers.
        This has resulted in the development of a basic Grid-computing
        environment for technical and scientific computing. In this
        paper we describe the building blocks of this software
        development environment and focus specifically on communication
        and debugging. We present the Grid-enabled MPI implementation
        PACX-MPI and the MPI debugger MARMOT."
}
@Article{GMahi2002,
       author   = "G. Mahinthakumar and F. Saied",
       title    = "A hybrid {MPI}-{OpenMP} implementation of an implicit finite-element code on parallel architectures",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "16",
       number   = "4",
       pages    = "371--393",
       month    = WIN,
       year     = "2002",
       abstract = "The hybrid MPI-OpenMP model is a natural parallel programming
        paradigm for emerging parallel architectures that are based on
        symmetric multiprocessor (SMP) clusters. This paper presents a
        hybrid implementation adapted for an implicit finite-element
        code developed for groundwater transport simulations. The
        original code was parallelized for distributed memory
        architectures using MPI (Message Passing Interface) using a
        domain decomposition strategy. OpenMP directives were then added
        to the code (a straightforward loop-level implementation) to use
        multiple threads within each MPI process. To improve the OpenMP
        performance, several loop modifications were adopted. The
        parallel performance results are compared for four modern
        parallel architectures. The results show that for most of the
        cases tested, the pure MPI approach outperforms the hybrid
        model. The exceptions to this observation were mainly due to a
        limitation in the MPI library implementation on one of the
        architectures. A general conclusion is that while the hybrid
        model is a promising approach for SMP cluster architectures, at
        the time of this writing, the payoff may not be justified for
        converting all existing MPI codes to hybrid codes. However,
        improvements in OpenMP compilers combined with potential MPI
        limitations in SMP nodes may make the hybrid approach more
        attractive for a broader set of applications in the future."
}
@Article{DJMav2002,
       author   = "D. J. Mavriplis",
       title    = "Parallel performance investigations of an unstructured mesh {Navier}-{Stokes} solver",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "16",
       number   = "4",
       pages    = "395--407",
       month    = WIN,
       year     = "2002",
       abstract = "The implementation and performance of a hybrid OpenMP/MPI
        parallel communication strategy for an unstructured mesh
        computational fluid dynamics code is described. The solver is
        cache efficient and fully vectorizable, and is parallelized
        using a two-level hybrid MPI-OpenMP implementation suitable
        for shared and/or distributed memory architectures, as well as
        clusters of shared memory machines. Parallelism is obtained
        through domain decomposition for both communication models.
        Single processor computational rates as well as scalability
        curves are given on various architectures. For the architectures
        studied in this work, the OpenMP or hybrid OpenMP/MPI
        communication strategies achieved no appreciable performance
        benefit over an exclusive MPI communication strategy."
}
@Article{IAhma2003,
       author   = "I. Ahmad",
       title    = "{HARD}: {A} hypercube embedding algorithm for state assignment of finite state machines",
       journal  = "Computers \& Electrical Engineering",
       volume   = "29",
       number   = "2",
       pages    = "327--356",
       month    = MAR,
       year     = "2003",
       abstract = "To minimize the area of the combinational circuit, required to
        realize a finite state machine (FSM), an efficient assignment of
        states of the FSM to a set of binary codes is required. As to
        find an optimal state assignment is NP-hard, therefore heuristic
        approaches have been taken. One approach generates an adjacency
        graph from the FSM model and then tries to embed the adjacency
        graph onto a hypercube with an objective to minimize the cost of
        mapping. However, hypercube embedding itself is an NP-complete
        problem. In this paper we present a solution to the hypercube
        embedding problem by designing a new technique, designated as
        HARD, that is a hybrid combination of non-linear programming
        method and a local search. We have transformed our problem from
        discrete space to continuous space and have applied logarithmic
        barrier function method, that in turn uses gradient projection
        approach to minimize the objective function. Each iteration of
        the gradient projection method produces a valid solution. Local
        search is performed around solution to improve its quality by
        using a Kernighan-Lin style algorithm. Two distributed
        algorithms for the HARD, have also been designed and implemented
        on network of workstations under message passing interface, to
        speed up the search. We have carried out a large number of
        experiments to determine the efficiency of the HARD in terms of
        solution quality over many other techniques, and have obtained
        very promising results."
}
@Article{AJGar2003,
       author   = "A. J. Garcia-Loureiro and J. M. Lopez-Gonzalez and T. F. Pena",
       title    = "A parallel {3D} semiconductor device simulator for gradual heterojunction bipolar transistors",
       journal  = "International Journal of Numerical Modelling-Electronic Networks Devices and Fields",
       volume   = "16",
       number   = "1",
       pages    = "53--66",
       month    = JAN-FEB,
       year     = "2003",
       abstract = "In this paper, we present a parallel three-dimensional
        semiconductor device simulator for gradual heterojunction bipolar
        transistor. This simulator uses the drift-diffusion transport
        model. The Poisson equation and continuity equations were
        discretized using a finite element method (FEM) on an
        unstructured tetrahedral mesh. Fermi-Dirac statistics is
        considered in our model and a compact formulation is used that
        makes it easy to take into account other effects such as the
        non-parabolic nature of the bands or the presence of various
        subbands in the conduction process. Domain decomposition methods
        were tested to solve the linear systems. We have applied this
        simulator to a gradual heterojunction bipolar transistor (HBT),
        and we present some measures of the parallel execution time for
        several solvers and some electrical results. This code has been
        implemented for distributed memory multicomputers, making use of
        the MPI message passing standard library and a parallel solver
        library."
}
@Article{HZSha2003,
       author   = "H. Z. Shan and J. P. Singh and L. Oliker and R. Biswas",
       title    = "Message passing and shared address space parallelism on an {SMP} cluster",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "2",
       pages    = "167--186",
       month    = FEB,
       year     = "2003",
       abstract = "Currently, message passing (MP) and shared address space (SAS)
        are the two leading parallel programming paradigms. MP has been
        standardized with MPI, and is the more common and mature
        approach; however, code development can be extremely difficult,
        especially for irregularly structured computations. SAS offers
        substantial ease of programming, but may suffer from
        performance limitations due to poor spatial locality and high
        protocol overhead. In this paper, we compare the performance of
        and the programming effort required for six applications under
        both programming models on a 32-processor PC-SMP cluster, a
        platform that is becoming increasingly attractive for
        high-end scientific computing. Our application suite consists of
        codes that typically do not exhibit scalable performance under
        shared-memory programming due to their high
        communication-to-computation ratios and/or complex communication
        patterns. Results indicate that SAS can achieve about half the
        parallel efficiency of MPI for most of our applications, while
        being competitive for the others. A hybrid MPI + SAS strategy
        shows only a small performance advantage over pure MPI in some
        cases. Finally, improved implementations of two MPI collective
        operations on PC-SMP clusters are presented."
}
@Article{DBLei2003,
       author   = "D. B. Leineweber and A. Schafer and H. G. Bock and J. P. Schloder",
       title    = "An efficient multiple shooting based reduced {SQP} strategy for large-scale dynamic process optimization - {Part} {II}: {Software} aspects and applications",
       journal  = "Computers \& Chemical Engineering",
       volume   = "27",
       number   = "2",
       pages    = "167--174",
       month    = FEB,
       year     = "2003",
       abstract = "As model based optimization techniques play a more and more
        important role in the chemical process industries, there is a
        great demand for ever more efficient and reliable process
        optimization software. In the first part of this paper, the
        theoretical aspects of a tailored multiple shooting based
        solution strategy for dynamic process optimization have been
        presented (Leineweber, Bauer, Bock \& Schloder, 2002. An
        efficient multiple shooting based reduced SQP strategy for
        large-scale dynamic process optimization-part 1: theoretical
        aspects). The current second part describes software aspects of
        the specific implementation MUSCOD-II and provides numerical
        results for several application examples. MUSCOD-II has been
        coupled with the dynamic process modeling software gPROMS via
        the standard equation set object (ESO) interface of CAPE-OPEN.
        Thereby, an advanced dynamic optimization platform for integrated
        batch processes has been created, where each process stage is
        separately modeled in gPROMS, and the multistage dynamic
        optimization problem is assembled and solved with MUSCOD-II. The
        code has also been parallelized, based on the portable MPI
        standard. It is shown that the use of directional sensitivities
        becomes very important for larger problems with many algebraic
        variables, leading to drastically reduced computing times
        compared with strategies with complete constraint linearization.
        In addition, gPROMS ESO models are compared with classical
        Fortran models in terms of computational performance, and it is
        found that only a moderate loss of performance occurs if
        so-called in-process ESOs are employed. Finally, it is
        demonstrated that a significant speed-up can be obtained through
        parallel function and gradient evaluations."
}
@Article{GLeuc2003,
       author   = "G. Luecke and H. Chen and J. Coyle and J. Hoekstra and M. Kraeva and Y. Zou",
       title    = "{MPI-CHECK}: a tool for checking {Fortran} 90 {MPI} programs",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "2",
       pages    = "93--100",
       month    = FEB,
       year     = "2003",
       abstract = "MPI is commonly used to write parallel programs for distributed
        memory parallel computers. MPI-CHECK is a tool developed to aid
        in the debugging of MPI programs that are written in free or
        fixed format Fortran 90 and Fortran 77. MPI-CHECK provides
        automatic compile-time and run-time checking of MPI programs.
        MPI-CHECK automatically detects the following problems in the
        use of MPI routines: (i) mismatch in argument type, kind, rank or
        number; (ii) messages which exceed the bounds of the
        source/destination array; (iii) negative message lengths; (iv)
        illegal MPI calls before MPI\_INIT or after MPI\_FINALIZE; (v)
        inconsistencies between the declared type of a message and its
        associated DATATYPE argument; and (vi) actual arguments which
        violate the INTENT attribute."
}
@Article{ILirk2003,
       author   = "I. Lirkov",
       title    = "{MPI} solver for {3D} elasticity problems",
       journal  = "Mathematics and Computers in Simulation",
       volume   = "61",
       number   = "3--6",
       pages    = "509--516",
       month    = JAN,
       year     = "2003",
       abstract = "A portable MPI parallel FEM code is developed. Numerical tests
        for real-life engineering problems of the geomechanics in
        geosciences on a number of modern parallel computers are
        presented. The reported speed-up and parallel efficiency well
        illustrate the parallel features of the proposed method and its
        implementation."
}
@Article{MGole2003,
       author   = "M. Golebiewski and H. Ritzdorf and J. L. Traff and F. Zimmermann",
       title    = "The {MPI}/{SX} implementation of {MPI} for {NEC}'s {SX}-6 and other {NEC} platforms",
       journal  = "NEC Research \& Development",
       volume   = "44",
       number   = "1",
       pages    = "69--74",
       month    = JAN,
       year     = "2003",
       abstract = "MPI is the standard communication interface for programming
        parallel applications in the message passing paradigm. MPI/SX is
        a dedicated, efficient and highly optimized implementation of
        the full MPI-2 standard for the NEC SX-series of parallel vector
        supercomputers. MPI/SX is also the basis for implementations of
        MPI for other NEC parallel platforms, for instance MPI/EX for
        AzusA and AsAmA, and for the Earth Simulator. This paper gives
        an overview of the key features and recent developments of
        MPI/SX. Among these are: highly optimized point-to-point and
        one-sided communications both within a single, shared-memory
        node and across nodes; optimized collective operations;
        efficient, vectorized handling of non-contiguous user data; and
        a non-trivial implementation of the MPI topology functionality.
        Although particular attention has been paid to efficient
        utilization of the vector-capabilities of the SX-machines, the
        architecture and optimizations of MPI/SX are immediately
        applicable to other NEC architectures."
}
@Article{HUeha2003,
       author   = "H. Uehara and M. Tamura and M. Yokokawa",
       title    = "{MPI} performance measurement on the {Earth} {Simulator}",
       journal  = "NEC Research \& Development",
       volume   = "44",
       number   = "1",
       pages    = "75--79",
       month    = JAN,
       year     = "2003",
       abstract = "MPI (Message Passing Interface) performance on the Earth
        Simulator is presented. Performance of MPI\_Send, MPI\_Barrier,
        MPI RMA functions, and some programs of the exchange pattern
        have been evaluated on the Earth Simulator using the MPI
        benchmark program library. Regarding MPI performance on the
        Earth Simulator, the maximum throughput of the intranode
        communication is 14.8GB/s, and that of the internode
        communication is 11.8GB/s. The cost of MPI\_Barrier call on the
        condition that the number of MPI-processes per PN is 1 is about
        3.3 microseconds, and MPI\_Barrier has excellent scalability. It
        has been also confirmed that programming using MPI RMA functions
        is suitable to program complicated communication patterns such
        as the exchange pattern."
}
@Article{WOhfu2003,
       author   = "W. Ohfuchi and S. Shingu and H. Fuchigami and M. Yamada",
       title    = "Dependence of the parallel performance of the atmospheric general circulation model for the {Earth} {Simulator} on problem size",
       journal  = "NEC Research \& Development",
       volume   = "44",
       number   = "1",
       pages    = "99--103",
       month    = JAN,
       year     = "2003",
       abstract = "An atmospheric general circulation model (AGCM), named AFES, was
        extensively optimized for the Earth Simulator (ES), and achieved
        sustained performance of 26.58TFLOPS or 65\% of the peak
        performance with the full configuration of the ES under more or
        less ideal conditions. The sensitivity of AFES's parallel
        performance on problem size is measured under more practical
        conditions in this study. Even though the amount of
        computational operation of Legendre transform increases as
        $O(M^3)$, where M is the truncated wavenumber, while that of
        physical parameterization does as $O(M^2)$, the M dependence of
        the computational cost does not behave as expected from the
        amount of computational operation due to the vector efficiency.
        Some aspects of the cost of communication are also discussed. The
        results suggest the following. 1) AFES has been developed mainly
        for super-high resolution, and its coding is very effective only
        at high resolution. It may be difficult to make AFES very
        efficient at any, especially low, resolution. 2) In order to
        maintain high parallel efficiency on the ES, it is essential to
        employ MPI coding that keeps message size sufficiently large for
        efficient utilization of the ES's communication ability, and to
        keep vector length sufficiently large for efficient vector
        processing."
}
@Article{PBode2003,
       author   = "P. Bode and J. P. Ostriker",
       title    = "Tree particle-mesh: {An} adaptive, efficient, and parallel code for collisionless cosmological simulation",
       journal  = "Astrophysical Journal Supplement Series",
       volume   = "145",
       number   = "1",
       pages    = "1--13",
       month    = MAR,
       year     = "2003",
       abstract = "An improved implementation of an N-body code for simulating
        collisionless cosmological dynamics is presented. TPM (tree
        particle-mesh) combines the PM method on large scales with a tree
        code to handle particle-particle interactions at small
        separations. After the global PM forces are calculated,
        spatially distinct regions above a given density contrast are
        located; the tree code calculates the gravitational interactions
        inside these denser objects at higher spatial and temporal
        resolution. The new implementation includes individual particle
        time steps within trees, an improved treatment of tidal forces
        on trees, new criteria for higher force resolution and choice of
        time step, and parallel treatment of large trees. TPM is
        compared to P$^3$M and a tree code (GADGET) and is found to
        give equivalent results in significantly less time. The
        implementation is highly portable (requiring a FORTRAN compiler
        and MPI) and efficient on parallel machines. The source code can
        be found on the World Wide Web."
}
@Article{JPSch2003,
       author   = "J. P. Schulze and U. Lang",
       title    = "The parallelized perspective shear-warp algorithm for volume rendering",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "3",
       pages    = "339--354",
       month    = MAR,
       year     = "2003",
       abstract = "In this paper, we present a new parallelized version of the
        perspective shear-warp algorithm. The parallelized algorithm was
        designed for distributed memory machines using MPI. The new
        algorithm takes advantage of the idea that the warp can be done
        in most computers' graphics hardware very fast, so that the
        remote parallel computer only needs to do the compositing. Our
        algorithm uses this idea to do the compositing on the remote
        machine, which transfers the resulting 2D intermediate image to
        the display machine. Even though the display machine can be a
        mid range PC or laptop computer, it can be used to display
        complex volumetric data, provided there is a network connection
        to a high performance parallel computer. Furthermore, remote
        rendering could be used to drive virtual environments, which
        typically require perspective projection and high frame rates
        for stereo projection and multiple screens."
}
@Article{LChen2003,
       author   = "L. Chen and I. Fujishiro and K. Nakajima",
       title    = "Optimizing parallel performance of unstructured volume rendering for the {Earth} {Simulator}",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "3",
       pages    = "355--371",
       month    = MAR,
       year     = "2003",
       abstract = "A scalable and high-performance parallel visualization subsystem
        has been developed in GeoFEM for the Earth Simulator (ES). As
        part of the ES project in Japan, the proposed subsystem is
        effective for the visualization of huge-scale unstructured
        datasets, and can be concurrent with computation on the same
        high-performance parallel computer. Moreover, some parallel
        visualization modules have obtained a good parallel performance,
        covering scalar, vector and tensor fields. This paper will take
        volume rendering method as an example to describe a number of
        efficient parallel performance optimization strategies we adopted
        for large-scale unstructured data visualization on SMP cluster
        machines, including suitable design of visualization method,
        the three-level hybrid parallelization which means message
        passing for inter-SMP node communication, loop directives for
        intra-SMP node parallelization, and vectorization for each
        processing element, plus dynamic load balancing. Good
        visualization images and high parallel performance have been
        achieved on the ES, thus demonstrating the feasibility and
        effectiveness of the proposed method."
}
@Article{EAJoh2003,
       author   = "E. A. Johnson and C. Proppe and B. F. Spencer and L. A. Bergman and G. S. Szekely and G. I. Schueller",
       title    = "Parallel processing in computational stochastic dynamics",
       journal  = "Probabilistic Engineering Mechanics",
       volume   = "18",
       number   = "1",
       pages    = "37--60",
       month    = JAN,
       year     = "2003",
       abstract = "Studying large complex problems that often arise in
        computational stochastic dynamics (CSD) demands significant
        computer power and data storage. Parallel processing can help
        meet these requirements by exploiting the computational and
        storage capabilities of multiprocessing computational
        environments. The challenge is to develop parallel algorithms
        and computational strategies that can take full advantage of
        parallel machines. This paper reviews some of the
        characteristics of parallel computing and the techniques used
        to parallelize computational algorithms in CSD. The
        characteristics of parallel processor environments are
        discussed, including parallelization through the use of message
        passing and parallelizing compilers. Several applications of
        parallel processing in CSD are then developed: solutions of the
        Fokker-Planck equation, Monte Carlo simulation of dynamical
        systems, and random eigenvector problems. In these examples,
        parallel processing is seen to be a promising approach through
        which to resolve some of the computational issues pertinent to
        CSD."
}
@Article{ANeli2003,
       author   = "A. Nelisse and J. Maassen and T. Kielmann and H. E. Bal",
       title    = "{CCJ}: object-based message passing and collective communication in {Java}",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "3--5",
       pages    = "341--369",
       month    = MAR-APR,
       year     = "2003",
       abstract = "CCJ is a communication library that adds MPI-like message
        passing and collective operations to Java. Rather than trying to
        adhere to the precise MPI syntax, CCJ aims at a clean
        integration of communication into Java's object-oriented
        framework. For example, CCJ uses thread groups to support Java's
        multithreading model and it allows any data structure (not just
        arrays) to be communicated. CCJ is implemented entirely in Java,
        on top of RMI, so it can be used with any Java virtual machine.
        The paper discusses three parallel Java applications that use
        collective communication. It compares the performance (on top of
        a Myrinet cluster) of CCJ, RMI and mpiJava versions of these
        applications and also compares their code complexity. A detailed
        performance comparison between CCJ and mpiJava is given using
        the Java Grande Forum MPJ benchmark suite. The results show that
        neither CCJ's object-oriented design nor its implementation on
        top of RMI impose a performance penalty on applications compared
        to their mpiJava counterparts. The source of CCJ is available
        from our Web site http://www.cs.vu.nl/manta."
}
@Article{LTang2003,
       author   = "L. Tang and R. E. Bartels and P. C. Chen and D. D. Liu",
       title    = "Numerical investigation of transonic limit cycle oscillations of a two-dimensional supercritical wing",
       journal  = "Journal of Fluids and Structures",
       volume   = "17",
       number   = "1",
       pages    = "29--41",
       month    = JAN,
       year     = "2003",
       abstract = "CFD-based aeroelastic computations are performed to investigate
        the effect of nonlinear aerodynamics on transonic limit cycle
        oscillation (LCO) characteristics of the NLR7301 airfoil section.
        It is found that the LCO solutions from Navier-Stokes
        computations deviate less from the experiment than an Euler
        solution but strongly depend on the employed turbulence model.
        The Degani-Schiff modification to the Baldwin-Lomax turbulence
        model provokes spurious vorticity spots causing multiple shocks
        which might be unphysical, while the Spalart-Allmaras turbulence
        model yields physically reasonable unsteady shocks. In the cases
        examined, smaller initial perturbations lead to larger LCO
        amplitudes and vice versa, in contradiction to what one might
        expect. The amplitude of the initial perturbation is also found
        to have an impact on the mean position of LCO. Also addressed in
        the paper are aspects of multiblock message passing interface
        (MPI) parallel computation techniques as related to the present
        problem."
}

@Article{JHGuo2003,
       author   = "J. H. Guo and T. R. Taha",
       title    = "Parallel implementation of the split-step and the pseudospectral methods for solving higher {KdV} equation",
       journal  = "Mathematics and Computers in Simulation",
       volume   = "62",
       number   = "1--2",
       pages    = "41--51",
       month    = FEB,
       year     = "2003",
       abstract = "Numerical simulations show that higher order KdV equation under
        certain conditions has a self-focusing singularity, which means
        that the solution of the equation blows up in finite time. In
        this paper, two numerical schemes: the split-step Fourier
        transform and the pseudospectral methods are used to investigate
        this self-focusing singularity problem. Parallel algorithms for
        the proposed schemes are designed and implemented. FFTW-MPI
        algorithm designed by Matteo Frigo and Steven Johnson is used
        for parallel implementation of the discrete Fourier transform
        (DFT). The parallel algorithms are implemented on an SGI Origin
        2000 multiprocessor computer and experiments show that a
        considerable speedup is attained."
}

@Article{DBore2003,
       author   = "D. Borello and A. Corsini and F. Rispoli",
       title    = "A finite element overlapping scheme for turbomachinery flows on parallel platforms",
       journal  = "Computers \& Fluids",
       volume   = "32",
       number   = "7",
       pages    = "1017--1047",
       month    = AUG,
       year     = "2003",
       abstract = "Two- and three-dimensional turbomachinery flows in stationary
        and rotating compressor cascades are studied by using a
        one-level inexact explicit Schwarz method, and a cubic eddy
        viscosity turbulence closure. The message passing paradigm is
        used for the parallel implementation of the domain decomposition
        algorithm, allowing the solver portability on different parallel
        platforms. A convergence accelerator is proposed, based on a
        condensed cycle structure that merges the additive Schwarz
        iterations with the fixed point non-linear ones. The use of a
        stable finite element formulation on higher-order elements Q2-Q1
        is addressed as a mean for retaining non-oscillatory and
        accurate solutions. Furthermore, the elementwise quadratic
        approximation is used to enable the exact implementation of
        higher-order integrals arising in the anisotropic turbulence
        closure adopted. Numerical campaigns are carried out on IBM SP2
        and SP3, and CRAY T3E architectures, in order to demonstrate the
        portability. The accompanying performance improvement is
        assessed. Finally, the predicting capabilities are discussed
        with reference to challenging turbomachinery test cases: a
        transitional linear compressor cascade, and an isolated
        compressor rotor designed for non-free vortex operation.
        Convergence speed-up in such configurations is discussed."
}
@Article{TRabc2003,
       author   = "T. Rabczuk and J. Eibl",
       title    = "Simulation of high velocity concrete fragmentation using {SPH}/{MLSPH}",
       journal  = "International Journal for Numerical Methods in Engineering",
       volume   = "56",
       number   = "10",
       pages    = "1421--1444",
       month    = MAR,
       year     = "2003",
       abstract = "The simulation of concrete fragmentation under explosive loading
        by a meshfree Lagrangian method, the smooth particle
        hydrodynamics method (SPH) is described. Two improvements
        regarding the completeness of the SPH-method are examined, first
        a normalization developed by Johnson and Beissel (NSPH)
        and second a moving least square (MLS) approach as modified by
        Scheffer (MLSPH). The SPH-Code is implemented in FORTRAN 90 and
        parallelized with MPI. A macroscopic constitutive law with
        isotropic damage for fracture and fragmentation for concrete is
        implemented in the SPH-Code. It is shown that the SPH-method is
        able to simulate the fracture and fragmentation of concrete
        slabs under contact detonation. The numerical results from the
        different SPH-methods are compared with the data from tests. The
        good agreement between calculation and experiment suggests that
        the SPH-program can predict the correct maximum pressure as well
        as the damage of the concrete slabs. Finally the fragment
        distributions of the tests and the numerical calculations are
        compared."
}
@Article{ACris2003,
       author   = "A. Cristobal-Salas and A. Tchernykh and J. L. Gaudiot and W. Y. Lin",
       title    = "Non-strict execution in parallel and distributed computing",
       journal  = "International Journal of Parallel Programming",
       volume   = "31",
       number   = "2",
       pages    = "77--105",
       month    = APR,
       year     = "2003",
       abstract = "This paper surveys and demonstrates the power of non-strict
        evaluation in applications executed on distributed
        architectures. We present the design, implementation, and
        experimental evaluation of single assignment, incomplete data
        structures in a distributed memory architecture and Abstract
        Network Machine (ANM). Incremental Structures (IS), Incremental
        Structure Software Cache (ISSC), and Dynamic Incremental
        Structures (DIS) provide non-strict data access and fully
        asynchronous operations that make them highly suited for the
        exploitation of fine-grain parallelism in distributed memory
        systems. We focus on split-phase memory operations and
        non-strict information processing under a distributed address
        space to improve the overall system performance. A novel
        technique of optimization at the communication level is proposed
        and described. We use partial evaluation of local and remote
        memory accesses not only to remove much of the excess overhead
        of message passing, but also to reduce the number of messages
        when some information about the input or part of the input is
        known. We show that split-phase transactions of IS, together
        with the ability of deferring reads, allow partial evaluation of
        distributed programs without losing determinacy. Our
        experimental evaluation indicates that commodity PC clusters
        with both IS and a caching mechanism, ISSC, are more robust. The
        system can deliver speedup for both regular and irregular
        applications. We also show that partial evaluation of memory
        accesses decreases the traffic in the interconnection network
        and improves the performance of MPI IS and MPI ISSC applications."
}
@Article{VNAle2003,
       author   = "V. N. Alexandrov and I. T. Dimov and A. Karaivanova and C. J. K. Tan",
       title    = "Parallel {M}onte {C}arlo algorithms for information retrieval",
       journal  = "Mathematics and Computers in Simulation",
       volume   = "62",
       number   = "3--6",
       pages    = "289--295",
       month    = MAR,
       year     = "2003",
       abstract = "The algorithms are running on a cluster of workstations under
        MPI and results of the experiments arising in textual retrieval
        of Web documents as well as comparison of the stochastic methods
        proposed are presented."
}
@Article{YMLia2003,
       author   = "Y. M. Li and H. M. Lu and T. W. Tang and S. M. Sze",
       title    = "A novel parallel adaptive {M}onte {C}arlo method for nonlinear {P}oisson equation in semiconductor devices",
       journal  = "Mathematics and Computers in Simulation",
       volume   = "62",
       number   = "3--6",
       pages    = "413--420",
       month    = MAR,
       year     = "2003",
       abstract = "We present a parallel adaptive Monte Carlo (MC) algorithm for
        the numerical solution of the nonlinear Poisson equation in
        semiconductor devices. Based on a fixed random walk MC method,
        1-irregular unstructured mesh technique, monotone iterative
        method, a posterior error estimation method, and dynamic domain
        decomposition algorithm, this approach is developed and
        successfully implemented on a 16-processors (16-PCs)
        Linux-cluster with message-passing interface (MPI) library. To
        solve the nonlinear problem with MC method, monotone iterative
        method is applied in each adaptive loop to obtain the final
        convergent solution. This approach fully exploits the inherent
        parallelism of the monotone iterative as well as MC methods.
        Numerical results for p-n diode and MOSFET devices are
        demonstrated to show the robustness of the method. Furthermore,
        achieved parallel speedup and related parallel performances are
        also reported in this work."
}
@Article{SCast2003,
       author   = "S. Castellaro and F. Mulargia",
       title    = "Implementing cellular automata models for earthquakes on parallel computers",
       journal  = "Geophysical Research Letters",
       volume   = "30",
       number   = "5",
       pages    = "1204",
       month    = MAR,
       year     = "2003",
       abstract = "Cellular automata models require simulations on both large
        grids, to avoid border effects, and on a large number of
        realizations, to study the system properties under stationarity.
        Implementing the cellular automata codes on parallel computers
        would appear as an ideal solution. Unfortunately, the cellular
        automata models which are appropriate for earthquakes can only
        be partially parallelized because they have an intrinsically
        sequential component. Under extensive modeling using MPI on CRAY
        T3 and Origin 8300 supercomputers we show that a substantial
        speed-up can nevertheless be achieved by coarsening the system
        and making a few mild assumptions on the logical flow of the
        interactions among macrocells."
}
@Article{RBrig2003,
       author   = "R. Brightwell and R. Riesen and A. B. Maccabe",
       title    = "Design, implementation, and performance of {MPI} on {P}ortals 3.0",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "17",
       number   = "1",
       pages    = "7--20",
       month    = SPR,
       year     = "2003",
       abstract = "This paper describes an implementation of the Message Passing
        Interface (MPI) on the Portals 3.0 data movement layer. Portals
        3.0 provides low-level building blocks that are flexible enough
        to support higher-level message passing layers, such as MPI,
        very efficiently. Portals 3.0 is also designed to allow for
        programmable network interface cards to offload message
        processing from the host processor, allowing for the ability to
        overlap computation and MPI communication. We describe the basic
        building blocks in Portals 3.0, show how they can be put
        together to implement MPI, and describe the protocols of our MPI
        implementation. We look at several key operations within the
        implementation and describe the effects that a Portals 3.0
        implementation has on scalability and performance. We also
        present preliminary performance results from our implementation
        for Myrinet."
}
@Article{FGarc2003,
       author   = "F. Garcia-Carballeira and A. Calderon and J. Carretero and J. Fernandez and J. M. Perez",
       title    = "The design of the {E}xpand parallel file system",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "17",
       number   = "1",
       pages    = "21--37",
       month    = SPR,
       year     = "2003",
       abstract = "This article describes an implementation of MPI-IO using a new
        parallel file system, called Expand (Expandable Parallel File
        System), which is based on NFS servers. Expand combines multiple
        NFS servers to create a distributed partition where files are
        striped. Expand requires no changes to the NFS server and uses
        RPC operations to provide parallel access to the same file.
        Expand is also independent of the clients, because all
        operations are implemented using RPC and NFS protocols. Using
        this system, we can join heterogeneous servers (Linux, Solaris,
        Windows 2000, etc.) to provide a parallel and distributed
        partition. The article describes the design, implementation and
        evaluation of Expand with MPI-IO. This evaluation has been made
        in Linux clusters and compares Expand and PVFS."
}
@Article{RRabe2003,
       author   = "R. Rabenseifner and G. Wellein",
       title    = "Communication and optimization aspects of parallel programming models on hybrid architectures",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "17",
       number   = "1",
       pages    = "49--62",
       month    = SPR,
       year     = "2003",
       abstract = "Most HPC systems are clusters of shared memory nodes. Parallel
        programming must combine the distributed memory parallelization
        on the node interconnect with the shared memory parallelization
        inside each node. The hybrid MPI+OpenMP programming model is
        compared with pure MPI, compiler based parallelization, and
        other parallel programming models on hybrid architectures. The
        paper focuses on bandwidth and latency aspects, and also on
        whether programming paradigms can separate the optimization of
        communication and computation. Benchmark results are presented
        for hybrid and pure MPI communication. This paper analyzes the
        strengths and weaknesses of several parallel programming models
        on clusters of SMP nodes."
}
@Article{SLafl2003,
       author   = "S. Laflamme and J. Dompierre and F. Guibault and R. Roy",
       title    = "Applying {PARMETIS} to structured remeshing for industrial {CFD} applications",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "17",
       number   = "1",
       pages    = "63--76",
       month    = SPR,
       year     = "2003",
       abstract = "This paper presents the current strategy used in IP-OORT an
        ongoing project to extend the application domain of a C++ toolkit
        library for iterative mesh adaptation. OORT is a class library
        for sequential structured, unstructured and hybrid mesh
        adaptation used mainly in the context of CFD computations, that
        performs iterative mesh refinement, coarsening and smoothing in
        3D. Extensions to parallelize mesh adaptation using PARMETIS for
        domain decomposition and MPI high-level communication schemes
        are investigated here. Numerical simulations on realistic cases
        show that the parallel strategy scales with problem size and the
        number of processors, but singular behaviors are sometimes
        encountered at subdomain interfaces when conflicting
        instructions collide."
}
@Article{MPern2003,
       author   = "M. Pernpointner and L. Visscher",
       title    = "Parallelization of four-component calculations. {II}. {S}ymmetry-driven parallelization of the 4-spinor {CCSD} algorithm",
       journal  = "Journal of Computational Chemistry",
       volume   = "24",
       number   = "6",
       pages    = "754--759",
       month    = APR,
       year     = "2003",
       abstract = "Given the importance of the Coupled-cluster (CC) method as an
        efficient and accurate way to take electron correlation into
        account, we extend the parallelization technique in the second
        part of this series also to the 4-Spinor CCSD algorithm
        implemented in the Dirac-Fock packages DIRAC and MOLFDIR. The
        present implementation is based on the availability of the
        transformed molecular two-electron integrals on an external
        storage medium. The linearity of the CC equations in these
        two-electron integrals is used in a parallelization strategy
        that is based on distribution of the two largest integral classes
        that carry three or four virtual spinor indices. The
        corresponding partial contributions to the T-1 and T-2 amplitudes
        are calculated on each node and added using Message Passing
        Interface (MPI) library calls. Although we did not employ a
        master/slave principle, one specific node was assigned to also
        perform the remaining serial parts of the algorithm. In the
        critical sections considerable savings in storage requirements
        and computer time could be achieved, and this allows for
        computations on larger systems in the framework of
        four-component theory."
}
@Article{YPana2003,
       author   = "Y. Pan and J. J. S. Shang and M. Guo",
       title    = "A scalable {HPF} implementation of a finite-volume computational electromagnetics application on a {CRAY} {T}3{E} parallel system",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "6",
       pages    = "607--621",
       month    = MAY,
       year     = "2003",
       abstract = "The time-dependent Maxwell equations are one of the most
        important approaches to describing dynamic or wide-band
        frequency electromagnetic phenomena. A sequential finite-volume,
        characteristic-based procedure for solving the time-dependent,
        three-dimensional Maxwell equations has been successfully
        implemented in Fortran before. Due to its need for a large
        memory space and high demand on CPU time, it is impossible to
        test the code for a large array. Hence, it is essential to
        implement the code on a parallel computing system. In this
        paper, we discuss an efficient and scalable parallelization
        of the sequential Fortran time-dependent Maxwell equations solver
        using High Performance Fortran (HPF). The background to the
        project, the theory behind the efficiency being achieved, the
        parallelization methodologies employed and the experimental
        results obtained on the Cray T3E massively parallel computing
        system will be described in detail. Experimental runs show that
        the execution time is reduced drastically through parallel
        computing. The code is scalable up to 98 processors on the Cray
        T3E and has a performance similar to that of an MPI
        implementation. Based on the experimentation carried out in this
        research, we believe that a high-level parallel programming
        language such as HPF is a fast, viable and economical approach
        to parallelizing many existing sequential codes which exhibit a
        lot of parallelism."
}
@Article{FIsai2003,
       author   = "F. Isaila and W. F. Tichy",
       title    = "Clusterfile: a flexible physical layout parallel file system",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "7--8",
       pages    = "653--679",
       month    = JUN-JUL,
       year     = "2003",
       abstract = "This paper presents Clusterfile, a parallel file system that
        provides parallel file access on a cluster of computers. We
        introduce a file partitioning model that has been used in the
        design of Clusterfile. The model uses a data representation that
        is optimized for multidimensional array partitioning while
        allowing arbitrary partitions. The paper shows how the file
        model can be employed for file partitioning into both physical
        subfiles and logical views. We also present how the conversion
        between two partitions of the same file is implemented using a
        general memory redistribution algorithm. We show how we use the
        algorithm to optimize non-contiguous read and write operations.
        The experimental results include performance comparisons with
        the Parallel Virtual File System (PVFS) and an MPI-IO
        implementation for PVFS."
}
@Article{FYuas2003,
       author   = "F. Yuasa and K. Tobimatsu and S. Kawabata",
       title    = "Parallelization of the multidimensional integration package: {DICE}",
       journal  = "Nuclear Instruments \& Methods in Physics Research Section A-Accelerators Spectrometers Detectors and Associated Equipment",
       volume   = "502",
       number   = "2--3",
       pages    = "599--601",
       month    = APR,
       year     = "2003",
       abstract = "We have parallelized the multidimensional integration package
        DICE by distributing sample points into the processors using MPI
        and evaluated its performance."
}

@Article{SCDon2003,
       author   = "S. C. Dong and G. E. Karniadakis",
       title    = "P-refinement and {P}-threads",
       journal  = "Computer Methods in Applied Mechanics and Engineering",
       volume   = "192",
       number   = "19",
       pages    = "2191--2201",
       year     = "2003",
       abstract = {P-type refinement leads to exponential decay of numerical errors
        for sufficiently smooth solutions and has been used effectively
        in turbulence and structural mechanics simulations in the
        context of spectral and hp finite element discretizations.
        However, it induces a computational cost of $O(P^{d+1})$ in $d$
        dimensions, which is higher than lower-order methods. In this
        paper, we demonstrate that by employing multi-threading within
        MPI processes we manage to counter-balance the cost increase
        associated with P-refinement. This approach reduces effectively
        the wall clock time, and keeps it essentially constant as the
        polynomial order is increased while achieving exponential
        convergence rate. Since the number of threads within MPI
        processes can be dynamically adjusted through thread library
        functions, the algorithm can be readily adapted for dynamic
        P-refinement. The resulting hybrid MPI/threads dual-level
        parallelism is particularly suitable for modern supercomputers
        consisting of "symmetric multiprocessor" nodes. We demonstrate
        this approach in simulations of two three-dimensional fluid
        dynamics problems.}
}
@Article{KEkic2003,
       author   = "K. Ekici and A. S. Lyrintzis",
       title    = "A parallel {N}ewton-{K}rylov method for {N}avier-{S}tokes rotorcraft codes",
       journal  = "International Journal of Computational Fluid Dynamics",
       volume   = "17",
       number   = "3",
       pages    = "225--230",
       month    = MAY,
       year     = "2003",
       abstract = "The application of Krylov subspace iterative methods to unsteady
        three-dimensional Navier-Stokes codes on massively parallel and
        distributed computing environments is investigated. Previously,
        the Euler mode of the Navier-Stokes flow solver Transonic
        Unsteady Rotor Navier-Stokes (TURNS) has been coupled with a
        Newton-Krylov scheme which uses two Conjugate-Gradient-like (CG)
        iterative methods. For the efficient implementation of
        Newton-Krylov methods to the Navier-Stokes mode of TURNS,
        efficient preconditioners must be used. Parallel implicit
        operators are used and compared as preconditioners. Results are
        presented for two-dimensional and three-dimensional viscous
        cases. The Message Passing Interface (MPI) protocol is used,
        because of its portability to various parallel architectures."
}
@Article{PAmic2003,
       author   = "P. Amico and L. Bosi and C. Cattuto and L. Gammaitoni and F. Marchesoni and Punturo",
       title    = "A parallel {B}eowulf-based system for the detection of gravitational waves in interferometric detectors",
       journal  = "Computer Physics Communications",
       volume   = "153",
       number   = "2",
       pages    = "179--189",
       month    = JUN,
       year     = "2003",
       abstract = "The detection, in a modern interferometric detector like Virgo,
        of a gravitational wave signal from a coalescing binary stellar
        system is an intensive computational task both for the on-line
        and off-line computer systems. A parallel computing scheme using
        the Message Passing Interface (MPI) is described. Performance
        results on a small scale cluster are reported."
}
@Article{SWGao2003,
       author   = "S. W. Gao",
       title    = "Linear-scaling parallelization of the {WIEN} package with {MPI}",
       journal  = "Computer Physics Communications",
       volume   = "153",
       number   = "2",
       pages    = "190--198",
       month    = JUN,
       year     = "2003",
       abstract = "A parallel version of the WIEN package, the full-potential
        linearized Augmented Planewave (FP-LAPW) code for ab initio
        electron structure calculation, has been developed using the
        message passing interface (MPI). All time-consuming parts of the
        self-consistent cycle, namely, the matrix setting, the
        eigen-solver, and the charge density and potential generators,
        have been parallelized on the level of the plane-wave basis,
        wherever possible, and/or of atomic loops. Test calculations
        done on Linux commodity cluster and the IBM power3
        supercomputers show that the parallel code attains nearly
        linear scaling for almost all the time-consuming calculations. It
        opens the possibility to handle large systems with the
        full-potential method on the parallel platforms."
}

@Article{VBlan2003,
       author   = "V. Blanco and P. Gonzalez and J. C. Cabaleiro and D. B. Heras and T. F. Pena and Pombo",
       title    = "A{VISPA}: visualizing the performance prediction of parallel iterative solvers",
       journal  = "Future Generation Computer Systems",
       volume   = "19",
       number   = "5",
       pages    = "721--733",
       month    = JUL,
       year     = "2003",
       abstract = "The selection of the best method and preconditioner for solving
        a sparse linear system is as determinant as the efficient
        parallelization of the selected method. We propose a tool for
        helping to solve both problems on distributed memory
        multiprocessors using iterative methods. Based on a previously
        developed library of HPF and message-passing interface (MPI)
        codes, a performance prediction is developed and a visualization
        tool (AVISPA) is proposed. The tool combines theoretical
        features of the methods and preconditioners with practical
        considerations and predictions about aspects of the execution
        performance (computational cost, communications overhead, etc.).
        It offers detailed information about all the topics that can be
        useful for selecting the most suitable method and
        preconditioner. Another capability is to offer information on
        different parallel implementations of the code (HPF and MPI)
        varying the number of available processors."
}

@Article{RHReu2003,
       author   = "R. H. Reussner",
       title    = "Using {SK}a{MPI} for developing high-performance {MPI} programs with performance portability",
       journal  = "Future Generation Computer Systems",
       volume   = "19",
       number   = "5",
       pages    = "749--759",
       month    = JUL,
       year     = "2003",
       abstract = "The current practice of developing high-performance software for
        parallel computers includes a tuning phase where the software's
        performance is optimised for a specific hardware platform. This
        tuning phase often is costly and results in machine-specific,
        hence, less portable software. In this paper we present a
        publicly available database providing performance data for
        operations of the message-passing-interface (MPI) measured on
        several different platforms. This allows to design MPI programs
        for performance and portability in early stages of software
        development. Considering the performance of MPI operations while
        designing programmes allows the software developer (a) to select
        the fastest implementation alternative, (b) to write performance
        portable software (i.e., software showing high performance on
        several platforms without platform-specific tuning), if
        possible, and (c) to quantify the tradeoff between ultimate
        performance and performance portability for different platforms."
}
@Article{DLazz2003,
       author   = "D. Lazzaro",
       title    = "A parallel multivariate interpolation algorithm with radial basis functions",
       journal  = "International Journal of Computer Mathematics",
       volume   = "80",
       number   = "7",
       pages    = "907--919",
       month    = JUL,
       year     = "2003",
       abstract = {This paper presents an efficient and highly scalable parallel
        version of the Modified RBF Shepard's method presented in [5].
        This method maintains the "metric" nature and the advantages of
        Shepard's method and, at the same time, improves its accuracy by
        exploiting the characteristics of flexibility and accuracy which
        have made the radial basis functions a well-established tool for
        multivariate interpolation. Due to its locality, this method can
        be easily and efficiently parallelized on a distributed memory
        parallel architecture. The performance of the parallel algorithm
        has been studied theoretically and the experimental results
        obtained by running its implementation on a Cray T3E parallel
        machine, using the MPI interface, confirm the theoretical
        efficiency.}
}
@Article{DShir2003,
       author   = "D. Shires and R. Mohan",
       title    = "Optimization and performance of a {F}ortran 90 {MPI}-based unstructured code on large-scale parallel systems",
       journal  = "Journal of Supercomputing",
       volume   = "25",
       number   = "2",
       pages    = "131--141",
       month    = JUN,
       year     = "2003",
       abstract = "The message-passing interface (MPI) has become the standard in
        achieving effective results when using the message passing
        paradigm of parallelization. Codes written using MPI are
        extremely portable and are applicable to both clusters and
        massively parallel computing platforms. Since MPI uses the
        single program, multiple data (SPMD) approach to parallelism,
        good performance requires careful tuning of the serial code as
        well as careful data and control flow analysis to limit
        communication. We discuss optimization strategies used and their
        degree of success to increase performance of an
        MPI-based unstructured finite element simulation code written in
        Fortran 90. We discuss performance results based on
        implementations using several modern massively parallel
        computing platforms including the SGI Origin 3800, IBM Nighthawk
        2 SMP, and Cray T3E-1200."
}
@Article{PAFar2003,
       author   = "P. A. Farrell and H. Ong",
       title    = "Factors involved in the performance of computations on {B}eowulf clusters",
       journal  = "Electronic Transactions on Numerical Analysis",
       volume   = "15",
       pages    = "211--224",
       year     = "2003",
       abstract = "We comment on the relative performance of LAM, MPICH, and MVICH
        on a Linux cluster connected by a Gigabit Ethernet network.
        Since LAM and MPICH use the TCP/IP socket interface for
        communicating messages, it is critical to have high TCP/IP
        performance for these to give satisfactory results. Despite many
        efforts to improve TCP/IP performance, the performance graphs
        presented here indicate that the overhead incurred in protocol
        stack processing is still high. We discuss the Virtual Interface
        Architecture (VIA) which is intended to provide low latency,
        high bandwidth message-passing between user processes.
        Developments such as the VIA-based MPI implementation MVICH can
        improve communication throughput and thus give the promise of
        enabling distributed applications to improve performance.
        Finally we present some examples of how these various choices
        can impact the performance of an example multigrid code."
}

@Article{ARubi2003,
       author   = "A. Rubinstein and F. Rachidi and M. Rubinstein and B. Reusser",
       title    = "A parallel implementation of {NEC} for the analysis of large structures",
       journal  = "IEEE Transactions on Electromagnetic Compatibility",
       volume   = "45",
       number   = "2",
       pages    = "177--188",
       month    = MAY,
       year     = "2003",
       abstract = "We present a new, parallel version of the numerical
        electromagnetics code (NEC). The parallelization is based on a
        bidimensional block-cyclic distribution of matrices on a
        rectangular processor grid, assuring a theoretically optimal
        load balance among the processors. The code is portable to any
        platform supporting message passing parallel environments such
        as message passing interface and parallel virtual machine, where
        it could even be executed on heterogeneous clusters of computers
        running on different operating systems. The developed parallel
        NEC was successfully implemented on two parallel supercomputers
        featuring different architectures to test portability. Large
        structures containing up to 24000 segments, which exceeds
        currently available computer resources were successfully
        executed and timing and memory results are presented. The code
        is applied to analyze the penetration of electromagnetic fields
        inside a vehicle. The computed results are validated using other
        numerical methods and experimental data obtained using a
        simplified model of a vehicle (consisting essentially of the
        body shell) illuminated by an electromagnetic pulse (EMP)
        simulator."
}
@Article{JCNoa2003,
       author   = "J. C. No and R. Thakur and A. Choudhary",
       title    = "High-performance scientific data management system",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "4",
       pages    = "434--447",
       month    = APR,
       year     = "2003",
       abstract = "Many scientific applications have large I/O requirements, in
        terms of both the size of data and the number of files or data
        sets. Management, storage, efficient access, and analysis of this
        data present an extremely challenging task. Traditionally, two
        different solutions have been used for this task: file I/O or
        databases. File I/O can provide high performance but is tedious
        to use with large numbers of files and large and complex data
        sets. Databases can be convenient, flexible, and powerful but do
        not perform and scale well for parallel supercomputing
        applications. We have developed a software system, called
        Scientific Data Manager (SDM), that combines the good features
        of both file I/O and databases. SDM provides a high-level
        application programming interface to the user and, internally,
        uses a parallel file system to store real data (using various
        I/O optimizations available in MPI-IO) and a database to store
        application-related metadata. In order to support I/O in
        irregular applications, SDM makes extensive use of MPI-IO's
        noncontiguous collective I/O functions. Moreover, SDM uses the
        concept of a history file to optimize the cost of the index
        distribution using the metadata stored in database. We describe
        the design and implementation of SDM and present performance
        results with two regular applications, ASTRO3D and an Euler
        solver, and with two irregular applications, a CFD code called
        FUN3D and a Rayleigh-Taylor instability code."
}
@Article{PLina2003,
       author   = "P. Lin and Q. P. Guo and X. Q. Chen",
       title    = "A fully explicit method for incompressible flow computation",
       journal  = "Computer Methods in Applied Mechanics and Engineering",
       volume   = "192",
       number   = "22--24",
       pages    = "2555--2564",
       year     = "2003",
       abstract = "A new formulation of the Navier-Stokes equations is introduced
        to solve incompressible flow problems. It keeps the benefits of
        the penalty method, that is, velocity and pressure can be
        obtained separately and no pressure-Poisson equation is
        involved. Unlike the penalty method the formulation is
        more stable or less stiff and then explicit time integration can
        be applied for easy implementation. No linear or nonlinear system
        need be solved in the method. In the case that a large number of
        time steps are needed a parallelization based on domain
        decomposition is applied to reduce the computational time. With
        the explicit time integration the parallel implementation and
        its message passing are very simple as well."
}
@Article{SDShe2003,
       author   = "S. D. Shellman and J. P. Lewis and K. R. Glaesemann and K. Sikorski and G. A. Voth",
       title    = "Massively parallel linear-scaling algorithm in an ab initio local-orbital total-energy method",
       journal  = "Journal of Computational Physics",
       volume   = "188",
       number   = "1",
       pages    = "1--15",
       month    = JUN,
       year     = "2003",
       abstract = {Similar to the manner of S. Itoh et al. [Comp. Phys. Commun. 88
        (1995) 173], we report implementation of a massively parallel
        linear-scaling algorithm into an ab initio tight-binding method
        called FIREBALL [Phys. Rev. B (2001)]. The use of local-orbitals
        yields a very sparse Hamiltonian matrix which facilitates using a
        linear-scaling algorithm to obtain the electronic band-structure
        energy. The general functional form of Kim et al. [Phys. Rev. B
        52 (1995) 1640], which minimizes a functional to obtain the
        electronic band-structure energy, has been parallelized
        utilizing the conjugate gradient method. The results of this
        approach are reported here. In addition, the use of "fireball"
        wavefunctions, where the wavefunctions are explicitly zero
        beyond some cutoff, allows for pre-generating all integrals
        describing two- and three-center interactions. The computation
        of these integrals is then an easily parallelizable problem for
        which the results are reported. Both integral generation and the
        linear-scaling optimization procedures are parallelized using
        the standard MPI message passing interface mixed with an OpenMP
        strategy.}
}
@Article{NTKar2003,
       author   = "N. T. Karonis and B. Toonen and I. Foster",
       title    = "M{PICH}-{G}2: {A} {G}rid-enabled implementation of the {M}essage {P}assing {I}nterface",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "5",
       pages    = "551--563",
       month    = MAY,
       year     = "2003",
       abstract = {Application development for distributed-computing "Grids" can
        benefit from tools that variously hide or enable
        application-level management of critical aspects of the
        heterogeneous environment. As part of an investigation of these
        issues, we have developed MPICH-G2, a Grid-enabled
        implementation of the Message Passing Interface (MPI) that
        allows a user to run MPI programs across multiple computers, at
        the same or different sites, using the same commands that would
        be used on a parallel computer. This library extends the Argonne
        MPICH implementation of MPI to use services provided by the
        Globus Toolkit for authentication, authorization, resource
        allocation, executable staging, and I/O, as well as for process
        creation, monitoring, and control. Various performance-critical
        operations, including startup and collective operations, are
        configured to exploit network topology information. The library
        also exploits MPI constructs for performance management; for
        example, the MPI communicator construct is used for
        application-level discovery of, and adaptation to, both network
        topology and network quality-of-service mechanisms. We describe
        the MPICH-G2 design and implementation, present performance
        results, and review application experiences, including
        record-setting distributed simulations.}
}
% NOTE(review): the author list was truncated after "Rasmussen" (no initials);
% it has been completed from the published record -- please confirm.
@Article{RLGra2003,
       author   = "R. L. Graham and S. E. Choi and D. J. Daniel and N. N. Desai and R. G. Minnich and C. E. Rasmussen and L. D. Risinger and M. W. Sukalski",
       title    = "A network-failure-tolerant message-passing system for terascale clusters",
       journal  = "International Journal of Parallel Programming",
       volume   = "31",
       number   = "4",
       pages    = "285--303",
       month    = AUG,
       year     = "2003",
       abstract = "The Los Alamos Message Passing Interface (LA-MPI) is an
        end-to-end network-failure-tolerant message-passing system
        designed for terascale clusters. LA-MPI is a standard-compliant
        implementation of MPI designed to tolerate network-related
        failures including I/O bus errors, network card errors, and
        wire-transmission errors. This paper details the distinguishing
        features of LA-MPI, including support for concurrent use of
        multiple types of network interface, and reliable message
        transmission utilizing multiple network paths and routes between
        a given source and destination. In addition, performance
        measurements on production-grade platforms are presented."
}
% NOTE(review): first author corrected from "Lohout" to "Kohout" -- please
% confirm. The citation key is kept unchanged so existing citations still work.
@Article{JLoho2003,
       author   = "J. Kohout and A. D. George",
       title    = "A high-performance communication service for parallel computing on distributed {DSP} systems",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "7",
       pages    = "851--878",
       month    = JUL,
       year     = "2003",
       abstract = "Rapid increases in the complexity of algorithms for real-time
        signal processing applications have led to performance
        requirements exceeding the capabilities of conventional digital
        signal processor (DSP) architectures. Many applications, such as
        autonomous sonar arrays, are distributed in nature and amenable
        to parallel computing on embedded systems constructed from
        multiple DSPs networked together. However, to realize the full
        potential of such applications, a lightweight service for
        message-passing communication and parallel process coordination
        is needed that is able to provide high throughput and low
        latency while minimizing processor and memory utilization.
        This paper presents the design and analysis of such a service,
        based on the message passing interface specification, for
        unicast and collective communications."
}
@Article{XYZhu2003,
       author   = "X. Y. Zhu and L. Carin and T. Dogaru",
       title    = "Parallel implementation of the biorthogonal multiresolution time-domain method",
       journal  = "Journal of the Optical Society of America A-Optics Image Science and Vision",
       volume   = "20",
       number   = "5",
       pages    = "844--855",
       month    = MAY,
       year     = "2003",
       abstract = "The three-dimensional biorthogonal multiresolution time-domain
        (Bi-MRTD) method is presented for both free-space and half-space
        scattering problems. The perfectly matched layer (PML) is used
        as an absorbing boundary condition. It has been shown that
        improved numerical-dispersion properties can be obtained with
        the use of smooth, compactly supported wavelet functions as
        the basis, whereas we employ the Cohen-Daubechies-Fouveau (CDF)
        biorthogonal wavelets. When a CDF-wavelet expansion is used, the
        spatial-sampling rate can be reduced considerably compared with
        that of the conventional finite-difference time-domain (FDTD)
        method, implying that larger targets can be simulated without
        sacrificing accuracy. We implement the Bi-MRTD on a cluster of
        allocated-memory machines, using the message-passing interface
        (MPI), such that very large targets can be modeled. Numerical
        results are compared with analytical ones and with those
        obtained by use of the traditional FDTD method."
}
@Article{PRAme2003,
       author   = "P. R. Amestoy and I. S. Duff and J. Y. L'Excellent and X. Y. S. Li",
       title    = "Impact of the implementation of {MPI} point-to-point communications on the performance of two general sparse solvers",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "7",
       pages    = "833--849",
       month    = JUL,
       year     = "2003",
       abstract = "We examine the send and receive mechanisms of MPI and show how
        to implement message passing robustly so that performance is not
        significantly affected by changes to the MPI system. We discuss
        this within the context of two different parallel algorithms for
        sparse Gaussian elimination: a multifrontal solver (MUMPS), and a
        supernodal one (SuperLU). The performance of our initial
        strategies based on simple MPI point-to-point communication
        primitives is very sensitive to the MPI system, particularly the
        way MPI buffers are used. Using nonblocking communication
        primitives improves the performance and robustness, but at the
        cost of increased code complexity."
}
@article{WSZha2003,
  author   = {W. S. Zhang and G. Q. Zhang},
  title    = {Factorization synthesized-shot prestack depth migration in the
              helical coordinate system},
  journal  = {Chinese Journal of Geophysics-Chinese Edition},
  year     = 2003,
  month    = jul,
  volume   = 46,
  number   = 4,
  pages    = {520--525},
  abstract = {Based on the synthesized-shot prestack depth migration, a new high
              efficient method for the synthesized-shot prestack depth migration
              is proposed. It is a hybrid technique which implements wavefield
              extrapolation with factorization in the helical coordinate system.
              The wavefield extrapolation is divided into two explicit solving
              processes. One is a causal process, and the other is an anticausal
              process. Such explicit solving processes in the helical coordinate
              system can improve the wavefield extrapolation efficiency.
              Moreover, based on the phase encoding principle, the
              synthesized-wavefield corresponding to multiple ray parameters is
              encoded and stacked. Then the calculations are implemented with
              MPI parallel algorithm based on the ray parameters. Thus the
              calculation efficiency is enhanced further. After deriving the
              relevant formulae and analyzing the computation cost
              quantitatively, numerical calculations for the Marmousi complex
              model are carried out and results comparisons are made. The
              imaging results show that the method presented in this paper has
              the advantages of high precision and good efficiency. So it can be
              applied to practical data processing.}
}
@Article{GRLue2003,
       author   = "G. R. Luecke and M. Kraeva and L. L. Ju",
       title    = "Comparing the performance of {MPICH} with {C}ray's {MPI} and with {SGI}'s {MPI}",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "9",
       pages    = "779--802",
       month    = AUG,
       year     = "2003",
       abstract = "The purpose of this paper is to compare the performance of MPICH
        with the vendor Message Passing Interface (MPI) on a Cray
        T3E-900 and an SGI Origin 3000. Seven basic communication tests
        which include basic point-to-point and collective MPI
        communication routines were chosen to represent commonly-used
        communication patterns. Cray's MPI performed better (and
        sometimes significantly better) than Mississippi State
        University's (MSU's) MPICH for small and medium messages. They
        both performed about the same for large messages, however for
        three tests MSU's MPICH was about 20\% faster than Cray's MPI.
        SGI's MPI performed and scaled better (and sometimes
        significantly better) than MPICH for all messages, except for
        the scatter test where MPICH outperformed SGI's MPI for 1 kbyte
        messages. The poor scalability of MPICH on the Origin 3000
        suggests there may be scalability problems with MPICH."
}
@Article{JHKri2003,
       author   = "J. H. Kristensen and I. Farnan",
       title    = "Efficient solid state {NMR} powder simulations using {SMP} and {MPP} parallel computation",
       journal  = "Journal of Magnetic Resonance",
       volume   = "161",
       number   = "2",
       pages    = "183--190",
       month    = APR,
       year     = "2003",
       abstract = "Methods for parallel simulation of solid state NMR powder
        spectra are presented for both shared and distributed memory
        parallel supercomputers. For shared memory architectures the
        performance of simulation programs implementing the OpenMP
        application programming interface is evaluated. It is
        demonstrated that the design of correct and efficient shared
        memory parallel programs is difficult as the performance depends
        on data locality and cache memory effects. The distributed
        memory parallel programming model is examined for simulation
        programs using the MPI message passing interface. The
        results reveal that both shared and distributed memory parallel
        computation are very efficient with an almost perfect
        application speedup and may be applied to the most advanced
        powder simulations."
}
% NOTE(review): the final author was truncated to "Link" (no initials); it has
% been completed from the published record -- please confirm the initials.
@Article{NTKar2003b,
       author   = "N. T. Karonis and M. E. Papka and J. Binns and J. Bresnahan and J. A. Insley and D. Jones and J. M. Link",
       title    = "High-resolution remote rendering of large datasets in a collaborative environment",
       journal  = "Future Generation Computer Systems",
       volume   = "19",
       number   = "6",
       pages    = "909--917",
       month    = AUG,
       year     = "2003",
       abstract = "In a time when computational and data resources are distributed
        around the globe, users need to interact with these resources
        and each other easily and efficient. The Grid, by definition,
        represents a connection of distributed resources that can be
        used regardless of the user's location. We have built a
        prototype visualization system using the Globus Toolkit,
        MPICH-G2, and the Access Grid in order to explore how future
        scientific collaborations may occur over the Grid. We describe
        our experience in demonstrating our system at iGrid2002, where
        the United States and the Netherlands were connected via a
        high-latency, high-bandwidth network. In particular, we focus on
        issues related to a Grid-based application that couples a
        collaboration component (including a user interface to the
        Access Grid) with a high-resolution remote rendering component."
}
@Article{RVald2003,
       author   = "R. Valdarnini",
       title    = "Parallelization of a treecode",
       journal  = "New Astronomy",
       volume   = "8",
       number   = "7",
       pages    = "691--710",
       month    = SEP,
       year     = "2003",
       abstract = "I describe here the performance of a parallel treecode with
        individual particle timesteps. The code is based on the
        Barnes-Hut algorithm and runs cosmological N-body simulations on
        parallel machines with a distributed memory architecture using
        the MPI message-passing library. For a configuration with a
        constant number of particles per processor the scalability of
        the code was tested up to P = 128 processors on an IBM SP4
        machine. In the large P limit the average CPU time per processor
        necessary for solving the gravitational interactions is similar
        to 10\% higher than that expected from the ideal scaling
        relation. The processor domains are determined every large
        timestep according to a recursive orthogonal bisection, using a
        weighting scheme which takes into account the total particle
        computational load within the timestep. The results of the
        numerical tests show that the load balancing efficiency L of the
        code is high (greater than or similar to 90\%) up to P = 32, and
        decreases to L similar to 80\% when P = 128. In the latter case
        it is found that some aspects of the code performance are
        affected by machine hardware, while the proposed weighting
        scheme can achieve a load balance as high as L similar to 90\%
        even in the large P limit."
}
@article{THeya2003,
  author   = {T. Hey and A. Trefethen},
  title    = {e-science and its implications},
  journal  = {Philosophical Transactions of the Royal Society of London
              Series A-Mathematical Physical and Engineering Sciences},
  year     = 2003,
  month    = aug,
  volume   = 361,
  number   = 1809,
  pages    = {1809--1825},
  abstract = {After a definition of e-science and the Grid, the paper begins with
              an overview of the technological context of Grid developments.
              NASA's Information Power Grid is described as an early example of
              a 'prototype production Grid'. The discussion of e-science and the
              Grid is then set in the context of the UK e-Science Programme and
              is illustrated with reference to some UK e-science projects in
              science, engineering and medicine. The Open Standards approach to
              Grid middleware adopted by the community in the Global Grid Forum
              is described and compared with community-based standardization
              processes used for the Internet, MPI, Linux and the Web. Some
              implications of the imminent data deluge that will arise from the
              new generation of e-science experiments in terms of archiving and
              curation are then considered. The paper concludes with remarks
              about social and technological issues posed by Grid-enabled
              'collaboratories' in both scientific and commercial contexts.}
}
@article{FRonq2003,
  author   = {F. Ronquist and J. P. Huelsenbeck},
  title    = {Mr{B}ayes 3: {B}ayesian phylogenetic inference under mixed models},
  journal  = {Bioinformatics},
  year     = 2003,
  month    = aug,
  volume   = 19,
  number   = 12,
  pages    = {1572--1574},
  abstract = {MrBayes 3 performs Bayesian phylogenetic analysis combining
              information from different data partitions or subsets evolving
              under different stochastic evolutionary models. This allows the
              user to analyze heterogeneous data sets consisting of different
              data types-e.g. morphological, nucleotide, and protein- and to
              explore a wide variety of structured models mixing
              partition-unique and shared parameters. The program employs MPI to
              parallelize Metropolis coupling on Macintosh or UNIX clusters.}
}

@article{KBLi2003,
  author   = {K. B. Li},
  title    = {Clustal{W}-{MPI}: {C}lustal{W} analysis using distributed and
              parallel computing},
  journal  = {Bioinformatics},
  year     = 2003,
  month    = aug,
  volume   = 19,
  number   = 12,
  pages    = {1585--1586},
  abstract = {ClustalW is a tool for aligning multiple protein or nucleotide
              sequences. The alignment is achieved via three steps: pairwise
              alignment, guide-tree generation and progressive alignment.
              ClustalW-MPI is a distributed and parallel implementation of
              ClustalW. All three steps have been parallelized to reduce the
              execution time. The software uses a message-passing library called
              MPI (Message Passing Interface) and runs on distributed
              workstation clusters as well as on traditional parallel computers.}
}
@Article{SGoed2003,
       author   = "S. Goedecker and M. Boulet and T. Deutsch",
       title    = "An efficient 3-dim {FFT} for plane wave electronic structure calculations on massively parallel machines composed of multiprocessor nodes",
       journal  = "Computer Physics Communications",
       volume   = "154",
       number   = "2",
       pages    = "105--110",
       month    = AUG,
       year     = "2003",
       abstract = "Three-dimensional Fast Fourier Transforms (FFTs) are the main
        computational task in plane wave electronic structure
        calculations. Obtaining a high performance on a large numbers of
        processors is non-trivial on the latest generation of parallel
        computers that consist of nodes made up of a shared memory
        multiprocessors. A non-dogmatic method for obtaining high
        performance for such 3-dim FFTs in a combined MPI/OpenMP
        programming paradigm will be presented. Exploiting the
        peculiarities of plane wave electronic structure calculations,
        speedups of up to 160 and speeds of up to 130 Gflops were
        obtained on 256 processors."
}
@Article{BGLar2003,
       author   = "B. G. Larwood and N. P. Weatherill and O. Hassan and K. Morgan",
       title    = "Domain decomposition approach for parallel unstructured mesh generation",
       journal  = "International Journal for Numerical Methods in Engineering",
       volume   = "58",
       number   = "2",
       pages    = "177--188",
       month    = SEP,
       year     = "2003",
       abstract = "In this paper, a method to generate large unstructured meshes on
        parallel computers is demonstrated. Using the Message Passing
        Interface, a coarse-grained parallel harness has been developed,
        that allows the use of sequential generators in a parallel
        environment. Meshes of over 500 million elements will be shown."
}
@Article{RGJia2004,
       author   = "R. G. Jia and B. Sunden",
       title    = "Parallelization of a multi-blocked {CFD} code via three strategies for fluid flow and heat transfer analysis",
       journal  = "Computers \& Fluids",
       volume   = "33",
       number   = "1",
       pages    = "57--80",
       month    = JAN,
       year     = "2004",
       abstract = "This paper reports on a parallel implementation of a general 3D
        multi-block CFD code. The parallelization is achieved by using
        three strategies. Firstly, it is done on dual-processor
        PC-clusters where Windows NT systems are running. A multi-thread
        programming model is adopted for the multi-block code, where one
        thread corresponds to a block. Shared-memory is used for the
        exchange of inner-boundaries between neighboring blocks
        (threads) on the same node, while WinSockets are employed for
        those on different nodes. Secondly, the parallelization is
        extended to UNIX operating system. MPI is applied for all the
        message passing between different processors, including those on
        the same node. Thirdly, Pthreads (POSIX threads), a standardized
        application interface for threads, are adopted to take the
        advantage of the shared-memory feature of the SMP nodes, while
        MPI is only applied for the message passing between processors on
        different nodes. In all the strategies, a static load-balancing
        method is employed for equitable distribution of computational
        work to specified nodes. The parameters of the present code is
        studied in detail to facilitate the explanation of the speedup
        results. Two examples are provided to show the speedup and load
        balancing of the parallel calculation. Detailed comparison is
        made to evaluate the efficiency of different strategies."
}
@Article{HLTru2003,
       author   = "H. L. Truong and T. Fahringer",
       title    = "S{CALEA}: a performance analysis tool for parallel programs",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "11--12",
       pages    = "1001--1025",
       month    = SEP,
       year     = "2003",
       abstract = "In this paper we present SCALEA, which is a performance
        instrumentation, measurement, analysis, and visualization tool
        for parallel programs that supports post-mortem performance
        analysis. SCALEA currently focuses on performance analysis for
        OpenMP, MPI, HPF, and mixed parallel programs. It computes a
        variety of performance metrics based on a novel classification
        of overhead. SCALEA also supports multi-experiment performance
        analysis that allows one to compare and to evaluate the
        performance outcome of several experiments. A highly flexible
        instrumentation and measurement system is provided which can be
        controlled by command-line options and program directives.
        SCALEA can be interfaced by external tools through the provision
        of a full Fortran90 OpenMP/MPI/HPF frontend that allows one to
        instrument an abstract syntax tree at a very high-level with
        C-function calls and to generate source code. A graphical user
        interface is provided to view a large variety of performance
        metrics at the level of arbitrary code regions, threads,
        processes, and computational nodes for single- and
        multi-experiment performance analysis."
}
@Article{MLang2003,
       author   = "M. Langlais and G. Latu and J. Roman and P. Silan",
       title    = "Performance analysis and qualitative results of an efficient parallel stochastic simulator for a marine host-parasite system",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "11--12",
       pages    = "1133--1150",
       month    = SEP,
       year     = "2003",
       abstract = "We are interested in a host-parasite system, i.e. the sea
        bass-Diplectanum aequans system. A discrete mathematical model
        is used to describe the dynamics of both populations. Our goal
        is notably to validate the model in the context of aquaculture.
        A deterministic numerical simulator and, recently, a stochastic
        simulator were developed to study this biological system.
        Parallelization is required because the execution times are too
        long. The Monte Carlo algorithm of the stochastic simulator and
        its three levels of parallelism are described. Analysis and
        performances, up to 256 processors, of a hybrid MPI/OpenMP code
        are then presented for a cluster of symmetric multi-processor
        (SMP) nodes. Qualitative results are given for the
        host-macroparasite system simulation."
}
@Article{PDMic2003,
       author   = "P. D. Michailidis and K. G. Margaritis",
       title    = "Performance evaluation of load balancing strategies for approximate string matching application on an {MPI} cluster of heterogeneous workstations",
       journal  = "Future Generation Computer Systems",
       volume   = "19",
       number   = "7",
       pages    = "1075--1104",
       month    = OCT,
       year     = "2003",
       abstract = "In this paper, we present three parallel approximate string
        matching methods on a parallel architecture with heterogeneous
        workstations to gain supercomputer power at low cost. The first
        method is the static master-worker with uniform distribution
        strategy, the second one is the dynamic master-worker with
        allocation of subtexts and the third one is the dynamic
        master-worker with allocation of text pointers. Further, we
        propose a hybrid parallel method that combines the advantages of
        static and dynamic parallel methods in order to reduce the load
        imbalance and communication overhead. This hybrid method is
        based on the following optimal distribution strategy: the text
        collection is distributed proportional to workstation's speed.
        We evaluated and compared the performance of the four methods
        with clusters one, two, four, six and eight heterogeneous
        workstations. The experimental results demonstrate that dynamic
        allocation of text pointers and hybrid methods achieve better
        performance than the two original ones. We also present an
        analytical performance model for the four methods that confirms
        the actual behaviour of the experimental results."
}
@Article{MGove2003,
       author   = "M. Govett and L. Hart and T. Henderson and J. Middlecoff and D. Schaffer",
       title    = "The {S}calable {M}odeling {S}ystem: directive-based code parallelization for distributed and shared memory computers",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "8",
       pages    = "995--1020",
       month    = AUG,
       year     = "2003",
       abstract = "A directive-based parallelization tool called the Scalable
        Modeling System (SMS) is described. The user inserts directives
        in the form of comments into existing Fortran code. SMS
        translates the code and directives into a parallel version that
        runs efficiently on shared and distributed memory
        high-performance computing platforms including the SGI Origin,
        IBM SP2, Cray T3E, Sun, and Alpha and Intel clusters. Twenty
        directives are available to support operations including array
        re-declarations, inter-process communications, loop
        translations, and parallel I/O operations. SMS also provides
        tools to support incremental parallelization and debugging that
        significantly reduces code parallelization time from months to
        weeks of effort. SMS is intended for applications using regular
        structured grids that are solved using finite difference
        approximation or spectral methods. It has been used to
        parallelize 10 atmospheric and oceanic models, but the tool is
        sufficiently general that it can be applied to other structured
        grids codes. Recent performance comparisons demonstrate that the
        Eta, Hybrid Coordinate Ocean model and Regional Ocean Modeling
        System model, parallelized using SMS, perform as well or better
        than their OpenMP or Message Passing Interface counterparts."
}
@Article{MYama2003,
       author   = "M. Yamashita and K. Fujisawa and M. Kojima",
       title    = "S{DPARA}: {S}emi{D}efinite {P}rogramming {A}lgorithm pa{RA}llel version",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "8",
       pages    = "1053--1067",
       month    = AUG,
       year     = "2003",
       abstract = "The SDPA (SemiDefinite Programming Algorithm) is known as
        efficient computer software based on the primal-dual
        interior-point method for solving SDPs (SemiDefinite Programs).
        In many applications, however, some SDPs become larger and
        larger, too large for the SDPA to solve on a single processor.
        In execution of the SDPA applied to large scale SDPs, the
        computation of the so-called Schur complement matrix and its
        Cholesky factorization consume most of the computational time.
        The SDPARA (SemiDefinite Programming Algorithm paRAllel version)
        is a parallel version of the SDPA on multiple processors and
        distributed memory, which replaces these two parts by their
        parallel implementation using MPI and ScaLAPACK. Through
        numerical results, we show that the SDPARA on a PC cluster
        consisting of 64 processors attains high scalability for large
        scale SDPs without losing the stability of the SDPA."
}
@Article{NSaka,
       author   = "N. Sakai and N. Hata and H. Liao and T. Dohi",
       title    = "High performance computing for parallel rendering in surgical auto stereoscopic display and navigation",
       journal  = "Cars 2003: Computer Assisted Radiology and Surgery",
       volume   = "2003",
       number   = "1256",
       pages    = "403--407",
       OPTnote = "Proceedings",
       year     = "2003",
       abstract = "The three-dimensional medical information obtained from Magnetic
        Resonance Image (MRI), Xray Computed Tomography (CT), etc. is
        used for an operation-supporting image, and operations under
        image guidance have also been conducted. One of the stereoscopic
        methods is Integral Videography (IV), which is an animated
        extension of Integral Photography, reproducing a
        computer-generated graphical object. Though the advantage of IV
        has been proven in both feasibility studies and clinical
        applications, one of the issues still unsolved is the notable
        quantity of calculation that causes significant delay
        in rendering. Then using parallel processing method integrating
        Message Passing Interface (MPI) on High Performance Computer
        (HPC), we shortened the calculating time of IV picture by the
        shortest at about 0.2 s. Furthermore, by using socket
        communication, it transmitted to another portable note PC, which
        is for a display. Then, we achieved the fast presentation of
        autostereoscopic images seen from an arbitrary direction that
        was specified with the mouse from the note PC side at anywhere we
        can access to the network."
}
@Article{XZFen2003,
       author   = "X. Z. Feng and D. A. Buell and J. R. Rose and P. J. Waddell",
       title    = "Parallel algorithms for {B}ayesian phylogenetic inference",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "7--8",
       pages    = "707--718",
       month    = JUL-AUG,
       year     = "2003",
       abstract = "This paper describes parallel algorithms and their MPI-based
        parallel implementation for MCMC-based Bayesian phylogenetic
        inference. Bayesian phylogenetic inference is computationally
        expensive both in time and in memory requirements. Our
        variations on MCMC and their implementation were done to permit
        the study of large phylogenetic problems. In our approach, we
        can distribute either entire chains or parts of a chain to
        different processors, since in current models the columns of the
        data are independent. Evaluations on a 32-node Beowulf cluster
        suggest the problem scales well. A number of important points
        are identified, including a superlinear speedup due to more
        effective cache usage and the point at which additional
        processors slow down the process due to communication overhead."
}
@Article{CFerr2003,
       author   = "C. Ferrari and C. Guerra and G. Zanotti",
       title    = "A grid-aware approach to protein structure comparison",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "7--8",
       pages    = "728--737",
       month    = JUL-AUG,
       year     = "2003",
       abstract = "This paper concentrates on the grid implementation of software
        tools for the comparison of protein structures. We have
        developed comparison algorithms based on indexing techniques that
        store transformation invariant properties of the 3D protein
        structures into tables. The method has large memory requirements
        and is computationally intensive. Furthermore, the dataset needs
        frequent updates as new proteins are added to the Protein Data
        Bank. Thus a significant advantage is obtained from a
        computational framework such as a grid. We report on a
        distributed implementation of the matching procedures on a grid
        using Globus MPI-CH, focusing on the data partition strategy to
        achieve good load balancing and to minimize the number of
        secondary memory accesses of the out-of-core computation."
}
@Article{YHuan2003,
       author   = "Y. Huang and H. G. Sung and S. Y. Hsieh and V. G. Yang",
       title    = "Large-eddy simulation of combustion dynamics of lean-premixed swirl-stabilized combustor",
       journal  = "Journal of Propulsion and Power",
       volume   = "19",
       number   = "5",
       pages    = "782--794",
       month    = SEP-OCT,
       year     = "2003",
       abstract = "A comprehensive numerical study of the combustion dynamics in a
        lean-premixed swirl-stabilized combustor is described. The
        analysis treats the conservation equations in three dimensions
        and takes into account finite-rate chemical reactions and
        variable thermophysical properties. Turbulence closure is
        achieved using a large-eddy-simulation technique. The
        compressible-flow version of the Smagorinsky model is employed
        to describe subgrid-scale turbulent motions and their effect on
        large-scale structures. A level-set flamelet library approach is
        used to simulate premixed turbulent combustion. The governing
        equations and the associated boundary conditions are solved by
        means of a four-step Runge-Kutta scheme along with
        implementation of the message passing interface parallel
        computing architecture. The analysis allows for a detailed
        investigation into the interaction between turbulent flow
        motions and oscillatory combustion of a swirl-stabilized
        combustor. Several physical processes responsible for driving
        combustion instabilities in the chamber have been identified and
        quantified, including the mutual coupling between acoustic wave
        motions, vortex shedding, and flame oscillations. In particular,
        the mechanisms of energy transfer from chemical reactions in
        the flame zone to acoustic motions in the bulk of chamber are
        carefully studied."
}
@Article{JHara2003,
       author   = "J. Har and R. E. Fulton",
       title    = "A parallel finite element procedure for contact-impact problems",
       journal  = "Engineering with Computers",
       volume   = "19",
       number   = "2--3",
       pages    = "67--84",
       year     = "2003",
       abstract = "An efficient parallel finite element procedure for
        contact-impact problems is presented within the framework of
        explicit finite element analysis with the penalty method. The
        procedure concerned includes a parallel Belytschko-Lin-Tsay
        shell element generation algorithm and a parallel contact-impact
        algorithm based on the master-slave slideline algorithm. An
        element-wise domain decomposition strategy and a communication
        minimization strategy are featured to achieve almost perfect
        load balancing among processors and to show scalability of the
        parallel performance. Throughout this work, a prototype code,
        named GT-PARADYN, is developed on the IBM SP2 to implement the
        procedure presented, under message-passing paradigm. Some
        examples are provided to demonstrate the timing results of the
        algorithms, discussing the accuracy and efficiency of the code."
}
@Article{AGupt2003,
       author   = "A. Gupta and R. Ganguly and S. Chakraborty and C. Mazumdar and D. Popovic",
       title    = "Simulating thermal power plant processes on a message passing environment",
       journal  = "{ISA} Transactions",
       volume   = "42",
       number   = "4",
       pages    = "615--630",
       month    = OCT,
       year     = "2003",
       abstract = "Simulators play a very important role in the operation of
        thermal power plants and also in the design of control systems
        for these plants. To cater to this requirement elaborate
        methodologies have been developed to simulate thermal power
        plant processes in an interactive way. Due to the intensive
        computations involved, such simulators use one or more, high
        through-put computers known as the simulation computers. This
        paper puts forward a method where parallel processing on a low
        latency message passing environment has been used to simulate
        thermal power plant processes following a modular approach. This
        eliminates the need of an expensive high through-put simulation
        computer, thus cutting down the hardware cost associated with a
        simulator and increasing the system reliability manifold."
}
@Article{MBeck2003,
       author   = "M. Becka and G. Oksa",
       title    = "On variable blocking factor in a parallel dynamic block-{J}acobi {SVD} algorithm",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "9",
       pages    = "1153--1174",
       month    = SEP,
       year     = "2003",
       abstract = "The parallel two-sided block-Jacobi singular value decomposition
        (SVD) algorithm with dynamic ordering, originally proposed in
        [Parallel Comput. 28 (2002) 243-262], has been extended with
        respect to the blocking factor. Unlike the unique blocking
        factor l = 2p in the original algorithm running on p processors,
        the current blocking factor is a variable parameter that
        covers the values in two different regions---namely, l = p/k and l
        = 2kp for some integer k. Two new parallel two-sided
        block-Jacobi SVD algorithms with dynamic ordering are described
        in detail. They arise in those two regions and differ in the
        logical data arrangement and communication complexity of the
        reordering step. For the case of l = 2kp, it is proved that a
        designed point-to-point communication algorithm is optimal with
        respect to the amount of communication required per processor as
        well as to the amount of overall communication. Using the
        message passing programming model for distributed memory
        machines, new parallel block-Jacobi SVD algorithms were
        implemented on an SGI-Cray Origin 2000 parallel computer.
        Numerical experiments were performed on p = 12 and 24 processors
        using a set of six matrices of order 4000 and blocking factors
        l, 2 less than or equal to l less than or equal to 192. To
        achieve the minimal total parallel execution time, the use of a
        blocking factor l is an element of \{2,p,2p\} can be recommended
        for matrices with distinct singular values. However, for
        matrices with a multiple minimal singular value, the total
        parallel execution time may monotonically increase with l. In
        this case, the recommended Jacobi method with l = 2 is just the
        ScaLAPACK routine with some additional matrix multiplications,
        and it computes the SVD in one parallel iteration step."
}
@Article{IHida2003,
       author   = "I. Hidajat and M. Singh and K. K. Mohanty",
       title    = "{NMR} response of porous media by random walk algorithm: {A} parallel implementation",
       journal  = "Chemical Engineering Communications",
       volume   = "190",
       number   = "12",
       pages    = "1661--1680",
       month    = DEC,
       year     = "2003",
       abstract = "NMR well logging is a popular tool in the petroleum industry to
        estimate porosity, specific surface area, and permeability of
        porous media. In this study, a random walk algorithm is used to
        simulate the NMR response of porous, water-saturated media,
        which, in turn, probes the relation between microstructure and
        transport. The serial implementation of the random walk
        algorithm is computationally very intensive for large porous
        samples. A parallel random walk code is developed using Message
        Passing Interface (MPI) in Fortran. Various domain decomposition
        techniques are implemented. The walker distribution across
        processors without domain decomposition gives the best speedup.
        The domain decomposition with overlapped layers requires smaller
        processor memory. Increasing the overlap between adjacent
        domains lowers the interprocessor communication and leads to
        improved speedup. For the given parameters, an overlap of two
        layers was found to be optimal. Domain decomposition along the z
        direction was found to be more effective than decomposition
        along either the x or y direction. By using the parallel random
        walk code, we are able to solve a 256x256x256 voxel system in
        less than 8 h using 32 processors on an IBM SP2 machine."
}
@Article{HHaba2003,
       author   = "H. Habata and M. Yokokawa and S. Kitawaki",
       title    = "The development of the {E}arth {S}imulator",
       journal  = "{IEICE} Transactions on Information and Systems",
       volume   = "E86D",
       number   = "10",
       pages    = "1947--1954",
       month    = OCT,
       year     = "2003",
       abstract = {The Earth Simulator (ES), developed by the Japanese
        government's initiative "Earth Simulator project," is a highly
        parallel vector supercomputer system. In May 2002, the ES was
        proven to be the most powerful computer in the world by
        achieving 35.86 teraflops on the LINPACK benchmark and 26.58
        teraflops for a global atmospheric circulation model with the
        spectral method. Three architectural features enabled these
        great achievements; vector processor, shared-memory and
        high-bandwidth non-blocking interconnection crossbar network.
        In this paper, an overview of the ES, the three architectural
        features and the result of performance evaluation are described
        particularly with its hardware realization of the interconnection
        among 640 processor nodes.}
}
@Article{ATChr2003,
       author   = "A. T. Chronopoulos and D. Grosu and A. M. Wissink and M. Benche and J. Y. Liu",
       title    = "An efficient 3{D} grid based scheduling for heterogeneous systems",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "9",
       pages    = "827--837",
       month    = SEP,
       year     = "2003",
       abstract = "The cost/performance ratio of networks of workstations has been
        constantly improving. This trend is expected to continue in the
        near future. The aggregate peak rate of such systems often
        matches or exceeds the peak rate offered by the fastest parallel
        computers. This has motivated research toward using a network of
        computers, interconnected via a fast network (cluster system) or
        a simple Local Area Network (LAN) (distributed system), for high
        performance concurrent computations. Some of the important
        research issues arise such as (i) Problem partitioning and
        virtual interconnection topology mapping; (ii) Execution
        scheduling and load balancing. Past results exist for grid
        partitioning (into subdomains) and mapping to parallel and
        distributed systems. In our work we consider the problem of grid
        partitioning of a 3D domain arising in aircraft CFD simulations
        in order to schedule tasks for load balanced execution on a
        heterogeneous distributed system. This problem has additional
        restrictions on how to partition the grid. Past work for this
        problem were on parallel systems with only few processor
        configurations. We derive heuristic algorithms for: (1)
        homogeneous systems with any number of processors; (2)
        heterogeneous systems taking into account the processor speed
        and memory capacity. We implement our algorithms on a dedicated
        network of workstations (using MPI) and test them with a CFD
        simulation code (TURNS-Transonic Unsteady Rotor Navier Stokes)."
}
@Article{JSVet2003,
       author   = "J. S. Vetter and F. Mueller",
       title    = "Communication characteristics of large-scale scientific applications for contemporary cluster architectures",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "9",
       pages    = "853--865",
       month    = SEP,
       year     = "2003",
       abstract = "This paper examines the explicit communication characteristics
        of several sophisticated scientific applications, which, by
        themselves, constitute a representative suite of publicly
        available benchmarks for large cluster architectures. By
        focusing on the message passing interface (MPI) and by using
        hardware counters on the microprocessor, we observe each
        application's inherent behavioral characteristics:
        point-to-point and collective communication, and floating-point
        operations. Furthermore, we explore the sensitivities of these
        characteristics to both problem size and number of processors.
        Our analysis reveals several striking similarities across our
        diverse set of applications including the use of collective
        operations, especially those collectives with very small data
        payloads. We also highlight a trend of novel applications parting
        with regimented, static communication patterns in favor of
        dynamically evolving patterns, as evidenced by our experiments
        on applications that use implicit linear solvers and adaptive
        mesh refinement. Overall, our study contributes a better
        understanding of the requirements of current and emerging
        paradigms of scientific computing in terms of their computation
        and communication demands."
}
@Article{FvanH2003,
       author   = "F. van Hees and A. J. van der Steen and P. J. van Leeuwen",
       title    = "A parallel data assimilation model for oceanographic observations",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "15",
       number   = "13",
       pages    = "1191--1204",
       month    = NOV,
       year     = "2003",
       abstract = "In this paper we describe the development of a program that aims
        at achieving the optimal integration of observed data in an
        oceanographic model describing the water transport phenomena in
        the Agulhas area at the tip of South Africa. Two parallel
        implementations, MPI and OpenMP, are described and experiments
        with respect to speed and scalability on a Compaq AlphaServer SC
        and an SGI Origin3000 are reported."
}
@Article{MBern2003,
       author   = "M. Bernaschi and G. Iannello and M. Lauria",
       title    = "Efficient implementation of reduce-scatter in {MPI}",
       journal  = "Journal of Systems Architecture",
       volume   = "49",
       number   = "3",
       pages    = "89--108",
       month    = AUG,
       year     = "2003",
       abstract = "Our study has been carried out on two different platforms: an
        SP2 and a Myrinet interconnected cluster of Pentium PRO.
        However, most of the results reported here are not specific for
        either MPI or the platforms used, and they hold in general for
        any message passing programming system."
}
@Article{FJSei2003,
       author   = "F. J. Seinstra and D. Koelma",
       title    = "Incorporating memory layout in the modeling of message passing programs",
       journal  = "Journal of Systems Architecture",
       volume   = "49",
       number   = "3",
       pages    = "109--121",
       month    = AUG,
       year     = "2003",
       abstract = "The effectiveness of the model is tested in a framework for
        automatic parallelization of image processing applications.
        Experiments are performed on two Beowulf-type commodity
        clusters, each having a different interconnection network, and a
        different MPI implementation. Results show that, where other
        models frequently fail, P-3PC correctly predicts the
        communication costs related to any type of domain decomposition."
}
@Article{LWLia2003,
       author   = "L. W. Li and Y. J. Wang and E. P. Li",
       title    = "{MPI}-based parallelized precorrected {FFT} algorithm for analyzing scattering by arbitrarily shaped three-dimensional objects - {A}bstract",
       journal  = "Journal of Electromagnetic Waves and Applications",
       volume   = "17",
       number   = "10",
       pages    = "1489--1491",
       year     = "2003"
}
 Parallel unsteady incompressible viscous flow computations using an
    unstructured multigrid method
      Tai, CH, JOURNAL OF COMPUTATIONAL PHYSICS, NOV 20 2003, 192, 1, pp.
      277-311.
 
      Tai, CH;Zhao, Y
 
      Nanyang Technol Univ/Singapore/Singapore/639798
      ANL libraries:   203
 
      Abstract:
        The development and validation of a parallel unstructured
        non-nested multigrid method for simulation of unsteady
        incompressible viscous flow is presented. The Navier-Stokes
        solver is based on the artificial compressibility approach and a
        higher-order characteristics-based finite-volume scheme on an
        unstructured multigrid. Unsteady flow is calculated with an
        implicit dual time-stepping scheme. The parallelization of the
        solver is achieved by multigrid domain decomposition approach
        (MG-DD), using the single program multiple data (SPMD)
        programming paradigm and message-passing interface (MPI)
        for communication of data. The parallel codes using single grid
        and multigrid are used to simulate steady and unsteady
        incompressible viscous flows over a circular cylinder for
        validation and performance evaluation purposes. The speedups and
        parallel efficiencies obtained by both the parallel single grid
        and multigrid solvers are reasonably good for both test cases,
        using up to 32 processors on the SGI Origin 2000. A maximum
        speedup of 12 could be achieved on 16 processors for
        high-Reynolds number unsteady viscous flow. The parallel results
        obtained were compared with those using serial single grid and
        multigrid codes and it remains the same as those obtained by
        serial solvers and agrees well with numerical solutions obtained
        by other researchers as well as experimental measurements. 

@Article{NKozi2003,
       author   = "N. Koziris and A. Sotiropoulos and G. Goumas",
       title    = "A pipelined schedule to minimize completion time for loop tiling with computation and communication overlapping",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "11",
       pages    = "1138--1151",
       month    = NOV,
       year     = "2003",
       abstract = "This paper proposes a new method for the problem of minimizing
        the execution time of nested for-loops using a tiling
        transformation. In our approach, we are interested not only in
        tile size and shape according to the required communication to
        computation ratio, but also in overall completion time. We
        select a time hyperplane to execute different tiles much more
        efficiently by exploiting the inherent overlapping between
        communication and computation phases among successive, atomic
        tile executions. We assign tiles to processors according to the
        tile space boundaries, thus considering the iteration space
        bounds. Our schedule considerably reduces overall completion
        time under the assumption that some part from every
        communication phase can be efficiently overlapped with atomic,
        pure tile computations. The overall schedule resembles a
        pipelined datapath where computations are not anymore
        interleaved with sends and receives to nonlocal processors. We
        survey the application of our schedule to modern communication
        architectures. We performed two sets of experimental results,
        one using MPI primitives over FastEthernet and one using the
        SISCI API over an SCI network. In both cases, the
        total completion time is significantly reduced."
}
@Article{MLian2003,
       author   = "M. Li and D. W. Walker and O. F. Rana and Y. Huang and P. T. Williams and R. C. Ward",
       title    = "Engineering high-performance legacy codes as {CORBA} components for problem-solving environments",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "63",
       number   = "11",
       pages    = "1152--1163",
       month    = NOV,
       year     = "2003",
       abstract = "This paper describes techniques used to leverage
        high-performance legacy codes as CORBA components to a
        distributed problem-solving environment. It first briefly
        introduces the software architecture adopted by the environment.
        Then it presents a CORBA oriented wrapper generator (COWG) which
        can be used to automatically wrap high-performance legacy codes
        as CORBA components. Two legacy codes have been wrapped with
        COWG. One is an MPI-based molecular dynamic simulation (MDS)
        code, the other is a finite element-based computational fluid
        dynamics (CFD) code for simulating incompressible Navier-Stokes
        flows. Performance comparisons between runs of the MDS CORBA
        component and the original MDS legacy code on a cluster of
        workstations and on a parallel computer are also presented.
        Wrapped as CORBA components, these legacy codes can be reused in
        a distributed computing environment. The first case shows that
        high-performance can be maintained with the wrapped MDS
        component. The second case shows that a Web user can submit a
        task to the wrapped CFD component through a Web page without
        knowing the exact implementation of the component. In this way,
        a user's desktop computing environment can be extended to a
        high-performance computing environment using a cluster of
        workstations or a parallel computer."
}
@Article{YChen2003,
       author   = "Y. Cheng and F. S. Lien and E. Yee and R. Sinclair",
       title    = "A comparison of large {E}ddy simulations with a standard k-epsilon {R}eynolds-averaged {N}avier-{S}tokes model for the prediction of a fully developed turbulent flow over a matrix of cubes",
       journal  = "Journal of Wind Engineering and Industrial Aerodynamics",
       volume   = "91",
       number   = "11",
       pages    = "1301--1328",
       month    = NOV,
       year     = "2003",
       abstract = "A fully developed turbulent flow over a matrix of cubes has been
        studied using the large Eddy simulation (LES) and
        Reynolds-averaged Navier-Stokes (RANS) [more specifically, the
        standard k-epsilon model] approaches. The numerical method used
        in LES of an incompressible fluid flow was a second-order
        accurate, fully conservative discretization scheme. This scheme
        was used in conjunction with a dynamic semi-coarsening multigrid
        method applied on a staggered grid as proposed originally by Ham
        et al. (Proceedings of the Seventh Annual Conference of the
        Computational Fluid Dynamics Society of Canada, Halifax, Nova
        Scotia, Canada, 1999; J. Comput. Phys. 177 (2002) 117).
        The effects of the unresolved subgrid scales in LES are modeled
        using three different subgrid-scale models: namely, the standard
        Smagorinsky model; the dynamic model with time-averaging
        procedure (DMT); and, the localized dynamic model (LDM). To
        reduce the computational time, LES calculations were conducted
        on a Linux-based PC cluster using the message passing interface
        library. RANS calculations were performed using the STREAM code
        of Lien and Leschziner (Comp. Meth. Appl. Mech. Eng. 114 (1994)
        123). The Reynolds number for the present flow simulations,
        based on the mean bulk velocity and the cube height, was 3800
        which is in accordance with the experimental data of Meinders
        (Ph.D. Thesis, Faculty of Applied Sciences, Delft University of
        Technology, Delft, Netherlands, 1998). A comparison of predicted
        model results for mean flow and turbulence with the
        corresponding experimental data showed that both the LES and
        RANS approaches were able to predict the main characteristics of
        the mean flow in the array of cubes reasonably well. LES,
        particularly when used with LDM, was found to perform much
        better than RANS in terms of its predictions of the spanwise mean
        velocity and Reynolds stresses. Flow structures in the proximity
        of a cube, such as separation at the sharp leading top and side
        edges of the cube, recirculation in front of the cube, and the
        arch-type vortex in the wake are captured by both the LES
        and RANS approaches. However, LES was found to give a better
        overall quantitative agreement with the experimental data than
        RANS."
}
@Article{JChee2003,
       author   = "J. Cheetham and F. Dehne and A. Rau-Chaplin and U. Stege and P. J. Taillon",
       title    = "Solving large {FPT} problems on coarse-grained parallel machines",
       journal  = "Journal of Computer and System Sciences",
       volume   = "67",
       number   = "4",
       pages    = "691--706",
       month    = DEC,
       year     = "2003",
       abstract = "Fixed-parameter tractability (FPT) techniques have recently been
        successful in solving NP-complete problem instances of practical
        importance which were too large to be solved with previous
        methods. In this paper, we show how to enhance this approach
        through the addition of parallelism, thereby allowing even
        larger problem instances to be solved in practice. More
        precisely, we demonstrate the potential of parallelism when
        applied to the bounded-tree search phase of FPT algorithms. We
        apply our methodology to the k-VERTEX COVER problem which has
        important applications in, for example, the analysis of multiple
        sequence alignments for computational biochemistry. We have
        implemented our parallel FPT method for the k-VERTEX COVER
        problem using C and the MPI communication library, and tested it
        on a 32-node Beowulf cluster. This is the first experimental
        examination of parallel FPT techniques. As part of our
        experiments, we solved larger instances of k-VERTEX COVER than
        in any previously reported implementations. For example, our
        code can solve problem instances with k greater than or equal
        to 400 in less than 1.5 h."
}
@Article{FWolf2003,
       author   = "F. Wolf and B. Mohr",
       title    = "Automatic performance analysis of hybrid {MPI}/{O}pen{MP} applications",
       journal  = "Journal of Systems Architecture",
       volume   = "49",
       number   = "10--11",
       pages    = "421--439",
       month    = NOV,
       year     = "2003",
       abstract = "The EXPERT performance-analysis environment provides a complete
        tracing-based solution for automatic performance analysis of
        MPI, OpenMP, or hybrid applications running on parallel
        computers with SMP nodes. EXPERT describes performance problems
        using a high level of abstraction in terms of execution patterns
        that result from an inefficient use of the underlying
        programming model(s). The set of predefined problems can be
        extended to meet application-specific needs. The analysis is
        carried out along three interconnected dimensions: class of
        performance behavior, call tree, and thread of execution. Each
        dimension is arranged in a hierarchy so that the user can
        investigate the behavior on varying levels of detail. All three
        dimensions are interactively accessible using a single
        integrated view."
}
@Article{FChan2003,
       author   = "F. Chan and J. N. Cao and Y. D. Sun",
       title    = "High-level abstractions for message-passing parallel programming",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "11--12",
       pages    = "1589--1621",
       month    = NOV-DEC,
       year     = "2003",
       abstract = "Large-scale scientific and engineering computation problems are
        usually complex and consequently the development of parallel
        programs for solving these problems is a difficult task. In this
        paper, we describe the graph-oriented programming (GOP) model
        and environment for building and evaluating parallel
        applications. The GOP model provides higher level abstractions
        for message-passing parallel programming and the software
        environment offers tools which can ease programmers for
        parallelizing, writing, and deploying scientific and engineering
        computing applications. We discuss the motivations and various
        issues in developing the model and the software environment,
        present the design of the system architecture and the
        components, and describe the evaluation of the environment
        implemented on top of MPI with a sample parallel scientific
        application program. With the support of the high-level
        abstractions provided by the proposed GOP environment,
        programming of parallel applications on various parallel
        architectures can be greatly simplified."
}
@Article{MJMar2003,
       author   = "M. J. Mart{\'\i}n and D. E. Singh and J. C. Mouri{\~n}o and F. F. Rivera and R. Doallo and J. D. Bruguera",
       title    = "High performance air pollution modeling for a power plant environment",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "11--12",
       pages    = "1763--1790",
       month    = NOV-DEC,
       year     = "2003",
       abstract = "The aim of this work is to provide a high performance air
        quality simulation using the STEM-II (Sulphur Transport Eulerian
        Model 2) program, a large-scale pollution modeling application.
        First, we optimize the sequential program with the aim of
        increasing data locality. Then, we parallelized the program
        using OpenMP directives for shared memory systems, and the MPI
        library for distributed memory machines. Performance results are
        presented for a SGI O2000 multiprocessor, a Fujitsu AP3000
        multicomputer and a Cluster of PCs. Experimental results show
        that the parallel versions of the code achieve important
        reductions in the CPU time needed by each simulation. This will
        allow us to obtain results with adequate speed and reliability
        for the industrial environment where it is intended to be
        applied."
}
@Article{BVRKu2003,
       author   = "B. V. R. Kumar and A. Quarteroni and L. Formaggia and D. Lamponi",
       title    = "On parallel computation of blood flow in human arterial network based on 1-{D} modelling",
       journal  = "Computing",
       volume   = "71",
       number   = "4",
       pages    = "321--351",
       month    = NOV,
       year     = "2003",
       abstract = "In this study, parallel computation of blood flow in a 1-D model
        of human arterial network has been carried out employing a
        Taylor Galerkin Finite Element Method. Message passing interface
        libraries have been used on Origin 2000 SGI machine. A Greedy
        strategy for load-distribution has been devised and data-flow
        graphs necessary for parallelization have been generated. The
        performance of parallel implementation measured in terms of
        speedup and efficiency factors is found to be good. Further, the
        parallel code is used in simulating the propagation of pressure
        and velocity waveforms in our 1-D arterial model for two
        different inflow pressure pulses. Also, the influence of
        consideration of terminal resistance on pressure and velocity
        waveforms have been analyzed."
}
@Article{ABalo2003,
       author   = "A. Baloch and M. F. Webster",
       title    = "Distributed parallel computation for complex rotational flows of non-{N}ewtonian fluids",
       journal  = "International Journal for Numerical Methods in Fluids",
       volume   = "43",
       number   = "10--11",
       pages    = "1301--1328",
       month    = DEC,
       year     = "2003",
       abstract = "Complex rotational flows of non-Newtonian fluids are simulated
        through finite element methods. The predictions have direct
        relevance to dough kneading, associated with the food industry.
        The context is taken as two-dimensional and one of stirring
        material within a cylindrical vessel. Three stirrer shapes are
        considered, placed in eccentric location with respect to the
        cylinder centre. The motion is driven by the rotation of the
        outer vessel wall. Variation with change in rheology and change
        in stirrer shapes are analysed, with respect to flow kinematics,
        stress fields, rate-of-work and power consumed. Computations are
        performed for Newtonian, shear-thinning and viscoelastic fluids,
        at various viscosity levels to gradually approximate more
        realistic dough-like response. For viscoelastic fluids,
        Phan-Thien/Tanner constitutive models are adopted. The numerical
        method employed is based on a finite element semi-implicit
        time-stepping Taylor-Galerkin/pressure-correction scheme, posed
        in a cylindrical polar co-ordinate system. Simulations are
        conducted via distributed parallel processing, performed on a
        networked cluster of workstations, employing message passing.
        Parallel performance timings are compared against those obtained
        working in sequential mode. Ideal linear speed-up with the
        number of processors is observed for viscoelastic flows under
        this coarse-grained implementation."
}
@Article{YIwam2003,
       author   = "Y. Iwamoto and K. Suga and K. Ootsu and T. Yokota and T. Baba",
       title    = "Receiving message prediction method",
       journal  = "Parallel Computing",
       volume   = "29",
       number   = "11--12",
       pages    = "1509--1538",
       month    = NOV-DEC,
       year     = "2003",
       abstract = "This paper proposes and evaluates the Receiving Message
        Prediction Method for high performance message passing. In this
        method, a node in the idle state predicts the next message
        reception, and speculatively executes the message reception and
        user processes. This method is independent of underlying
        computer architecture and message passing libraries. We propose
        the algorithms for the message prediction, and evaluate them
        from the viewpoint of the success ratio and speed-ups. We use the
        NAS parallel benchmark programs as typical parallel applications
        running on two different types of parallel platforms, i.e., a
        workstation cluster and a shared memory multiprocessor. The
        experimental results show that the method can be applied to
        various platforms. The method can also be implemented just by
        changing the software inside their message passing libraries
        without any support from the underlying system software or
        hardware. This means that we do not require any change of
        application software that uses the libraries. The application of
        the method to the message passing interface libraries achieves a
        speed-up of 6.8\% for the NAS Parallel Benchmarks, and the static
        and dynamic selection of prediction methods based on profiling
        results improve the performance."
}
@Article{JSmit2003,
       author   = "J. Smith and A. Gounaris and P. Watson and N. W. Paton and A. A. A. Fernandes and R. Sakellariou",
       title    = "Distributed query processing on the grid",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "17",
       number   = "4",
       pages    = "353--367",
       month    = WIN,
       year     = "2003",
       abstract = "Distributed query processing (DQP) has been widely used in data
        intensive applications where data of relevance to users are
        stored at multiple locations. This paper argues: (i) that DQP
        can be important in the Grid, as a means of providing
        high-level, declarative languages for integrating data access
        and analysis; and (ii) that the Grid provides resource
        management facilities that are useful to developers of DQP
        systems. As well as discussing and illustrating how DQP
        technologies can be deployed within the Grid, the paper
        describes Polar*, a prototype implementation of a DQP system
        running over Globus. Polar* can handle complex data by adopting
        the ODMG object model and its query language OQL, which supports
        the invocation of user-defined operations. The Globus components
        are accessed through the MPICH-G interface rather than in a lower
        level way. A case study from bioinformatics is used throughout
        the paper, to show the benefits of the approach."
}

@Article{RAFor2004,
       author   = "R. A. Forster and L. J. Cox and R. F. Barrett and T. E. Booth and J. F. Briesmeister and F. B. Brown and others",
       title    = "{MCNP} ({TM}) {V}ersion 5",
       journal  = "Nuclear Instruments \& Methods in Physics Research Section B-Beam Interactions with Materials and Atoms",
       volume   = "213",
       pages    = "82--86",
       month    = JAN,
       year     = "2004",
       abstract = "The Monte Carlo transport workhorse, MCNP [Los Alamos National
        Laboratory report LA-13709-M, 2000], is undergoing a massive
        renovation at Los Alamos National Laboratory (LANL) in support
        of the Eolus Project of the Advanced Simulation and Computing
        (ASCI) Program. MCNP 1 Version 5 (V5) (expected to be released
        to RSICC in Fall 2002) will consist of a major restructuring
        from FORTRAN-77 (with extensions) to ANSI-standard FORTRAN90
        [American National Standard for Programming Language -
        Fortran-Extended, ANSI X3.198-1992, 1992] with support for all
        of the features available in the present release (MCNP-4C2/4C3).
        To most users, the look-and-feel of MCNP will not change much
        except for the improvements (improved graphics, easier
        installation, better online documentation). For example, even
        with the major format change, full support for incremental
        patching will still be provided. In addition to the language and
        style updates, MCNP V5 will have various new user features.
        These include improved photon physics, neutral particle
        radiography, enhancements and additions to variance reduction
        methods, new source options, improved parallelism support (PVM,
        MPI, OpenMP), and new nuclear and atomic data libraries."
}

@Article{GBron2003,
       author   = "G. Bronevetsky and D. Marques and K. Pingali and P. Stodghill",
       title    = "Automated application-level checkpointing of {MPI} programs",
       journal  = "ACM Sigplan Notices",
       volume   = "38",
       number   = "10",
       pages    = "84--94",
       month    = OCT,
       year     = "2003",
       abstract = "We then present a suitable protocol, which is implemented by a
        co-ordination layer that sits between the application program
        and the MPI library. We show how this protocol can be used with
        a precompiler that instruments C/MPI programs to save application
        and MPI library state. An advantage of our approach is that it
        is independent of the MPI implementation. We present
        experimental results that argue that the overhead of using our
        system can be small."
}
@Article{AKarw2003,
       author   = "A. Karwande and X. Yuan and D. K. Lowenthal",
       title    = "C{C}-{MPI}: {A} compiled communication capable {MPI} prototype for ethernet switched clusters",
       journal  = "ACM Sigplan Notices",
       volume   = "38",
       number   = "10",
       pages    = "95--106",
       month    = OCT,
       year     = "2003",
       abstract = "Compiled communication has recently been proposed to improve
        communication performance for clusters of workstations. The idea
        of compiled communication is to apply more aggressive
        optimizations to communications whose information is known at
        compile time. Existing MPI libraries do not support compiled
        communication. In this paper, we present an MPI prototype,
        CC-MPI, that supports compiled communication on Ethernet
        switched clusters. The unique feature of CC-MPI is that it
        allows the user to manage network resources such as multicast
        groups directly and to optimize communications based on the
        availability of the communication information. CC-MPI optimizes
        one-to-all, one-to-many, all-to-all, and many-to-many collective
        communication routines using the compiled communication
        technique. We describe the techniques used in CC-MPI and report
        its performance. The results show that communication performance
        of Ethernet switched clusters can be significantly improved
        through compiled communication."
}
@Article{SJDei2003,
       author   = "S. J. Deitz and B. L. Chamberlain and S. E. Choi and L. Snyder",
       title    = "The design and implementation of a parallel array operator for the arbitrary remapping of data",
       journal  = "ACM Sigplan Notices",
       volume   = "38",
       number   = "10",
       pages    = "154--165",
       month    = OCT,
       year     = "2003",
       abstract = "Gather and scatter are data redistribution functions of
        long-standing importance to high performance computing. In this
        paper, we present a highly-general array operator with powerful
        gather and scatter capabilities unmatched by other array
        languages. We discuss an efficient parallel
        implementation, introducing three new optimizations---schedule
        compression, dead array reuse, and direct communication---that
        reduce the costs associated with the operator's wide
        applicability. In our implementation of this operator in ZPL, we
        demonstrate performance comparable to the hand-coded Fortran +
        MPI versions of the NAS FT and CG benchmarks."
}
@Article{SSaun2003,
       author   = "S. Saunders and L. Rauchwerger",
       title    = "A{RMI}: {A}n adaptive, platform independent communication library",
       journal  = "ACM Sigplan Notices",
       volume   = "38",
       number   = "10",
       pages    = "229--240",
       month    = OCT,
       year     = "2003",
       abstract = "ARMI is a communication library that provides a framework for
        expressing fine-grain parallelism and mapping it to a particular
        machine using shared-memory and message passing library calls.
        The library is an advanced implementation of the RMI protocol
        and handles low-level details such as scheduling incoming
        communication and aggregating outgoing communication to coarsen
        parallelism when necessary. These details can be tuned for
        different platforms to allow user codes to achieve the highest
        performance possible without manual modification. ARMI is used
        by STAPL, our generic parallel library, to provide a portable,
        user transparent communication layer. We present the basic
        design as well as the mechanisms used in the current
        Pthreads/OpenMP, MPI implementations and/or a combination
        thereof. Performance comparisons between ARMI and explicit use
        of Pthreads or MPI are given on a variety of machines, including
        an HP V2200, SGI Origin 3800, IBM Regatta-HPC and IBM RS6000 SP
        cluster."
}
@Article{GRLue2004,
       author   = "G. R. Luecke and M. Kraeva and J. Yuan and S. Spanoyannis",
       title    = "Performance and scalability of {MPI} on {PC} clusters",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "1",
       pages    = "79--107",
       month    = JAN,
       year     = "2004",
       abstract = "The purpose of this paper is to compare the communication
        performance and scalability of MPI communication routines on a
        Windows Cluster, a Linux Cluster, a Cray T3E-600, and an SGI
        Origin 2000. All tests in this paper were run using various
        numbers of processors and two message sizes. In spite of the
        fact that the Cray T3E-600 is about 7 years old, it performed
        best of all machines for most of the tests. The Linux Cluster
        with the Myrinet interconnect and Myricom's MPI performed and
        scaled quite well and, in most cases, performed better than the
        Origin 2000, and in some cases better than the T3E. The Windows
        Cluster using the Giganet Full Interconnect and MPI/Pro's MPI
        performed and scaled poorly for small messages compared with all
        of the other machines."
}
@Article{ZYina2004,
       author   = "Z. Yin and H. J. H. Clercx and D. C. Montgomery",
       title    = "An easily implemented task-based parallel scheme for the {F}ourier pseudospectral solver applied to 2{D} {N}avier-{S}tokes turbulence",
       journal  = "Computers \& Fluids",
       volume   = "33",
       number   = "4",
       pages    = "509--520",
       month    = MAY,
       year     = "2004",
       abstract = "An efficient parallel scheme is proposed for performing direct
        numerical simulation (DNS) of two-dimensional Navier-Stokes
        turbulence at high Reynolds numbers. We illustrate the resulting
        numerical code by displaying relaxation to states close to those
        that have been predicted by statistical-mechanical methods which
        start from ideal (Euler) fluid mechanics. The validation of
        these predictions by DNS requires unusually long computation
        times on single-cpu workstations, and suggests the use of
        parallel computation. The performance of our MPI Fortran 90 code
        on the SGI Origin 3800 is reported, together with its comparison
        with another parallel method. A few computational results that
        illustrate tests of the statistical-mechanical predictions are
        presented."
}
@article{LWLia42,
  author   = {L. W. Li and Y. J. Wang and E. P. Li},
  title    = {M{PI}-based parallelized precorrected {FFT} algorithm for analyzing    scattering by arbitrarily shaped three-dimensional objects},
  journal  = {Electromagnetic Waves},
  volume   = 42,
  pages    = {247--259},
  year     = 2003,
  abstract = {(none available)}
}
@Article{PThul2004,
       author   = "P. Thulasiraman and A. A. Khokhar and G. Heber and G. R. Gao",
       title    = "A fine-grain load-adaptive algorithm of the 2{D} discrete wavelet transform for multithreaded architectures",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "64",
       number   = "1",
       pages    = "68--78",
       month    = JAN,
       year     = "2004",
       abstract = "In this paper we develop a load-adaptive multithreaded algorithm
        to compute 2D Discrete Wavelet Transform (DWT) and its
        implementation on a fine-grain multithreading platform. In a 2D
        DWT computation, the problem sizes reduces at every
        decomposition level and the length of the emerging computation
        paths also vary. The parallel algorithm proposed in this paper,
        dynamically scales itself to the varying problem size. During
        any iteration, the ratio of the number of local threads to the
        number of remote threads issued by a processor can be adjusted
        to be greater than 1 by controlling the algorithm parameters.
        This approach provides an opportunity to interleave computation
        and communication without explicitly introducing idle cycles on
        waiting for the remote threads to finish. Experimental results
        are reported based on the implementations of the proposed
        algorithm on a 20 node emulated multithreaded platform,
        EARTH-MANNA, specifically designed for fine-grain multithreaded
        paradigms. We show that multithreading implementations of the
        proposed algorithm are at least 2 times faster than the
        MPI-based message passing implementations reported in the
        literature, assuming the same processor speed. We further show
        that the proposed algorithm and implementations scale linearly
        with respect to problem and machine sizes."
}
@Article{JDubi2004,
       author   = "J. Dubinski and J. Kim and C. Park and R. Humble",
       title    = "G{OTPM}: a parallel hybrid particle-mesh treecode",
       journal  = "New Astronomy",
       volume   = "9",
       number   = "2",
       pages    = "111--126",
       month    = FEB,
       year     = "2004",
       abstract = "We describe a parallel, cosmological N-body code based on a
        hybrid scheme using the particle-mesh (PM) and Barnes-Hut (BH)
        oct-tree algorithm. We call the algorithm GOTPM for
        Grid-of-Oct-Trees-Particle-Mesh. The code is parallelized using
        the Message Passing Interface (MPI) library and is optimized to
        run on Beowulf clusters as well as symmetric multi-processors.
        The gravitational potential is determined on a mesh using a
        standard PM method with particle forces determined through
        interpolation. The softened PM force is corrected for short
        range interactions using a grid of localized BH trees throughout
        the entire simulation volume in a completely analogous way to
        P$^3$M methods. This method makes no assumptions about the
        local density for short range force corrections and so is
        consistent with the results of the P$^3$M method in the limit
        that the treecode opening angle parameter, $\theta \to 0$. The PM
        method is parallelized using one-dimensional slice domain
        decomposition. Particles are distributed in slices of equal
        width to allow mass assignment onto mesh points. The Fourier
        transforms in the PM method are done in parallel using the MPI
        implementation of the FFTW package. Parallelization for the tree
        force corrections is achieved again using one-dimensional slices
        but the width of each slice is allowed to vary according to the
        amount of computational work required by the particles within
        each slice to achieve load balance. The tree force corrections
        dominate the computational load and so imbalances in the PM
        density assignment step do not impact the overall load balance
        and performance significantly. The code performance scales well
        to 128 processors and is significantly better than competing
        methods. We present preliminary results from simulations run on
        different platforms containing up to N=1G particles to verify
        the code."
}
@Article{SGorl2004,
       author   = "S. Gorlatch",
       title    = "Send-receive considered harmful: {M}yths and realities of message passing",
       journal  = "ACM Transactions on Programming Languages and Systems",
       volume   = "26",
       number   = "1",
       pages    = "47--56",
       month    = JAN,
       year     = "2004",
       abstract = {During the software crisis of the 1960s, Dijkstra's famous
        thesis ``goto considered harmful'' paved the way for structured
        programming. This short communication suggests that many current
        difficulties of parallel programming based on message passing
        are caused by poorly structured communication, which is a
        consequence of using low-level send-receive primitives. We argue
        that, like goto in sequential programs, send-receive should be
        avoided as far as possible and replaced by collective operations
        in the setting of message passing. We dispute some widely held
        opinions about the apparent superiority of pairwise
        communication over collective communication and present
        substantial theoretical and empirical evidence to the contrary
        in the context of MPI (Message Passing Interface).}
}
@Article{CWKes2004,
       author   = "C. W. Kessler",
       title    = "Managing distributed shared arrays in a bulk-synchronous parallel programming environment",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "2--3",
       pages    = "133--153",
       month    = FEB-MAR,
       year     = "2004",
       abstract = "NestStep is a parallel programming language for the BSP
        (bulk-synchronous parallel) programming model. In this article we
        describe the concept of distributed shared arrays in NestStep
        and its implementation on top of MPI. In particular, we present
        a novel method for runtime scheduling of irregular, direct
        remote accesses to sections of distributed shared arrays. Our
        method, which is fully parallelized, uses conventional two-sided
        message passing and thus avoids the overhead of a standard
        implementation of direct remote memory access based on one-sided
        communication. The main prerequisite is that the given program
        is structured in a BSP-compliant way."
}
@Article{SBenk2004,
       author   = "S. Benkner and T. Brandes",
       title    = "Compiling data-parallel programs for clusters of {SMP}s",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "2--3",
       pages    = "111--132",
       month    = FEB-MAR,
       year     = "2004",
       abstract = "Clusters of shared-memory multiprocessors (SMPs) have become the
        most promising parallel computing platforms for scientific
        computing. However, SMP clusters significantly increase the
        complexity of user application development when using the
        low-level application programming interfaces MPI and OpenMP,
        forcing users to deal with both distributed-memory and
        shared-memory parallelization details. In this paper we present
        extensions of High Performance Fortran (HPF) for SMP clusters
        which enable the compiler to adopt a hybrid parallelization
        strategy, efficiently combining distributed-memory with
        shared-memory parallelism. By means of a small set of new
        language features, the hierarchical structure of SMP clusters
        may be specified. This information is utilized by the compiler
        to derive inter-node data mappings for controlling
        distributed-memory parallelization across the nodes of a cluster
        and intra-node data mappings for extracting shared-memory
        parallelism within nodes. Additional mechanisms are proposed for
        specifying inter- and intra-node data mappings explicitly, for
        controlling specific shared-memory parallelization issues and
        for integrating OpenMP routines in HPF applications. The
        proposed features have been realized within the ADAPTOR and VFC
        compilers. The parallelization strategy for clusters of SMPs
        adopted by these compilers is discussed as well as a
        hybrid-parallel execution model based on a combination of MPI and
        OpenMP. Experimental results indicate the effectiveness of the
        proposed features."
}
@Article{TRaub2004,
       author   = "T. Rauber and R. Reilein and G. R{\"u}nger",
       title    = "Group-{SPMD} programming with orthogonal processor groups",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "2--3",
       pages    = "173--195",
       month    = FEB-MAR,
       year     = "2004",
       abstract = "Many programs for message-passing machines can benefit from an
        implementation in a group-SPMD programming model due to the
        potential to reduce communication overhead and to increase
        scalability. In this paper, we consider group-SPMD programs
        exploiting different orthogonal processor partitions in one
        program. For each program this is a fixed set of predefined
        processor partitions given by the parallel hyperplanes of a two-
        or multi-dimensional virtual processor organization. We
        introduce a library built on top of MPI to support the
        programming with those orthogonal processor groups. The parallel
        programming model is appropriate for applications with a
        multi-dimensional task grid and task dependencies mainly aligned
        in the dimensions of the task grid. The library can be used to
        specify the appropriate processor partitions, which are then
        created by the library, and to define the mapping of tasks to
        the processor hyperplanes. Examples from numerical analysis
        illustrate the programming style and show that the runtime on
        distributed memory machines can be considerably reduced by using
        the library."
}
@Article{DJQui2004,
       author   = "D. J. Quinlan and M. Schordan and B. Miller and M. Kowarschik",
       title    = "Parallel object-oriented framework optimization",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "2--3",
       pages    = "293--302",
       month    = FEB-MAR,
       year     = "2004",
       abstract = "Sophisticated parallel languages are difficult to develop; most
        parallel distributed memory scientific applications are
        developed using a serial language, expressing parallelism
        through third party libraries (e.g. MPI). As a result,
        frameworks and libraries are often used to encapsulate
        significant complexities. We define a novel approach to optimize
        the use of libraries within applications. The resulting tool,
        named ROSE, leverages the additional semantics provided by
        library-defined abstractions enabling library specific
        optimization of application codes. It is a common perception
        that performance is inversely proportional to the level of
        abstraction. Our work shows that this is not the case if the
        additional semantics can be leveraged. We show how ROSE can be
        used to leverage the semantics within the compile-time
        optimization."
}
@Article{PRao2004,
       author   = "P. Rao",
       title    = "A parallel hydrodynamic model for shallow water equations",
       journal  = "Applied Mathematics and Computation",
       volume   = "150",
       number   = "1",
       pages    = "291--302",
       month    = FEB,
       year     = "2004",
       abstract = "A parallel implementation of a finite difference model for
        solving two-dimensional, time-dependent, open channel flows is
        presented. The algebraic equations resulting from the finite
        difference discretization of the two dimensional shallow water
        flow equations are solved by using explicit MacCormack scheme.
        The parallel code has been implemented on distributed-shared
        memory system, by using domain decomposition techniques. The
        message passing interface (MPI) protocols are incorporated for
        inter processor data communication. The effect of using two
        different geometry partitions is investigated. A comparison of
        the wallclock time of the code between these two partitions is
        made, and code performances with respect to different number of
        processors are presented."
}

@Article{PFLiu2003,
       author   = "P. F. Liu and K. Li",
       title    = "Performance analysis of a {BiCGSTAB} solver for multiple-marine-propeller simulation with several {MPI} libraries and platforms",
       journal  = "High Performance Scientific and Engineering Computing: Hardware/Software Support",
       volume   = "750",
       pages    = "63--78",
       year     = "2003"
}

@Article{AJian2004,
       author   = "A. Jiang and S. Y. Shi and G. Jin and D. W. Prather",
       title    = "Performance analysis of three dimensional high index contrast dielectric waveguides",
       journal  = "Optics Express",
       volume   = "12",
       number   = "4",
       pages    = "633--643",
       month    = FEB,
       year     = "2004",
       abstract = "This paper presents the implementation of a parallelized
        Finite-Difference Time-Domain method, based on the Message
        Passing Interface (i.e. MPI), which is used to study the modal
        properties of three-dimensional (3D) dielectric waveguide
        structures. To this end, we also use the least-square method
        to obtain the wave vector, beta, along the axis of propagation.
        Lastly, bending losses in arbitrary-angle waveguides are also
        discussed."
}
@Article{GAlte2004,
       author   = "G. Altekar and S. Dwarkadas and J. P. Huelsenbeck and F. Ronquist",
       title    = "Parallel {M}etropolis coupled {M}arkov chain {M}onte {C}arlo for {B}ayesian phylogenetic inference",
       journal  = "Bioinformatics",
       volume   = "20",
       number   = "3",
       pages    = "407--415",
       month    = FEB,
       year     = "2004",
       abstract = "Results: This paper presents a parallel algorithm for (MC)$^3$.
        The proposed parallel algorithm retains the ability to explore
        multiple peaks in the posterior distribution of trees while
        maintaining a fast execution time. The algorithm has been
        implemented using two popular parallel programming models:
        message passing and shared memory. Performance results indicate
        nearly linear speed improvement in both programming models for
        small and large datasets."
}
@Article{AQCui2004,
       author   = "A. Q. Cui and R. L. Street",
       title    = "Large-eddy simulation of coastal upwelling flow",
       journal  = "Environmental Fluid Mechanics",
       volume   = "4",
       number   = "2",
       pages    = "197--223",
       month    = JUN,
       year     = "2004",
       abstract = "Large-eddy simulations were carried out to study
        laboratory-scale realizations of coastal upwelling in an annular
        rotating tank with a sloping bottom. A two-layer stratified fluid
        was set into rigid body motion with the tank and then driven by
        the relative rotation of a solid top lid. The simulation code
        developed in this work was a three-dimensional incompressible
        Navier-Stokes solver using the message passing interface. The
        simulation runs were performed on a distributed memory massively
        parallel computer, namely, the IBM SP2. The simulation results
        were able to reveal the evolution of the complex upwelling
        structures in detail. The results were used to compare with and
        to complement two relevant series of coastal upwelling
        experiments. A Rayleigh-Taylor type of instability took place in
        the top inversion layer due to the unstable stratification after
        establishment of the upwelling front. The primary upwelling
        front was unstable to azimuthal perturbations and developed large
        amplitude baroclinic waves. The frontal wave structure consists
        of cyclone/anticyclone pairs. Whether cyclonic eddies containing
        the lower-layer fluid pinch off from the front depends on the
        theta(*) value. The non-dimensional parameter theta(*) = g(1)
        h(0)/u(*) f lambda(s), which was first introduced by Narimousa and
        Maxworthy, combines the effects of stratification, rotation and
        surface stress and can be used to characterize the upwelling
        flow field. Our studies show that the frontal instabilities are
        much more intense and the upwelling front itself displays strong
        unsteadiness and cyclonic eddies containing the lower-layer
        fluid pinch off from the front when theta(*) is significantly
        less than 5.8. For theta(*) = 5.8, the frontal instabilities are
        less intense and no pinched-off process is observed. To separate
        these regimes, a critical value of theta(*) of about 5.4
        is consistent with Narimousa and Maxworthy's results."
}
@Article{BKalu2004,
       author   = "B. Kaludercic",
       title    = "Parallelisation of the {L}agrangian model in a mixed {E}ulerian-{L}agrangian {CFD} algorithm",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "64",
       number   = "2",
       pages    = "277--284",
       month    = FEB,
       year     = "2004",
       abstract = "This manuscript presents an algorithm implemented in a
        commercial computational fluid dynamics (CFD) code for
        parallelisation of the Lagrangian particle tracking model in a
        mixed Eulerian-Lagrangian CFD algorithm. The algorithm is based
        on the domain decomposition parallelisation strategy and
        asynchronous message passing protocol. The methodology is tested
        on two industrial CFD test cases and the parallelisation results
        are presented. Further, it is discussed how the parallel
        efficiency of the runs can be improved by adopting the domain
        decomposition scattering technique."
}
@Article{ACBur2004,
       author   = "A. C. Burt and I. B. Celik and R. S. Gemmen and A. V. Smirnov",
       title    = "A numerical study of cell-to-cell variations in a {SOFC} stack",
       journal  = "Journal of Power Sources",
       volume   = "126",
       number   = "1--2",
       pages    = "76--87",
       month    = FEB,
       year     = "2004",
       abstract = "A numerical investigation of cell-to-cell voltage variation is
        performed by considering the impact of flow distribution and heat
        transfer on a SOFC stack. The stack model used is based on a
        one-dimensional co-flow cell model developed in prior work. The
        influence of radiative heat transfer between the PEN (positive
        electrode, electrolyte, negative electrode body) and the
        neighboring separator plates on the temperature distribution is
        also considered. Variations in cell voltage are attributed to
        asymmetries in stack geometry (boundary effects) and
        non-uniformity in flow rates, more particularly, flow thermal
        capacity. Simulations were done in a parallel computing
        environment with each cell computed in a separate (CPU) process.
        This natural decomposition of the fuel cell stack reduced the
        number of communicated variables thereby improving computational
        performance. The parallelization scheme implemented utilized a
        message passing interface (MPI) protocol where cell-to-cell
        communication is achieved via exchange of temperature and
        thermal fluxes between neighboring cells."
}
@Article{SCDon2004,
       author   = "S. C. Dong and G. E. Karniadakis",
       title    = "Dual-level parallelism for high-order {CFD} methods",
       journal  = "Parallel Computing",
       volume   = "30",
       number   = "1",
       pages    = "1--20",
       month    = JAN,
       year     = "2004",
       abstract = "A hybrid two-level parallel paradigm with MPI/OpenMP is
        presented in the context of high-order methods and implemented
        in the spectral/hp element framework to take advantage of the
        hierarchical structures arising from deterministic and
        stochastic CFD problems. We take a coarse grain approach to
        OpenMP shared-memory parallelization and employ a
        workload-splitting scheme that reduces the OpenMP
        synchronizations to the minimum. The hybrid algorithm shows good
        scalability with respect to both the problem size and the
        number of processors for a fixed problem size. For the same
        number of processors, the hybrid model with 2 OpenMP threads per
        MPI process is observed to perform better than pure MPI and pure
        OpenMP on the SGI Origin 2000 and the Intel IA64 Cluster, while
        the pure MPI model performs the best on the IBM SP3 and on the
        Compaq Alpha Cluster. A key new result is that the use of
        threads facilitates effectively p-refinement, which is crucial
        to adaptive discretization using high-order methods."
}
@Article{AVGer2004,
       author   = "A. V. Gerbessiotis",
       title    = "Architecture independent parallel binomial tree option price valuations",
       journal  = "Parallel Computing",
       volume   = "30",
       number   = "2",
       pages    = "301--316",
       month    = FEB,
       year     = "2004",
       abstract = "We introduce an architecture independent approach in describing
        how computations such as those involved in American or
        European-style option price valuations can be performed in
        parallel under the binomial tree model. We describe a
        latency-tolerant parallel algorithm for the multiplicative
        binomial tree option pricing model. The algorithm is described
        and analyzed in an architecture independent setting and
        performance characteristics are expressed in terms of problem
        size n, the time horizon, and the parameters p, L and g of the
        bulk-synchronous parallel model of computation. The algorithm
        achieves optimal theoretical speedup and is within a 1 + o(1)
        multiplicative factor of the corresponding sequential method. An
        experimental study of an implementation of the algorithm on a
        cluster of PC workstations is also undertaken to examine the
        latency-tolerance of our approach. The implementation with only a
        recompilation of the same source code works under two diverse
        parallel programming libraries namely, MPI and BSPlib, thus
        making it not only architecture but also communication library
        independent."
}
@Article{VBlan2004,
       author   = "V. Blanco and P. Gonzalez and J. C. Cabaleiro and D. B. Heras and T. F. Pena and J. J. Pombo and F. F. Rivera",
       title    = "Performance prediction for parallel iterative solvers",
       journal  = "Journal of Supercomputing",
       volume   = "28",
       number   = "2",
       pages    = "177--191",
       month    = MAY,
       year     = "2004",
       abstract = "In this paper, an exhaustive parallel library of sparse
        iterative methods and preconditioners in HPF and MPI was
        developed, and a model for predicting the performance of these
        codes is presented. This model can be used both by users and by
        library developers to optimize the efficiency of the codes, as
        well as to simplify their use. The information offered by this
        model combines theoretical features of the methods and
        preconditioners in addition to certain practical considerations
        and predictions about aspects of the performance of their
        execution in distributed memory multiprocessors."
}

@Article{SSVad2004,
       author   = "S. S. Vadhiyar and G. E. Fagg and J. J. Dongarra",
       title    = "Towards an accurate model for collective communications",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "1",
       pages    = "159--167",
       month    = SPR,
       year     = "2004",
       abstract = "The performance of the MPI's collective communications is
        critical in most MPI-based applications. A general algorithm for
        a given collective communication operation may not give good
        performance on all systems due to the differences in
        architectures, network parameters and the storage capacity of
        the underlying MPI implementation. Hence, collective
        communications have to be tuned for the system on which they
        will be executed. In order to determine the optimum parameters
        of collective communications on a given system in a
        time-efficient manner, the collective communications need to be
        modeled efficiently. In this paper, we discuss various
        techniques for modeling collective communications."
}

@Article{PAmic2004,
       author   = "P. Amico and L. Bosi and C. Cattuto and L. Gammaitoni and M. Punturo and F. Travasso and H. Vocca",
       title    = "A computational test facility for distributed analysis of gravitational wave signals",
       journal  = "Classical and Quantum Gravity",
       volume   = "21",
       number   = "5",
       pages    = "S847--S851",
       month    = MAR,
       year     = "2004",
       abstract = "In the gravitational wave detector Virgo, the in-time detection
        of a gravitational wave signal from a coalescing binary stellar
        system is an intensive computational task. A parallel computing
        scheme using the message passing interface (MPI) is described.
        Performance results on a small-scale cluster are reported."
}

@Article{FAcer2004,
       author   = "F. Acernese and F. Barone and R. De Rosa and A. Eleuteri and L. Milano and S. Pardi and others",
       title    = "A multi-standard farm prototype for gravitational wave signal analysis",
       journal  = "Classical and Quantum Gravity",
       volume   = "21",
       number   = "5",
       pages    = "S837--S842",
       month    = MAR,
       year     = "2004",
       abstract = "We implemented in Napoli a new general purpose farm prototype
        for the development and testing of gravitational wave data
        analysis algorithms. Its main feature is that it allows the users
        to dynamically change its configuration according to the data
        analysis tests. In fact, the farm is fully
        remotely reconfigurable in-time as an MPI farm, a MOSIX farm or a
        GRID, configurations that may also coexist as independent
        subsets. Furthermore, the farm uses only standard hardware and
        software, guaranteeing easy upgrades and direct integration with
        other farms. In this paper we will describe this facility and
        further developments."
}

@Article{STomo2004,
       author   = "S. Tomov and R. Bennett and M. McGuigan and A. Peskin and G. Smith and J. Spiletic",
       title    = "Application of interactive parallel visualization for commodity-based clusters using visualization {API}s",
       journal  = "Computers \& Graphics-Uk",
       volume   = "28",
       number   = "2",
       pages    = "273--278",
       month    = APR,
       year     = "2004",
       abstract = {We present an efficient and inexpensive to develop application
        for interactive high-performance parallel visualization. We
        extend popular APIs such as Open Inventor and VTK to support
        commodity-based cluster visualization. Our implementation
        follows a standard master/slave concept: the general idea is to
        have a "Master" node, which will intercept a sequential
        graphical user interface and broadcast it to the "Slave" nodes.
        The interactions between the nodes are implemented using MPI. The
        parallel remote rendering uses Chromium. This paper is mainly
        the report of our implementation experiences. We present in
        detail the proposed model and key aspects of its implementation.
        Also, we present performance measurements, we benchmark and
        quantitatively demonstrate the dependence of the visualization
        speed on the data size and the network bandwidth, and we
        identify the singularities and draw conclusions on Chromium's
        sort-first rendering architecture. The most original part of
        this work is the combined use of Open Inventor and Chromium.}
}
@Article{ARMRa2004,
       author   = "A. R. M. Rao and T. V. S. R. A. Rao and B. Dattaguru",
       title    = "Comparative efficiencies of three parallel algorithms for nonlinear implicit transient dynamic analysis",
       journal  = "Sadhana-Academy Proceedings in Engineering Sciences",
       volume   = "29",
       number   = "pt. 1",
       pages    = "57--81",
       month    = FEB,
       year     = "2004",
       abstract = "The work reported in this paper is motivated by the need to
        develop portable parallel processing algorithms and codes which
        can run on a variety of hardware platforms without any
        modifications. The prime aim of the research work reported here
        is to test the portability of the parallel algorithms and also to
        study and understand the comparative efficiencies of three
        parallel algorithms developed for implicit time integration
        technique. The standard message passing interface (MPI) is used
        to develop parallel algorithms for computing nonlinear dynamic
        response of large structures employing implicit time-marching
        scheme. The parallel algorithms presented in this paper are
        developed under the broad framework of non-overlapped domain
        decomposition technique. Numerical studies indicate that the
        parallel algorithm devised employing the conventional form of
        Newmark time integration algorithm is faster than the
        predictor-corrector form. It is also accurate and highly
        adaptive to fine grain computations. The group implicit
        algorithm is found to be extremely superior in performance when
        compared to the other two parallel algorithms. This algorithm is
        better suited for large size problems on coarse grain
        environment as the resulting submeshes will obviously be large
        and thus permit larger time steps without losing accuracy."
}

 

@Article{VChau2004,
       author   = "V. Chaudhary and W. L. Hase and H. Jiang and L. Sun and D. Thaker",
       title    = "Experiments with parallelizing tribology simulations",
       journal  = "Journal of Supercomputing",
       volume   = "28",
       number   = "3",
       pages    = "323--343",
       month    = JUN,
       year     = "2004",
       abstract = "Different parallelization methods vary in their system
        requirements, programming styles, efficiency of exploring
        parallelism, and the application characteristics they can
        handle. For different situations, they can exhibit totally
        different performance gains. This paper compares OpenMP, MPI,
        and Strings for parallelizing a complicated tribology problem.
        The problem size and computing infrastructure is changed to
        assess the impact of this on various parallelization methods.
        All of them exhibit good performance improvements and it
        exhibits the necessity and importance of applying
        parallelization in this field."
}
@Article{MMuri2004,
       author   = "M. Murillo and X. C. Cai",
       title    = "A fully implicit parallel algorithm for simulating the non-linear electrical activity of the heart",
       journal  = "Numerical Linear Algebra with Applications",
       volume   = "11",
       number   = "2--3",
       pages    = "261--277",
       month    = MAR-APR,
       year     = "2004",
       abstract = "In this paper, we study a fully implicit parallel
        Newton-Krylov-Schwarz method (NKS) for solving the bidomain
        equations describing the electrical excitation process of the
        heart. NKS has been used successfully for many non-linear
        problems, but this is the first attempt to use this method for
        the bidomain model which consists of a system of time dependent
        partial differential equations of mixed type. Our experiments on
        parallel computers show that the method is scalable and robust
        with respect to many of the parameters in the bidomain model. In
        the outer layer of the algorithm, we use a nonlinearly implicit
        backward Euler method to discretize the time derivative, and the
        resulting systems of large sparse non-linear equations are
        solved using an inexact Newton method. The Jacobian system
        required to solve in each Newton iteration is solved with a
        GMRES method preconditioned by a new component-wise restricted
        additive Schwarz preconditioner. The efficiency and robustness
        of the overall method depend heavily on what preconditioner we
        use. By comparing several preconditioners, we found our new
        restricted additive Schwarz method offers the best performance.
        Our parallel software is developed using the PETSc package of
        Argonne National Laboratory. Numerical results obtained on an
        IBM SP will be reported."
}
@Article{EPLia2004,
       author   = "E. P. Li and H. F. Jin and S. Wang and L. W. Li",
       title    = "Signal propagation in high speed differential transmission line using parallelized finite-difference time-domain method",
       journal  = "Journal of Electromagnetic Waves and Applications",
       volume   = "18",
       number   = "4",
       pages    = "437--454",
       year     = "2004",
       abstract = "This paper presents the investigations in the electric
        performance of differential signalling transmission lines used
        for high speed integrated circuits (IC's) and boards by using
        the parallelized finite-difference time-domain (FDTD) method.
        The FDTD method is firstly parallelized with single-program
        multiple-data (SPAMD) architecture using the MPI protocol and
        experimentally validated. The key electrical factors, crosstalk,
        impedance of high-speed differential transmission lines, are
        simulated and investigated for various configuration using the
        developed parallelized FDTD code. The discussions presented in
        this paper shall be used a guideline for engineers to optimize
        high-speed circuit designs with differential signaling
        transmission lines for signal integrity (SI) and electromagnetic
        compatibility (EMC)."
}

@InProceedings{shir99:mpi-analysis,
  author = 	 {Dale Shires and Lori Pollock and Sara Sprenkle},
  title  = {Program Flow Graph Construction for Static Analysis of {MPI} Programs},
  booktitle =	 {Proceedings of the conference on Parallel and Distributed Processing Techniques and Applications},
  pages =	 {1847--1853},
  year =	 1999,
  month =	 JUN,
  abstract = {The Message Passing Interface (MPI) has been widely used
                  to develop efficient and portable parallel programs for
                  distributed memory multiprocessors and workstation/PC
                  clusters. In this paper, we present an algorithm for
                  building a program flow graph representation of an
                  MPI program. As an extension of the control flow graph
                  representation of sequential codes, this
                  representation provides a basis for important
                  program analyses useful in software testing,
                  debugging, and code optimization.}
}

@TechReport{sie03:mpi-analysis,
  author = 	 {Stephen F. Siegel and George Avrunin},
  title = 	 {Analysis of {MPI} Programs},
  institution =  {Department of Computer Science, University of Massachusetts Amherst},
  year = 	 2003,
  number =	 {UM-CS-2003-036},
  abstract = {We investigate the application of formal verification
                  techniques to parallel programs that employ the
                  Message Passing Interface (MPI). We develop a formal
                  model of a subset of MPI, and then prove a number of
                  theorems about that model that ameliorate or
                  eliminate altogether the state explosion problem. As
                  an example, we show that if one wishes to verify
                  freedom from deadlock, it suffices to consider only
                  synchronous executions.},
}

@InProceedings{Stellner96,
  author =       "G. Stellner",
  title =        "{CoCheck}: Checkpointing and Process Migration for
                 {MPI}",
  booktitle =    "Proc. 10th Int. Parallel Processing Symp. (IPPS'96)
                 CD-ROM",
  publisher =    "IEEE",
  address =      "Honolulu, HI",
  month =        apr,
  year =         "1996",
  keywords =     "Synchronization, Virtual Memory, and Runtime System",
}

@inproceedings{bon03:mpi-other,
  author    = "Dan Bonachea and Jason Duell",
  title     = "Problems with using {MPI} 1.1 and 2.0 as compilation
               targets for parallel language implementations",
  booktitle = "2nd Workshop on Hardware/Software Support for High
               Performance Scientific and Engineering Computing,
               SHPSEC-PACT03",
  year      = "2003",
  note      = "Also to appear in the International Journal on High Performance Computing and Networking (IJHPCN)"
}

@Article{DStan2004,
       author   = "D. Stanescu and D. Ait-Ali-Yahia and W. G. Habashi and M. P. Robichaud",
       title    = "Spectral element method for linear fan tone noise radiation",
       journal  = "AIAA Journal",
       volume   = "42",
       number   = "4",
       pages    = "696--705",
       month    = APR,
       year     = "2004",
       abstract = "A numerical method for prediction of acoustic spinning mode
        radiation from turbofan inlets is presented. Sound propagation
        is modeled by the linearized mass conservation equation for
        irrotational flows and solved in the frequency domain. The mean
        flow through the inlet is obtained as a solution of the full
        potential equation. Both the mean-flow and the acoustic problem
        are approximated by Galerkin projection in spectral element
        spaces of continuous piecewise polynomials defined on the same
        grid. The Gauss-Chebyshev-Lobatto points within the elements are
        generated via transfinite interpolation and CAP projection
        procedures embedded within the code. The linear algebraic
        systems obtained are then solved using either direct or sparse
        iterative solvers based on the message passing interface
        standard for interprocessor communication. The singularity
        appearing in the acoustic integrals on the symmetry axis is
        treated by the use of a collocation operator based on the
        Gauss-Chebyshev, instead of the Gauss-Chebyshev-Lobatto, points.
        To eliminate reflections from the radiation boundaries, a novel
        frequency-domain formulation of the matched-layer technique,
        wherein waves entering the layer are exponentially damped, is
        proposed. The overall computing procedure is first validated on
        a tone radiation problem from a semi-infinite cylinder and then
        applied to an experimental JT15D turbofan inlet setup."
}

@Article{DIron2004,
       author   = "D. Irony and G. Shklarski and S. Toledo",
       title    = "Parallel and fully recursive multifrontal sparse {C}holesky",
       journal  = "Future Generation Computer Systems",
       volume   = "20",
       number   = "3",
       pages    = "425--440",
       month    = APR,
       year     = "2004",
       abstract = "We describe the design, implementation, and performance of a new
        parallel sparse Cholesky factorization code. The code uses a
        multifrontal factorization strategy. Operations on small dense
        submatrices are performed using new dense matrix subroutines
        that are part of the code, although the code can also use the
        BLAS and LAPACK. The new code is recursive at both the sparse
        and the dense levels, it uses a novel recursive data layout for
        dense submatrices, and it is parallelized using Cilk, an
        extension of C specifically designed to parallelize recursive
        codes. We demonstrate that the new code performs well and scales
        well on SMPs. In particular, on up to 16 processors, the code
        outperforms two state-of-the-art message-passing codes. The
        scalability and high performance that the code achieves imply
        that recursive schedules, blocked data layouts, and dynamic
        scheduling are effective in the implementation of sparse
        factorization codes."
}

@Article{FGarc2004,
       author   = "F. Garcia-Carballeira and J. Carretero and A. Calderon and J. M. Perez and J. D. Garcia",
       title    = "An adaptive cache coherence protocol specification for parallel input/output systems",
       journal  = "IEEE Transactions on Parallel and Distributed Systems",
       volume   = "15",
       number   = "6",
       pages    = "533--545",
       month    = JUN,
       year     = "2004",
       abstract = "Caching has been intensively used in memory and traditional file
        systems to improve system performance. However, the use of
        caching in parallel file systems and I/O libraries has been
        limited to I/O nodes to avoid cache coherence problems. In this
        paper, we specify an adaptive cache coherence protocol very
        suitable for parallel file systems and parallel I/O libraries.
        This model exploits the use of caching, both at processing and
        I/O nodes, providing performance increase mechanisms as
        aggressive prefetching and delayed-write techniques. The cache
        coherence problem is solved by using a dynamic scheme of cache
        coherence protocols with different sizes and shapes of
        granularity. The proposed model is very appropriate for parallel
        I/O interfaces, as MPI-IO. Performance results, obtained on an
        IBM SP2, are presented to demonstrate the advantages offered by
        the cache management methods proposed."
}

@Article{CBoer2004,
       author   = "C. Boeres and V. E. F. Rebello",
       title    = "Easy{G}rid: towards a framework for the automatic {G}rid enabling of legacy {MPI} applications",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "5",
       pages    = "425--432",
       month    = APR,
       year     = "2004",
       abstract = "One of the goals of the Grid is to aggregate collections of
        shared, heterogeneous, and distributed resources to provide
        computational 'power' to parallel applications. However,
        designing applications capable of exploiting this potential with
        ease remains a challenge. This paper outlines the
        EasyGrid methodology for the efficient and robust execution of
        (legacy) MPI programs across distributed computing clusters. The
        principal objective of this work is to identify the
        application-oriented middleware necessary for, as well as to
        develop a framework to automatically generate, system-aware
        applications capable of executing in dynamic, unstable,
        distributed environments such as computational Grids."
}

@Article{BButr2004,
       author   = "B. Butrylo and C. Vollaire and L. Nicolas and A. Nicolas",
       title    = "Numerical performance of the distributed vector finite-element time-domain algorithm",
       journal  = "IEEE Transactions on Magnetics",
       volume   = "40",
       number   = "2",
       pages    = "997--1000",
       month    = MAR,
       year     = "2004",
       abstract = "This paper deals with a distributed time-domain modeling of
        electromagnetic phenomena with the finite-element method. The
        model is approximated by edge elements. The constitutive
        equations and method of parallelization of the algorithm are
        presented. The properties of the distributed
        finite-element time-domain algorithm are discussed. Some typical
        performance metrics are studied for the parallel versions of
        the software. The presented algorithm is executed on a
        heterogeneous and a homogeneous clusters of workstations. Two
        different distributed memory environments (MPI and PVM) are used
        to evaluate the efficiency of the algorithm."
}

% Multigrid solver for the Grad-Shafranov equation, parallelized with MPI on a PC cluster.
@Article{THana2004,
       author   = "T. Hanawa and S. Ikuno and A. Kamitani",
       title    = "Application of parallelized multigrid method to solution of {MHD} equilibrium with {MPI}",
       journal  = "IEEE Transactions on Magnetics",
       volume   = "40",
       number   = "2",
       pages    = "1005--1008",
       month    = MAR,
       year     = "2004",
       abstract = "The potential of applying the multigrid method (MGM) to
        magnetohydrodynamics (MHD) equilibrium analysis is investigated.
        The nonlinear eigenvalue problem often appears when the MHD
        equilibria are determined by solving the Grad-Shafranov equation
        numerically. After linearizing the equation, the problem is
        solved using an iterative procedure. Although the SOR method or
        the Gauss-Seidel method is often used for the solution of the
        linearized equation, it takes much CPU time to solve the
        problem. We introduced the use of MGM instead of the
        conventional method for solving the linear equation. The
        parallel processing by using message passing interface (MPI) on
        a PC cluster is adopted for implementation of the MGM to achieve
        higher performance."
}

% Parallel FDTD electromagnetic-field analysis built on Fortran90 and the MPI library.
@Article{KTagu2004,
       author   = "K. Taguchi and M. Uchiya and T. Kashiwa and K. Hirayama and H. Kuribayashi and S. Komatsu",
       title    = "{FDTD} large-scale parallel supercomputing and its application to the analysis of radiation characteristics of an antenna mounted on a vehicle",
       journal  = "International Journal of RF and Microwave Computer-Aided Engineering",
       volume   = "14",
       number   = "3",
       pages    = "253--261",
       month    = MAY,
       year     = "2004",
       abstract = "Given the remarkable advances in supercomputers, large-scale
        electromagnetic-field analyses are becoming possible by FDTD
        parallel computation. In this study, a versatile FDTD parallel
        computation system is developed by using Fortran90 and the MPI
        library. The system uses dynamic memory allocation, which
        provides a more versatile system and more efficient use of
        memory. Using the system, we analyze, for the first time, the
        radiation characteristics of an antenna mounted on a realistic
        vehicle model."
}

% MSE: a testbed that predicts the performance of MPI master/slave programs on PC clusters.
@Article{YMizu2004,
       author   = "Y. Mizutani and F. Ino and K. Hagihara",
       title    = "Evaluation of performance prediction method for master/slave parallel programs",
       journal  = "IEICE Transactions on Information and Systems",
       volume   = "E87D",
       number   = "4",
       pages    = "967--975",
       month    = APR,
       year     = "2004",
       abstract = "This paper describes the design and implementation of a testbed
        for predicting master/slave (M/S) programs written using Message
        Passing Interface (MPI) programs. The testbed, named M/S
        Emulator (MSE), aims at assisting developers in evaluating the
        performance of M/S programs and dynamic load-balancing
        strategies on clusters of PCs. In order to realize this, MSE
        predicts the communication time by using a realistic parallel
        computational model, an extension of the LogGPS model. This
        extended model improves the prediction accuracy on a large
        number of processors, because it captures the master's
        bottleneck: the overhead required for retrieving arrival
        messages from the slaves. Current MSE also employs a best effort
        emulation method for predicting the calculation time. In our
        experiments, MSE demonstrated an accurate prediction on
        clusters, especially on a larger number of nodes. Therefore, we
        believe that our extended model enables us to analyze the
        scalability of the M/S program performance."
}

% Communication-aware task mapping, evaluated with timestamped MPI application traces.
@Article{JMOrd2004,
       author   = "J. M. Orduna and F. Silla and J. Duato",
       title    = "On the development of a communication-aware task mapping technique",
       journal  = "Journal of Systems Architecture",
       volume   = "50",
       number   = "4",
       pages    = "207--220",
       month    = MAR,
       year     = "2004",
       abstract = "In this paper, we propose a communication-aware mapping
        technique that tries to match as well as possible the existing
        network resources to the communication requirements of the
        applications running on the system. Also, we evaluate the
        mapping technique using real MPI application traces with
        timestamps. Evaluation results show that the use of the proposed
        mapping technique better exploits the available network
        bandwidth, improving load balancing and increasing the
        throughput that can be delivered by the network. Therefore, the
        proposed technique can be used in the design of
        communication-aware scheduling strategies for those situations
        where the communication requirements lead the network bandwidth
        to become the system performance bottleneck."
}

% Rotating-container flow simulation using a parallel-computation technique with MPI.
@Article{YKSuh2004,
       author   = "Y. K. Suh and J. H. Park and S. K. Kim and Y. R. Son",
       title    = "Study on the periodic flows in a rectangular container under a background rotation",
       journal  = "KSME International Journal",
       volume   = "18",
       number   = "4",
       pages    = "671--680",
       month    = APR,
       year     = "2004",
       abstract = "We present numerical and experimental results of the periodic
        flows inside a rectangular container under a background
        rotation. In numerical computation, a parallel-computation
        technique with MPI is implemented. Flow visualization and PIV
        measurement are also performed to obtain velocity fields at the
        free surface. Through a series of numerical and experimental
        works, we aim to clarify the fundamental reasons of discrepancy
        between the two-dimensional computation and the experimental
        measurement, which was detected in the previous study for the
        same flow model. Specifically, we check if the various
        assumptions prerequisite for the validity of the classical Ekman
        pumping law are satisfied for periodic flows under a background
        rotation."
}

% Natural convection in finned annuli, parallelized by domain decomposition over MPI.
@Article{MYHaa2004,
       author   = "M. Y. Ha and J. G. Kim",
       title    = "Numerical simulation of natural convection in annuli with internal fins",
       journal  = "KSME International Journal",
       volume   = "18",
       number   = "4",
       pages    = "718--730",
       month    = APR,
       year     = "2004",
       abstract = "The solution for the natural convection in internally finned
        horizontal annuli is obtained by using a numerical simulation of
        time-dependent and two-dimensional governing equations. The fins
        existing in annuli influence the flow pattern, temperature
        distribution and heat transfer rate. The variations of the fin
        configuration suppress or accelerate the free convective effects
        compared to those of the smooth tubes. The effects of fin
        configuration, number of fins and ratio of annulus gap width to
        the inner cylinder radius on the fluid flow and heat transfer in
        annuli are demonstrated by the distribution of the velocity
        vector, isotherms and streamlines. The governing equations are
        solved efficiently by using a parallel implementation. The
        technique is adopted for reduction of the computation cost. The
        parallelization is performed with the domain decomposition
        technique and message passing between sub-domains on the basis
        of the MPI library. The results from parallel computation reveal
        in consistency with those of the sequential program. Moreover,
        the speed-up ratio shows linearity with the number of processor."
}

% ClustalW multiple sequence alignment parallelized with C and MPI on a workstation cluster.
@Article{JEbed2004,
       author   = "J. Ebedes and A. Datta",
       title    = "Multiple sequence alignment in parallel on a workstation cluster",
       journal  = "Bioinformatics",
       volume   = "20",
       number   = "7",
       pages    = "1193--1195",
       month    = MAY,
       year     = "2004",
       abstract = "Multiple sequence alignment is the NP-hard problem of aligning
        three or more DNA or amino acid sequences in an optimal way so
        as to match as many characters as possible from the set of
        sequences. The popular sequence alignment program ClustalW uses
        the classical method of approximating a sequence alignment, by
        first computing a distance matrix and then constructing a
        guide tree to show the evolutionary relationship of the
        sequences. We show that parallelizing the ClustalW algorithm can
        result in significant speedup. We used a cluster of workstations
        using C and message passing interface for our implementation.
        Experimental results show that speedup of over 5.5 on
        six processors is obtainable for most inputs."
}

% PANMIN global optimization package; its parallel code is based on MPI (LAM MPI, MPICH).
% The abstract is a CPC program summary: a run of "label: value" fields.
@Article{FVThe2004,
       author   = "F. V. Theos and I. E. Lagaris and D. G. Papageorgiou",
       title    = "{PANMIN}: sequential and parallel global optimization procedures with a variety of options for the local search strategy",
       journal  = "Computer Physics Communications",
       volume   = "159",
       number   = "1",
       pages    = "63--69",
       month    = MAY,
       year     = "2004",
       abstract = "Title of program: PANMIN Catalogue identifier: ADSU Program
        summary URL: http://cpc.cs.qub.ac.uk/summaries/ADSU Program
        obtainable from: CPC Program Library, Queen's University of
        Belfast, N. Ireland Computer for which the program is designed
        and others on which it has been tested: PANMIN is designed for
        UNIX machines. The parallel code runs on either shared memory
        architectures or on a distributed system. The code has been
        tested on a SUN Microsystems ENTERPRISE 450 with four CPUs, and
        on a 48-node cluster under Linux, with both the GNU g77 and the
        Portland group compilers. The parallel implementation is based
        on MPI and has been tested with LAM MPI and MPICH Installation:
        University of Ioannina, Greece Programming language used:
        Fortran-77 Memory required to execute with typical data:
        Approximately $O(n^2)$ words, where n is the number of variables
        No. of bits in a word: 64 No. of processors used: 1 or many Has
        the code been vectorised or parallelized?: Parallelized using
        MPI No. of bytes in distributed program, including test data,
        etc.: 147163 No. of lines in distributed program, including the
        test data, etc.: 14366 Distribution format: gzipped tar file."
}

% Parallel Broyden solver for the Toeplitz inverse eigenproblem, using SCALAPACK on MPI.
@Article{JPein2004,
       author   = "J. Peinado and A. M. Vidal",
       title    = "A parallel {B}royden approach to the {T}oeplitz inverse eigenproblem",
       journal  = "Concurrency and Computation: Practice and Experience",
       volume   = "16",
       number   = "6",
       pages    = "587--610",
       month    = MAY,
       year     = "2004",
       abstract = "In this work we show a portable sequential and a portable
        parallel algorithm for solving the inverse eigenproblem for real
        symmetric Toeplitz matrices. Both algorithms are based on
        Broyden's method for solving nonlinear systems. We reduced the
        computational cost for some problem sizes, and furthermore we
        managed to reduce spatial cost considerably, compared in both
        cases with parallel algorithms proposed by other authors and by
        us, although sometimes quasi-Newton methods (as Broyden) do not
        reach convergence in all the test cases. We have implemented the
        parallel algorithm using the parallel numerical linear algebra
        library SCALAPACK based on the MPI environment. Experimental
        results have been obtained using two different architectures: a
        shared memory multiprocessor, the SGI PowerChallenge, and a
        cluster of Pentium II PCs connected through a Myrinet network.
        The algorithms obtained are scalable in all the cases."
}

% eSkel: a skeletal parallel programming library based on C and MPI.
@Article{MCole2004,
       author   = "M. Cole",
       title    = "Bringing skeletons out of the closet: a pragmatic manifesto for skeletal parallel programming",
       journal  = "Parallel Computing",
       volume   = "30",
       number   = "3",
       pages    = "389--406",
       month    = MAR,
       year     = "2004",
       abstract = "Skeleton and pattern based parallel programming promise
        significant benefits but remain absent from mainstream practice.
        We consider why this situation has arisen and propose a number
        of design principles which may help to redress it. We sketch the
        eSkel library, which represents a concrete attempt to apply
        these principles. eSkel is based on C and MPI, thereby embedding
        its skeletons in a conceptually familiar framework. We present
        an application of eSkel and analyse it as a response to our
        manifesto."
}

% Survey of parallel finite-element solvers, including a PVM/MPI comparison.
% The key carries a "b" suffix because BButr2004 is already used by the
% IEEE Transactions on Magnetics entry with the same first author; a repeated
% key makes BibTeX discard the later entry. No month field: the issue month
% was empty in the original record.
@Article{BButr2004b,
       author   = "B. Butrylo and F. Musy and L. Nicolas and
                  R. Perrussel and R. Scorretti and C. Vollaire",
       title    = "A survey of parallel solvers for the finite element
                  method in computational electromagnetics",
       journal  = "COMPEL: The International Journal for Computation
                  and Mathematics in Electrical and Electronic
                  Engineering",
       volume   = "23",
       number   = "2",
       pages    = "531--546",
       year     = "2004",
       abstract = "This paper presents new trends in parallel methods used to
        solve finite element matrix systems: standard iterative and
        direct solving methods first, and then domain decomposition
        methods. For example, the current status and properties of two
        prevailing programming environments (PVM and MPI) are finally
        given and compared when implemented together with a finite
        element time-domain formulation."
}
% ParadisEO metaheuristics framework; portable via MPI, PVM and PThreads.
@Article{SCaho2004,
       author   = "S. Cahon and N. Melab and E. G. Talbi",
       title    = "{ParadisEO}: {A} framework for the reusable design
                  of parallel and distributed metaheuristics",
       journal  = "Journal of Heuristics",
       volume   = "10",
       number   = "3",
       pages    = "357--380",
       month    = MAY,
       year     = "2004",
       abstract = "In this paper, we present the ParadisEO white-box
        object-oriented framework dedicated to the reusable design of
        parallel and distributed metaheuristics (PDM). ParadisEO
        provides a broad range of features including evolutionary
        algorithms (EA), local searches (LS), the most common parallel
        and distributed models and hybridization mechanisms, etc. This
        high content and utility encourages its use at European level.
        ParadisEO is based on a clear conceptual separation of the
        solution methods from the problems they are intended to solve.
        This separation confers to the user a maximum code and design
        reuse. Furthermore, the fine-grained nature of the classes
        provided by the framework allow a higher flexibility compared to
        other frameworks. ParadisEO is of the rare frameworks that
        provide the most common parallel and distributed models. Their
        implementation is portable on distributed-memory machines as
        well as on shared-memory multiprocessors, as it uses standard
        libraries such as MPI, PVM and PThreads. The models can be
        exploited in a transparent way, one has just to instantiate
        their associated provided classes. Their experimentation on the
        radio network design real-world application demonstrate their
        efficiency."
}

% Precorrected-FFT volume-integral-equation solver with an MPI-based parallelization.
@article{YJWan2004,
  author   = {Y. J. Wang and X. C. Nie and L. W. Li and E. P. Li},
  title    = {A parallel analysis of the scattering from inhomogeneous dielectric    bodies by the volume integral equation and the precorrected-{FFT}    algorithm},
  journal  = {Microwave and Optical Technology Letters},
  volume   = 42,
  number   = 1,
  pages    = {77--79},
  month    = jul,
  year     = 2004,
  abstract = {In this paper, a parallel implementation of the precorrected
        fast Fourier transform (FFT) algorithm is presented to
        efficiently solve the volume-integral equation for scattering
        from inhomogeneous dielectric objects. Several examples are
        given to demonstrate the efficiency and correctness of the
        message-passing interface (MPI)-based parallelization algorithm.}
}

% Parallel advancing-front surface mesh generation with an MPI parallelization strategy.
@Article{UTrem2004,
       author   = "U. Tremel and F. Deister and O. Hassan and N. P. Weatherill",
       title    = "Automatic unstructured surface mesh generation for complex configurations",
       journal  = "International Journal for Numerical Methods in Fluids",
       volume   = "45",
       number   = "4",
       pages    = "341--364",
       month    = JUN,
       year     = "2004",
       abstract = "In this paper a new object-oriented (OO) approach is presented
        for automatic parallel advancing front based surface mesh
        generation and adaptive remeshing for complex configurations.
        Based on the ST++-system the advantages of the OO design and
        implementation compared to the traditional structural approach
        are described. Algorithmic enhancements to the advancing front
        method are explained, enabling a robust NURBS based
        triangulation process directly on B-rep CAD data. The message
        passing (MPI) parallelization strategy together with the
        achievable performance improvements are demonstrated. With the
        outlined parallel geometry analysis/rasterization a powerful
        method is described to derive automatically a well suited mesh
        size specification without any user-interaction from scratch.
        The application of this method to a complex 'real world' example
        finishes this paper."
}
% RDMA-based design of MPI over InfiniBand.
@Article{JXLiu2004,
       author   = "J. X. Liu and J. S. Wu and D. K. Panda",
       title    = "High performance {RDMA}-based {MPI} implementation over {InfiniBand}",
       journal  = "International Journal of Parallel Programming",
       volume   = "32",
       number   = "3",
       pages    = "167--198",
       month    = MAY,
       year     = "2004",
       abstract = "Although InfiniBand Architecture is relatively new in the high
        performance computing area, it offers many features which help
        us to improve the performance of communication subsystems. One
        of these features is Remote Direct Memory Access (RDMA)
        operations. In this paper, we propose a new design of MPI over
        InfiniBand which brings the benefit of RDMA to not only large
        messages, but also small and control messages. We also achieve
        better scalability by exploiting application communication
        pattern and combining send/receive operations with RDMA
        operations. Our RDMA-based MPI implementation achieves a latency
        of 6.8 $\mu$sec for small messages and a peak bandwidth of 871
        million bytes/sec. Performance evaluation shows that for small
        messages, our RDMA-based design can reduce the latency by 24\%,
        increase the bandwidth by over 104\%, and reduce the host
        overhead by up to 22\% compared with the original design. For
        large data transfers, we improve performance by reducing the
        time for transferring control messages. We have also shown that
        our new design is beneficial to MPI collective communication and
        NAS Parallel Benchmarks."
}
% NOTE(review): in this abstract "MPI" abbreviates message-passing iterative
% decoding, not the Message Passing Interface -- confirm the entry belongs in
% this list.
@Article{ISaso2004,
       author   = "I. Sason and R. Urbanke",
       title    = "Complexity versus performance of capacity-achieving irregular repeat-accumulate codes on the binary erasure channel",
       journal  = "IEEE Transactions on Information Theory",
       volume   = "50",
       number   = "6",
       pages    = "1247--1256",
       month    = JUN,
       year     = "2004",
       abstract = "We derive upper and lower bounds on the encoding and decoding
        complexity of two capacity-achieving ensembles of irregular
        repeat-accumulate (IRA1 and IRA2) codes on the binary erasure
        channel (BEC). These bounds are expressed in terms of the gap
        between the channel capacity and the rate of a typical code from
        this ensemble for which reliable communications is achievable
        under message-passing iterative (MPI) decoding. The complexity
        of the ensemble of IRA1 codes grows like the negative logarithm
        of the gap to capacity. On the other hand, the complexity of the
        ensemble of IRA2 codes with any choice of the degree
        distribution grows at least like the inverse square root of the
        gap to capacity, and at most like the inverse of the gap to
        capacity."
}
% PCP: a domain-decomposition parallel matrix-solver platform for FEM/FDM/FVM codes.
@Article{ATezu2004,
       author   = "A. Tezuka and J. Matsumoto and K. Matsubara",
       title    = "Development of common software platform on parallel computations for discretized numerical schemes and its application to finite element fluid dynamics",
       journal  = "International Journal of Computational Fluid Dynamics",
       volume   = "18",
       number   = "4",
       pages    = "347--354",
       month    = MAY,
       year     = "2004",
       abstract = "It is a time consuming and very skillful task for researchers or
        developers on computational mechanics to modify a program for a
        single processor to the one for parallel computation. This is a
        serious bottleneck for parallel computation, even though
        general-purpose parallel computational library such as MPI is
        applied in his modification. We developed a parallel matrix
        solver platform, called 'Parallel Computing Platform/PCP', based
        on domain decomposition scheme for various numerical schemes
        such as finite element method (FEM), finite difference method
        (FDM) and finite volume method (FVM) to accelerate a smooth
        shift to parallel computational world. Some parallel software
        such as PETSc, Aztec, GEOFEM and ADVENTURE had been developed,
        however, these are for professionals in parallel computations
        and not valid for our purpose. In our platform, what a user
        should do is just to call the platform at the stage of stiffness
        matrix calculation. GMRES and Bi-CGSTAB with some
        pre-conditioners are used as a basic matrix solver. The option
        of Lagrange-multiplier is also attached. For the partitioning, a
        fast graph generator for arbitrary elements and the interface
        with MeTis are equipped. Our platform is valid for a variety of
        hardware, including single processor based WS, by exchanging
        Makefile.in. The effectiveness of our platform is evaluated with
        several examples in finite element fluid dynamics."
}
% Alpha EV7 node evaluation, including intra-node MPI communication.
% The issue date is a season, written out literally: the string macros at the
% head of this file define spr and win but show no macro for summer.
@Article{DJKer2004,
       author   = "D. J. Kerbyson and A. Hoisie and S. Pakin and F. Petrini and H. J. Wasserman",
       title    = "A performance evaluation of an {Alpha} {EV7} processing node",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "2",
       pages    = "199--209",
       month    = "Summer",
       year     = "2004",
       abstract = "In this paper we detail the performance of a new AlphaServer
        node containing 16 Alpha EV7 CPUs. The EV7 processor is based on
        the EV68 processor core that is used in terascale systems at Los
        Alamos National Laboratory and the Pittsburgh Supercomputing
        Center. The EV68 processor core is supplemented with six-way
        router circuitry that forms connections from the processor
        internals to four neighboring CPUs in a two-dimensional torus,
        to a I/O controller and to local memory. The performance
        evaluation presented in this paper considers memory hierarchy,
        intra-node MPI communication, and also the performance of a
        number of complete applications. The measurements are compared
        with those taken on existing AlphaServer machines. It is clear
        from our analysis that the superior application performance of
        the EV7 relative to a similar-speed EV68 is attributable to its
        excellent main memory bandwidth -- over 4 GB/s."
}
% Parallel computations of microscale heat conduction in thin films using MPI routines.
@article{SSrin2004,
  author   = {S. Srinivasan and R. S. Miller and E. Marotta},
  title    = {Parallel computation of the {B}oltzmann transport equation for microscale    heat transfer in multilayered thin films},
  journal  = {Numerical Heat Transfer Part B-Fundamentals},
  volume   = 46,
  number   = 1,
  pages    = {31--58},
  month    = jul,
  year     = 2004,
  abstract = {Results are presented from parallel computations of
        one-dimensional (1-D) and two-dimensional (2-D) microscale heat
        conduction in multilayered films involving the materials silicon
        (Si) and silicon dioxide (SiO2). The equation of phonon
        radiative transport (ERPT), in its spectral as well as
        frequency-independent form, is considered for numerical modeling
        using finite-difference methods. Parallelization strategies
        based on Message Passing Interface (MPI) routines are explored
        in an effort to achieve computational efficiency. The numerical
        solution results address the effects of film thickness, grain
        boundary scattering, and interfacial boundary conditions on the
        time-dependent temperature distribution within the microscale
        films.}
}
% 3D FDTD acoustic-field analysis parallelized with MPI on a PC cluster.
@Article{YYama2004,
       author   = "Y. Yamada and S. Maeda and T. Tsuchiya and N. Endoh",
       title    = "Analysis of sound field of aerial ultrasonic sensor by three-dimensional finite-difference time-domain method using parallel computing",
       journal  = "Japanese Journal of Applied Physics Part 1-Regular Papers Short Notes \& Review Papers",
       volume   = "43",
       number   = "5B",
       pages    = "2869--2870",
       month    = MAY,
       year     = "2004",
       abstract = "The three-dimensional (3D) finite-difference time-domain (FDTD)
        method is applied to acoustics analysis in this paper. The
        disadvantages of its method are large memory requirement and long
        calculation time. We developed the 3D-FDTD method using parallel
        computing with the Message-Passing Interface (MPI) in the
        personal computer (PC) cluster because it could shorten the
        calculation time and could treat larger memory. In this paper,
        basic algorithm of the proposed method is described. The acoustic
        fields of point sources and the real aerial sensor are
        calculated for verifying the validity of the method."
}

% Compares two MPI implementations for Windows platforms against Linux systems.
@Article{GAlfo2004,
       author   = "G. Alfonsi and L. Muttoni",
       title    = "Performance evaluation of a {Windows} {NT} based {PC} cluster for high performance computing",
       journal  = "Journal of Systems Architecture",
       volume   = "50",
       number   = "6",
       pages    = "345--359",
       month    = JUN,
       year     = "2004",
       abstract = "In recent times the computational power of personal computers
        has remarkably increased and the use of groups of PCs and
        workstations, connected by a network and dedicated to parallel
        computations, is today frequent. Computing clusters are mainly
        based on UNIX workstations and Linux PCs but, in the last few
        years, different implementations of message passing systems were
        made available also for Microsoft Windows. In this work we test
        the performance of two implementations of MPI for Windows
        platforms, and we compare the results with those obtained from
        Linux systems."
}

% HPC2N Linux Super Cluster: design and benchmarking (STREAM, Pallas MPI, DGEMM, HPL, NAS).
% NOTE(review): the source record had a bare surname "Sandgren" and no
% diacritics; the initial and accents below were restored from the published
% author list -- confirm, and check whether further authors are missing.
@Article{NEdmu2004,
       author   = "N. Edmundsson and E. Elmroth and B. K{\aa}gstr{\"o}m and M. M{\aa}rtensson and M. Nyl{\'e}n and {\AA}. Sandgren",
       title    = "Design and evaluation of a {TOP100} {Linux} super cluster system",
       journal  = "Concurrency and Computation: Practice and Experience",
       volume   = "16",
       number   = "8",
       pages    = "735--750",
       month    = JUL,
       year     = "2004",
       abstract = "The High Performance Computing Center North (HPC2N) Super
        Cluster is a truly self-made high-performance Linux cluster with
        240 AMD processors in 120 dual nodes, interconnected with a
        high-bandwidth, low-latency SCI network. This contribution
        describes the hardware selected for the system, the work needed
        to build it, important software issues and an extensive
        performance analysis. The performance is evaluated using a
        number of state-of-the-art benchmarks and software, including
        STREAM, Pallas MPI, the Atlas DGEMM, High-Performance Linpack
        and NAS Parallel benchmarks. Using these benchmarks we first
        determine the raw memory bandwidth and network characteristics;
        the practical peak performance of a single CPU, a single
        dual-node and the complete 240-processor system; and investigate
        the parallel performance for non-optimized dusty-deck Fortran
        applications. In summary, this \$500 000 system is extremely
        cost-effective and shows the performance one would expect of a
        large-scale supercomputing system with distributed memory
        architecture. According to the TOP500 list of June 2002, this
        cluster was the 94th fastest computer in the world. It is now
        fully operational and stable as the main computing facility at
        HPC2N. The system's utilization figures exceed 90\%, i.e. all 240
        processors are on average utilized over 90\% of the time, 24
        hours a day, seven days a week."
}
% Transit route network design with parallel genetic algorithms; compares PVM and MPI models.
@Article{JAgra2004,
       author   = "J. Agrawal and T. V. Mathew",
       title    = "Transit route network design using parallel genetic algorithm",
       journal  = "Journal of Computing in Civil Engineering",
       volume   = "18",
       number   = "3",
       pages    = "248--256",
       month    = JUL,
       year     = "2004",
       abstract = "A transit route network design (TRND) problem for urban bus
        operation involves the determination of a set of transit routes
        and the associated frequencies that achieve the desired
        objective. This can be formulated as an optimization problem of
        minimizing the total system cost, which is the sum of the
        operating cost and the generalized travel cost. A review of
        previous approaches to solve this problem reveals the deficiency
        of conventional optimization techniques and the suitability of
        genetic algorithm (GA) based models to handle such combinatorial
        optimization problems. Since GAs are computationally intensive
        optimization techniques, their application to large and complex
        problems is limited. The computational performance of a GA model
        can be improved by exploiting its inherent parallel nature.
        Accordingly, two parallel genetic algorithm (PGA) models are
        proposed in this study. The first is a global parallel virtual
        machine (PVM) parallel GA model where the fitness evaluation is
        done concurrently in a parallel processing environment using PVM
        libraries. The second is a global message passing interface
        (MPI) parallel GA model where an MPI environment substitutes for
        the PVM libraries. An existing GA model for TRND for a large
        city is used as a case study. These models are tested for
        computation time, speedup, and efficiency. From the study, it is
        observed that the global PVM model performed better than the
        other model."
}

@Article{FChan2004,
       author   = "F. Chan and J. N. Cao and A. T. S. Chan and M. Y. Guo",
       title    = "Programming support for {MPMD} parallel computing in {ClusterGOP}",
       journal  = "IEICE Transactions on Information and Systems",
       volume   = "E87D",
       number   = "7",
       pages    = "1693--1702",
       month    = JUL,
       year     = "2004",
       abstract = "Many parallel applications involve different independent tasks
        with their own data. Using the MPMD model, programmers can have
        a modular view and simplified structure of the parallel
        programs. Although MPI supports both SPMD and MPMD models for
        programming, MPI libraries do not provide an efficient way for
        task communication for the MPMD model. We have developed a
        programming environment, called ClusterGOP, for building and
        developing parallel applications. Based on the graph-oriented
        programming (GOP) model, ClusterGOP provides higher-level
        abstractions for message-passing parallel programming with the
        support of software tools for developing and running parallel
        applications. In this paper, we describe how ClusterGOP supports
        programming of MPMD parallel applications on top of MPI. We
        discuss the issues of implementing the MPMD model in ClusterGOP
        using MPI and evaluate the performance by using example
        applications."
}

@Article{SWBai2004,
       author   = "S. W. Bai and C. S. Yang and T. C. Huang",
       title    = "Packing/unpacking using {MPI} user-defined datatypes for efficient data redistribution",
       journal  = "IEICE Transactions on Information and Systems",
       volume   = "E87D",
       number   = "7",
       pages    = "1721--1728",
       month    = JUL,
       year     = "2004",
       abstract = "In many parallel programs, run-time data redistribution is
        usually required to enhance data locality and reduce remote
        memory access on the distributed memory multicomputers. Research
        on data redistribution algorithms has recently matured. The time
        required to generate data sets and processor sets is much lesser
        than before. Therefore, packing/unpacking has become a
        relatively high cost in redistribution. In this paper, we
        present methods to perform BLOCK-CYCLIC(s) to BLOCK-CYCLIC(t)
        redistribution, using MPI user-defined datatypes. This method
        reduces the required memory buffers and avoids unnecessary
        movement of data. Theoretical models are presented to determine
        the best method for redistribution. The methods were implemented
        on an IBM SP2 parallel machine to evaluate the performance of
        the proposed methods. The experimental results indicate that
        this approach can clearly improve the redistribution in most
        cases."
}

@Article{YMLia2004,
       author   = "Y. M. Li and S. M. Yu",
       title    = "A two-dimensional quantum transport simulation of nanoscale double-gate {MOSFET}s using parallel adaptive technique",
       journal  = "IEICE Transactions on Information and Systems",
       volume   = "E87D",
       number   = "7",
       pages    = "1751--1758",
       month    = JUL,
       year     = "2004",
       abstract = "In this paper we apply a parallel adaptive solution algorithm to
        simulate nanoscale double-gate metal-oxide-semiconductor field
        effect transistors (MOSFETs) on a personal computer (PC)-based
        Linux cluster with the message passing interface (MPI)
        libraries. Based on a posteriori error estimation, the triangular
        mesh generation, the adaptive finite volume method, the monotone
        iterative method, and the parallel domain decomposition
        algorithm, a set of two-dimensional quantum correction
        hydrodynamic (HD) equations is solved numerically on our
        constructed cluster system. This parallel adaptive simulation
        methodology with 1-irregular mesh was successfully developed and
        applied to deep-submicron semiconductor device simulation in our
        recent work. A 10 nm n-type double-gate MOSFET is simulated with
        the developed parallel adaptive simulator. In terms of physical
        quantities and refined adaptive mesh, simulation results
        demonstrate very good accuracy and computational efficiency.
        Benchmark results, such as load-balancing, speedup, and parallel
        efficiency are achieved and exhibit excellent, parallel
        performance. On a 16 nodes PC-based Linux cluster, the maximum
        difference among CPUs is less than 6\%. A 12.8 times speedup and
        80\% parallel efficiency are simultaneously attained with respect
        to different simulation cases."
}
@article{HWang2004,
  author   = {H. Wang and M. Y. Guo and D. M. Wei},
  title    = {A parallel implementation of multi-domain high-order
              {N}avier-{S}tokes equations using {MPI}},
  journal  = {IEICE Transactions on Information and Systems},
  year     = 2004,
  month    = jul,
  volume   = {E87D},
  number   = 7,
  pages    = {1759--1765},
  abstract = {In this paper, Message Passing Interface (MPI) techniques
              are used to implement high-order full 3-D Navier-Stokes
              equations in multi-domain applications. A two-domain
              interface with five-point overlapping used previously is
              expanded to a multi-domain computation. There are normally
              two approaches for this expansion. One is to break up the
              domain into two parts through domain decomposition (say,
              one overlapping), then using MPI directives to further
              break up each domain into n parts. Another is to break the
              domain up into 2n parts with (2n - 1) overlappings. In our
              present effort, the latter approach is used and finite-size
              overlappings are employed to exchange data between adjacent
              multi-dimensional sub-domains. It is an alternative way to
              parallelize the high-order full 3-D Navier-Stokes equations
              into multi-domain applications without adding much
              complexity. Results with high-order boundary treatments
              show consistency among multi-domain calculations and
              single-domain results.}
}
@Article{NWooa2004,
       author   = "N. Woo and H. S. Jung and H. Y. Yeom and T. Park and H. Park",
       title    = "{MPICH-GF}: {T}ransparent checkpointing and rollback-recovery for grid-enabled {MPI} processes",
       journal  = "IEICE Transactions on Information and Systems",
       volume   = "E87D",
       number   = "7",
       pages    = "1820--1828",
       month    = JUL,
       year     = "2004",
       abstract = "Fault-tolerance is an essential feature of the distributed
        systems where the possibility of a failure increases with the
        growth of the system. In spite of extensive researches over two
        decades, fault-tolerance systems have not succeeded in practical
        use. It is due to the high overhead and the unhandiness of the
        previous fault-tolerance systems. In this paper, we propose
        MPICH-GF, a user-transparent checkpointing system for
        grid-enabled MPICH. Our objectives are to fill the gap between
        the theory and the practice of fault-tolerance systems, and to
        provide a checkpointing-recovery system for grids. To build a
        fault-tolerant MPICH version, we have designed task migration,
        dynamic process management, and atomic message transfer.
        MPICH-GF requires no modification of application source codes,
        and it affects the MPICH communication characteristics as less
        as possible. The features of MPICH-GF are that it supports the
        direct message transfer mode and that all of the implementation
        has been done at the lower layer, that is, the abstract
        device level. We have evaluated MPICH-GF using NPB applications
        on Globus middleware."
}
% NOTE(review): pages = "suS" looks like a garbled supplement-issue marker
% inherited from the source database; verify the issue and page range
% against the publisher record.  The empty number field was dropped.
@Article{GQZha2004,
       author   = "G. Q. Zhang and W. S. Zhang",
       title    = "Methods for wave equation prestack depth migration and numerical experiments",
       journal  = "Science in China Series A-Mathematics",
       volume   = "47",
       pages    = "suS",
       month    = APR,
       year     = "2004",
       abstract = "In this paper the methods of wave theory based prestack depth
        migration and their implementation are studied. Using the
        splitting of wave operator, the wavefield extrapolation
        equations are deduced and the numerical schemes are presented.
        The numerical tests for SEG/EAEG model with MPI are performed on
        the PC-cluster. The numerical results show that the methods of
        single-shot (common-shot) migration and synthesized-shot
        migration are of practical values and can be applied to field
        data processing of 3D prestack depth migration."
}

@Article{OKLim2004,
       author   = "O. K. Lim and K. S. Hong and H. S. Lee and E. H. Choi",
       title    = "Initial design domain reset method for genetic algorithm with parallel processing",
       journal  = "KSME International Journal",
       volume   = "18",
       number   = "7",
       pages    = "1121--1130",
       month    = JUL,
       year     = "2004",
       abstract = "The Genetic Algorithm (GA), an optimization technique based on
        the theory of natural selection, has proven to be a relatively
        robust means of searching for global optimum. It converges to
        the global optimum point without auxiliary information such as
        differentiation of function. In the case of a complex problem,
        the GA involves a large population number and requires a lot of
        computing time. To improve the process, this research used
        parallel processing with several personal computers. Parallel
        process technique is classified into two methods according to
        subpopulation's size and number. One is the fine-grained method
        (FGM), and the other is the coarse-grained method (CGM). This
        study selected the CGM as a parallel process technique because
        the load is equally divided among several computers. The given
        design domain should be reduced according to the degree of
        feasibility, because mechanical system problems have
        constraints. The reduced domain is used as an initial design
        domain. It is consistent with the feasible domain and the
        infeasible domain around feasible domain boundary. This parallel
        process used the Message Passing Interface library."
}

%
% From Rolf Rabenseifner (in 1999, but sent in HTML instead of BibTeX format)

@InProceedings{raben99:mpi-perf,
  author = 	 {Rolf Rabenseifner},
  title = 	 {Automatic Profiling of {MPI} Applications with
     Hardware Performance Counters},
  booktitle =	 {Recent Advances in Parallel Virtual Machine and Message
                  Passing Interface. 6th European {PVM/MPI} Users'
                  Group Meeting},
  pages =	 {35--42},
  year =	 1999,
  editor =	 {J. Dongarra and others},
  volume =	 1697,
  series =	 lncs,
  pspaper =     {http://www.hlrs.de/people/rabenseifner/publ/profiling_pvmmpi99_29.ps.gz},
  pdfpaper =    {http://www.hlrs.de/people/rabenseifner/publ/profiling_pvmmpi99_29.pdf},
  abstract = {This paper presents
     an automatic counter instrumentation and profiling module added
     to the MPI library on Cray T3E and SGI Origin2000 systems.
     A detailed summary of the hardware performance counters and the
     MPI calls of any MPI production program is gathered during
     execution and written in MPI_Finalize on a special syslog file.
     The user can get the same information in a different file.
     Statistical summaries are computed weekly and monthly.
     The paper describes experiences with this library on the Cray T3E
     systems at HLRS Stuttgart and TU Dresden.  It focuses on
     the problems integrating the hardware performance counters into
     MPI counter profiling and presents first results with these
     counters.
     Also, a second software design is described that allows the
     integration of the profiling layer into a dynamic shared object
     MPI library without consuming the user's PMPI profiling interface.}
}


@inproceedings{rgns99:mpi-perf,
  author    = "Rolf Rabenseifner and Peter Gottschling and
               Wolfgang E. Nagel and Stephan Seidl",
  title     = "Effective Performance Problem Detection of {MPI} Programs
               on {MPP} Systems: From the Global View to the Detail",
  booktitle = "Proceedings of Parallel Computing 99 (ParCo99), Delft,
               The Netherlands",
  year      = "1999",
  pspaper   = "http://www.hlrs.de/people/rabenseifner/publ/profiling_parco99_4_llncs.ps.gz",
  abstract  = "This paper presents an automatic counter instrumentation
               and profiling module added to the MPI library on Cray T3E
               systems. A detailed summary of the hardware performance
               counters and the MPI calls of any MPI production program
               is gathered during execution and written on a special
               syslog file. The user can get the same information in a
               different file. Statistical summaries are computed weekly
               and monthly. The paper describes experiences with this
               library on the Cray T3E systems at HLRS Stuttgart and TU
               Dresden. It focuses on the scalability aspects of the new
               interface: How to get the right amount of performance data
               to the right person in time, and how to draw conclusions
               for the further optimization process, e.g. with the
               trace-based profiling tool VAMPIR."
}



% Entry repaired: it previously had an empty citation key and was closed
% right after the title, which left booktitle, pages, year, month, the
% paper URL and the abstract outside the entry (and therefore ignored).
% NOTE(review): the key below is new; confirm it is unique in this file.
@InProceedings{raben99:mpi-profiling,
  author = 	 {Rolf Rabenseifner},
  title = 	 {Automatic {MPI} Counter Profiling of All Users:
                  First Results on a {CRAY T3E} 900-512},
  booktitle =	 {Proceedings of the
     Message Passing Interface
     Developer's and User's Conference (MPIDC'99), Atlanta},
  pages =	 {77--85},
  year =	 1999,
  month =	 MAR,
  pspaper =      {http://www.hlrs.de/people/rabenseifner/publ/profiling_mpidc99_paper.ps.gz},
  abstract = {This paper presents
     an automatic counter instrumentation and profiling module added
     to the MPI library on our Cray T3E.
     A statistical summary of the MPI calls of any MPI partition
     is gathered during
     execution and written in MPI_Finalize on a special syslog file.
     The user can get the same statistical information on a file.
     Weekly and monthly a statistical summary is computed and the user
     specific part is sent by mail to each user.
     A summary of 6 months is presented in this paper.
     It is the first time that all MPI applications on such a large
     system were automatically instrumented and profiled for such a
     period.
     The statistics give new insight in how efficiently the MPP system
     is really used by the MPI applications.
     Moreover, it gives hints which application and which MPI routine
     should be optimized.
     The software is portable to other systems.}
}

 
 

@InProceedings{rab98:mpi-interop,
  author = 	 {Rolf Rabenseifner},
  title = 	 {{MPI-GLUE}: Interoperable high-performance {MPI} combining
     different vendor's {MPI} worlds},
  booktitle =	 {Proceedings of Euro-Par '98, Southampton, UK},
  pages =	 {563--569},
  year =	 1998,
  series =	 lncs,
  month =	 SEP,
  publisher =	 {Springer-Verlag},
  pspaper = {http://www.hlrs.de/people/rabenseifner/publ/mpi_glue_europar98.ps.gz},
  abstract = {Several metacomputing projects try to implement MPI
     for homogeneous and heterogeneous clusters of parallel systems.
     MPI-GLUE is the first approach which exports nearly full MPI 1.1
     to the user's application without losing the efficiency of
     the vendors' MPI implementations.
     Inside of each MPP or PVP system the vendor's MPI implementation
     is used.  Between the parallel systems a slightly modified
     TCP-based MPICH is used, i.e. MPI-GLUE is a layer that combines
     different vendors' MPIs by using MPICH as a global communication
     layer.  Major design decisions within MPI-GLUE and other
     metacomputing MPI libraries (PACX-MPI, PVMPI, Globus and PLUS)
     and their implications for the programming model are compared.
     The design principles are explained in detail.}
}
 
 
% (stray HTML markup left over from the original HTML submission removed)

@techreport{rab98:mpi-iterop-long,
  author      = "Rolf Rabenseifner",
  title       = "{MPI-GLUE}: Interoperable high-performance {MPI}
                 combining different vendor's {MPI} worlds",
  institution = "Computing Center University of Stuttgart",
  year        = "1998",
  pspaper     = "http://www.hlrs.de/people/rabenseifner/publ/mpi_glue_tr_apr98.ps.gz",
  abstract    = "Several metacomputing projects try to implement MPI for
                 homogeneous and heterogeneous clusters of parallel
                 systems. MPI-GLUE is the first approach which exports
                 nearly full MPI 1.1 to the user's application without
                 losing the efficiency of the vendors' MPI
                 implementations. Inside of each MPP or PVP system the
                 vendor's MPI implementation is used. Between the
                 parallel systems a slightly modified TCP-based MPICH is
                 used, i.e. MPI-GLUE is a layer that combines different
                 vendors' MPIs by using MPICH as a global communication
                 layer. Major design decisions within MPI-GLUE and other
                 metacomputing MPI libraries (PACX-MPI, PVMPI, Globus and
                 PLUS) and their implications for the programming model
                 are compared. The design principles are explained in
                 detail."
}

% End of 1999 papers from Rabenseifner

@Article{BSchm2004,
       author   = "B. Schmidt and L. Feng and A. Laud and Y. Santoso",
       title    = "Development of distributed bioinformatics applications with {gMP}",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "9",
       pages    = "945--959",
       month    = AUG,
       year     = "2004",
       abstract = "We present the design and use of gMP as a tool for developing
        distributed bioinformatics applications. gMP is a purely
        Java-based interface that adds MPI-like message-passing and
        collective communication to the genomics Research Network
        Architecture (gRNA). The gRNA is a highly programmable,
        modular environment specifically designed to invigorate the
        development of genome-centric tools for life science research.
        We demonstrate the development of a distributed application to
        detect regulatory elements using correlation with gene
        expression data. Its implementation with gMP leads to
        significant runtime savings on our distributed gRNA system."
}
@Article{NJRiz2004,
       author   = "N. J. Rizk",
       title    = "Parallelization of {IBD} computation for determining genetic disease maps",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "9",
       pages    = "933--943",
       month    = AUG,
       year     = "2004",
       abstract = "A number of software packages are available for the construction
        of comprehensive human genetic maps. In this paper we
        parallelize the widely used package Genehunter. We restrict our
        attention to only one function of the package, namely the
        computations of Identity By Descent (IBD) genes of a family. We
        use a master-slave model with the Message Passing Interface
        parallel environment. Our tests are done on two different
        architectures: a network of workstations and a shared memory
        multiprocessor. A new and efficient strategy to classify the
        parallelization of genetic linkage analysis programs results
        from our experiments. The classification is based on values of
        parameters which affect the complexity of the computation."
}

@Article{KHyou2004,
       author   = "K. Hyoudou and R. Ozaki and Y. Nakayama",
       title    = "A {PC} cluster system employing {IEEE} 1394",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "10",
       pages    = "989--1003",
       month    = AUG,
       year     = "2004",
       abstract = "In this paper, we describe the design and evaluation of a PC
        cluster system in which IEEE 1394 is applied. Networks for
        parallel cluster computing require low latency and high
        bandwidth. It is also important that the networks be commercially
        available at low cost. Few network devices satisfy all of the
        above requirements. However, the IEEE 1394 standard provides a
        good compromise for fulfilling these requirements. We have used
        IEEE 1394 devices, which support a 400 Mbps data transfer rate,
        to connect the nodes of a PC cluster system which we have
        designed and implemented. We have implemented two communication
        libraries. One is a fast communication library called CF for
        IEEE 1394. The other is a MPI layer library on the CF library.
        Experimental results show that CF achieves a 17.2 microsecond
        round-trip time. On application benchmarks, the system was
        considerably faster than TCP/IP over Fast Ethernet. Even though
        the system was constructed at very low cost, it provides good
        performance. Using the IEEE 1394 standard is thus a good
        solution for low-cost cluster systems."
}

@Article{GRLue2004,
       author   = "G. R. Luecke and S. Spanoyannis and M. Kraeva",
       title    = "The performance and scalability of {SHMEM} and {MPI}-2 one-sided routines on a {SGI} {Origin} 2000 and a {Cray} {T3E}-600",
       journal  = "Concurrency and Computation-Practice \& Experience",
       volume   = "16",
       number   = "10",
       pages    = "1037--1060",
       month    = AUG,
       year     = "2004",
       abstract = "This paper compares the performance and scalability of SHMEM and
        MPI-2 one-sided routines on different communication patterns for
        a SGI Origin 2000 and a Cray T3E-600. The communication tests
        were chosen to represent commonly used communication patterns
        with low contention (accessing distant messages, a circular
        right shift, a binary tree broadcast) to communication patterns
        with high contention (a 'naive' broadcast and an all-to-all).
        For all the tests and for small message sizes, the SHMEM
        implementation significantly outperformed the MPI-2
        implementation for both the SGI Origin 2000 and Cray T3E-600."
}
@Article{FItoa2004,
       author   = "F. Ito and N. Amemiya",
       title    = "Application of parallelized {SOR} method to electromagnetic field analysis of superconductors",
       journal  = "IEEE Transactions on Applied Superconductivity",
       volume   = "14",
       number   = "2",
       pages    = "1874--1877",
       month    = JUN,
       year     = "2004",
       abstract = "PC cluster systems are becoming popular in the field of
        high-performance computing. The authors have been studying the
        electromagnetic field analysis of high T-c superconductors (HTS)
        by the finite element method (FEM) for AC loss estimations.
        Superconductors are highly nonlinear electromagnetic media, and
        considerable computation time is required to calculate the
        temporal evolution of the electromagnetic field distribution in
        superconductors. In this study, parallel computing techniques
        were applied to the electromagnetic field analysis of HTSs by
        the finite element method. In one of the FEM codes used by the
        authors, the successive over-relaxation (SOR) method, was used to
        solve a system of equations. This part was parallelized using
        the multicolor SOR method, and it helped in reducing computation
        time. Data are explicitly passed between the processor elements
        (PEs) through an MPI. First, the HTS model was analyzed under
        one operating condition, using the code implemented multicolor
        SOR method, using four PEs, and the parallelization efficiency
        was confirmed. Next, the generality of the efficiency of the
        multicolor SOR method in our numerical simulation was examined
        under several operating conditions, using different models."
}

@Article{JSSim2004,
       author   = "J. S. Sims and N. Martys",
       title    = "Simulation of sheared suspensions with a parallel implementation of {QDPD}",
       journal  = "Journal of Research of the National Institute of Standards and Technology",
       volume   = "109",
       number   = "2",
       pages    = "267--277",
       month    = mar-apr,
       year     = "2004",
       abstract = "A parallel quaternion-based dissipative particle dynamics (QDPD)
        program has been developed in Fortran to study the flow
        properties of complex fluids subject to shear. The
        parallelization allows for simulations of greater size and
        complexity and is accomplished with a parallel link-cell spatial
        (domain) decomposition using MPI. The technique has novel
        features arising from the DPD formalism, the use of rigid body
        inclusions spread across processors, and a sheared boundary
        condition. A detailed discussion of our implementation is
        presented, along with results on two distributed memory
        architectures. A parallel speedup of 24.19 was obtained for a
        benchmark calculation on 27 processors of a distributed memory
        cluster."
}
@Article{JKKoo2004,
       author   = "J. K. Koontz and S. C. Dey and S. Chatterjee and S. K. Dey",
       title    = "{MPI}-implementation of {PFI}-code for numerical modeling of the anatomy of breast cancer",
       journal  = "International Journal of Computer Mathematics",
       volume   = "81",
       number   = "8",
       pages    = "991--999",
       month    = AUG,
       year     = "2004",
       abstract = "Large-scale parallelized distributed computing has been
        implemented in the message passing interface (MPI) environment
        to solve numerically, eight reaction-diffusion equations
        representing the anatomy and treatment of breast cancer. The
        numerical algorithm is perturbed functional iterations (PFI)
        which is completely matrix-free. Fully distributed computations
        with multiple processors have been implemented on a large scale
        in the serial PFI-code in the MPI environment. The technique of
        implementation is general and can be applied to any serial code.
        This has been validated by comparing the computed results from
        the serial code and those from the MPI-version of the parallel
        code."
}
% NOTE(review): month was the macro FAL, which is not among the string
% macros defined at the top of this file; it is spelled out literally here.
@Article{YCotr2004,
       author   = "Y. Cotronis",
       title    = "Composition of message passing interface applications over {MPICH-G2}",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "3",
       pages    = "327--339",
       month    = "Fall",
       year     = "2004",
       abstract = "Coupling grid applications requires code modification and high
        S/W engineering effort. We propose the Ensemble methodology in
        which message passing components are developed separately and
        applications, whether regular, irregular, SPMD or MPMD, are
        composed without component modification. Composed applications
        are pure Message Passing Interface programs running on
        MPICH-G2. We demonstrate our approach by developing two
        simplified atmospheric and ocean components, which may run on
        their own or coupled together (climate model) in any required
        configuration depending on geography or other design issues."
}
% NOTE(review): month was the macro FAL, which is not among the string
% macros defined at the top of this file; it is spelled out literally here.
@Article{WGrop2004,
       author   = "W. Gropp and E. Lusk",
       title    = "Fault tolerance in {Message Passing Interface} programs",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "3",
       pages    = "363--372",
       month    = "Fall",
       year     = "2004",
       abstract = "In this paper we examine the topic of writing fault-tolerant
        Message Passing Interface (MPI) applications. We discuss the
        meaning of fault tolerance in general and what the MPI Standard
        has to say about it. We survey several approaches to this
        problem, namely checkpointing, restructuring a class of standard
        MPI programs, modifying MPI semantics, and extending the MPI
        specification. We conclude that, within certain constraints, MPI
        can provide a useful context for writing application programs
        that exhibit significant degrees of fault tolerance."
}
% NOTE(review): month was the macro FAL, which is not among the string
% macros defined at the top of this file; it is spelled out literally here.
@Article{OSiev2004,
       author   = "O. Sievert and H. Casanova",
       title    = "A simple {MPI} process swapping architecture for iterative applications",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "3",
       pages    = "341--352",
       month    = "Fall",
       year     = "2004",
       abstract = "Parallel computing is now popular and mainstream, but
        performance and ease of use remain elusive to many end-users.
        There exists a need for performance improvements that can be
        easily retrofitted to existing parallel applications. In this
        paper we present MPI process swapping, a simple performance
        enhancing add-on to the MPI programming paradigm. MPI process
        swapping improves performance by dynamically choosing the best
        available resources throughout application execution, using MPI
        process over-allocation and real-time performance measurement.
        Swapping provides fully automated performance monitoring and
        process management, and a rich set of primitives to control
        execution behavior manually or through an external tool.
        Swapping, as defined in this implementation, can be added to
        iterative MPI applications and requires as few as three lines of
        source code change. We verify our design for a particle dynamics
        application on desktop resources within a production commercial
        environment."
}

% NOTE(review): month was the macro FAL, which is not among the string
% macros defined at the top of this file; it is spelled out literally here.
@Article{GEFag2004,
       author   = "G. E. Fagg and J. J. Dongarra",
       title    = "Building and using a fault-tolerant {MPI} implementation",
       journal  = "International Journal of High Performance Computing Applications",
       volume   = "18",
       number   = "3",
       pages    = "353--361",
       month    = "Fall",
       year     = "2004",
       abstract = "In this paper we discuss the design and use of a fault-tolerant
        MPI (FT-MPI) that handles process failures in a way beyond that
        of the original MPI static process model. FT-MPI allows the
        semantics and associated modes of failures to be explicitly
        controlled by an application via a modified functionality within
        the standard MPI 1.2 API. Given is an overview of the FT-MPI
        semantics, architecture design, example usage and sample
        applications. A short discussion is given on the consequences of
        designing a fault-tolerant MPI both in terms of how such an
        implementation handles failures at multiple levels internally as
        well as how existing applications can use new features while
        still remaining within the MPI standard."
}

@Article{KJere2004,
       author   = "J. Kepner and S. Ahalt",
       title    = "Matlab{MPI}",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "64",
       number   = "8",
       pages    = "997--1005",
       month    = AUG,
       year     = "2004",
       abstract = "In many projects the true costs of high performance computing
        are currently dominated by software. Addressing these costs may
        require shifting to higher level languages such as Matlab.
        MatlabMPI is a Matlab implementation of the Message Passing
        Interface (MPI) standard and allows any Matlab program to
        exploit multiple processors. MatlabMPI currently implements the
        basic six functions that are the core of the MPI point-to-point
        communications standard. The key technical innovation of
        MatlabMPI is that it implements the widely used MPI ``look and
        feel'' on top of standard Matlab file I/O, resulting in an
        extremely compact ($\sim$350 lines of code) and ``pure''
        implementation which runs anywhere Matlab runs, and on any
        heterogeneous combination of computers. The performance has been
        tested on both shared and distributed memory parallel computers
        (e.g. Sun, SGI, HP, IBM, Linux, MacOSX and Windows). MatlabMPI
        can match the bandwidth of C based MPI at large message sizes. A
        test image filtering application using MatlabMPI achieved a
        speedup of $\sim$300 using 304 CPUs and $\sim$15\% of the
        theoretical peak (450 Gigaflops) on an IBM SP2 at the Maui High
        Performance Computing Center. In addition, this entire parallel
        benchmark application was implemented in 70
        software-lines-of-code, illustrating the high productivity of
        this approach."
}
@Article{SLLie2004,
  author   = {S. L. Liebling},
  title    = {The nonlinear sigma model with distributed adaptive mesh refinement},
  journal  = {Classical and Quantum Gravity},
  volume   = 21,
  number   = 16,
  pages    = {3995--4003},
  month    = AUG,
  year     = 2004,
  abstract = {An adaptive mesh refinement scheme is implemented in a
        distributed environment using message passing interface to find
        solutions to the nonlinear sigma model. In a previous work, I
        studied the behaviour similar to black hole critical phenomena
        at the threshold for singularity formation in this flat-space
        model. The present study is a follow-up describing extensions to
        distribute the grid hierarchy and presenting tests showing the
        correctness of the model.}
}
@Article{TNaga2004,
       author   = "T. Nagata",
       title    = "Variable-gain constraint stabilization for general multibody systems with applications",
       journal  = "Journal of Vibration and Control",
       volume   = "10",
       number   = "9",
       pages    = "1335--1357",
       month    = SEP,
       year     = "2004",
       abstract = "This paper presents a general and efficient formulation
        applicable to a vast variety of rigid and flexible multibody
        systems. It is based on a variable-gain error correction with
        scaling and adaptive control of the convergence parameter. The
        methodology has the following distinctive features. (i) All
        types of holonomic and non-holonomic equality constraints as
        well as a class of inequalities can be treated in a plain and
        unified manner. (ii) Stability of the constraints is assured.
        (iii) The formulation has an order N computational cost in terms
        of both the constrained and unconstrained degrees of freedom,
        regardless of the system topology. (iv) Unlike the traditional
        recursive order N algorithms, it is quite amenable to parallel
        computation. (v) Because virtually no matrix operations are
        involved, it can be implemented to very simple general-purpose
        simulation programs. Noting the advantages, the algorithm has
        been realized as a C++ code supporting distributed processing
        through the Message-Passing Interface (MPI). Versatility,
        dynamical validity and efficiency of the approach are
        demonstrated through numerical studies of several particular
        systems including a crawler and a flexible space structure."
}
@Article{HIwas2004,
       author   = "H. Iwasaki and Z. J. Hu",
       title    = "A new parallel skeleton for general accumulative computations",
       journal  = "International Journal of Parallel Programming",
       volume   = "32",
       number   = "5",
       pages    = "389--414",
       month    = OCT,
       year     = "2004",
       abstract = "Skeletal parallel programming enables programmers to build a
        parallel program from ready-made components (parallel
        primitives) for which efficient implementations are known to
        exist, making both the parallel program development and the
        parallelization process easier. Constructing efficient parallel
        programs is often difficult, however, due to difficulties in
        selecting a proper combination of parallel primitives and in
        implementing this combination without having unnecessary
        creations and exchanges of data among parallel primitives and
        processors. To overcome these difficulties, we propose a
        powerful and general parallel skeleton, accumulate, which can be
        used to naturally code efficient solutions to problems as well
        as be efficiently implemented in parallel using Message Passing
        Interface (MPI)."
}
@Article{HTLiu2004,
       author   = "H. T. Liu and B. H. Li and D. S. Qi",
       title    = "Novel parallel acceleration technique for shooting-and-bouncing ray launching algorithm",
       journal  = "{IEICE} Transactions on Electronics",
       volume   = "E87-C",
       number   = "9",
       pages    = "1463--1466",
       month    = SEP,
       year     = "2004",
       abstract = "A novel parallel acceleration technique is proposed based on
        intrinsic parallelism characteristics of shooting-and-bouncing
        ray launching (SBR) algorithm, which has been implemented using
        the MPI parallel library on common PC cluster instead of
        dedicated parallel machines. The results reveal that the new
        technique achieves very large speedup gains and could be the
        efficient and low-cost propagation prediction solution."
}

@Article{NTyag2004,
       author   = "N. Tyagi and A. Bose and I. J. Chetty",
       title    = "Implementation of the {DPM} {M}onte {C}arlo code on a parallel architecture for treatment planning applications",
       journal  = "Medical Physics",
       volume   = "31",
       number   = "9",
       pages    = "2721--2725",
       month    = SEP,
       year     = "2004",
       abstract = "We have parallelized the Dose Planning Method (DPM), a Monte
        Carlo code optimized for radiotherapy class problems, on
        distributed-memory processor architectures using the Message
        Passing Interface (MPI). Parallelization has been investigated
        on a variety of parallel computing architectures at the
        University of Michigan-Center for Advanced Computing, with
        respect to efficiency and speedup as a function of the number of
        processors. We have integrated the parallel pseudo random number
        generator from the Scalable Parallel Pseudo-Random Number
        Generator (SPRNG) library to run with the parallel DPM. The Intel
        cluster consisting of 800 MHz Intel Pentium III processor shows
        an almost linear speedup up to 32 processors for simulating
        $1 \times 10^8$ or more particles. The speedup results are nearly linear
        on an Athlon cluster (up to 24 processors based on availability)
        which consists of 1.8 GHz+ Advanced Micro Devices (AMD) Athlon
        processors on increasing the problem size up to $8 \times 10^8$
        histories. For a smaller number of histories ($1 \times 10^8$) the
        reduction of efficiency with the Athlon cluster (down to 83.9\%
        with 24 processors) occurs because the processing time required
        to simulate $1 \times 10^8$ histories is less than the time associated
        with interprocessor communication. A similar trend was seen with
        the Opteron Cluster (consisting of 1400 MHz, 64-bit AMD Opteron
        processors) on increasing the problem size. Because of the
        64-bit architecture Opteron processors are capable of storing
        and processing instructions at a faster rate and hence are
        faster as compared to the 32-bit Athlon processors. We have
        validated our implementation with an in-phantom dose calculation
        study using a parallel pencil monoenergetic electron beam of 20
        MeV energy. The phantom consists of layers of water, lung, bone,
        aluminum, and titanium. The agreement in the central axis depth
        dose curves and profiles at different depths shows that the
        serial and parallel codes are equivalent in accuracy."
}

@Article{TTamu2004,
       author   = "T. Tamura and G. H. Lu and R. Yamamoto and M. Kohyama and S. Tanaka and Y. Tateizumi",
       title    = "{MPI} parallelization of the first-principles pseudopotential method program with respect to each band",
       journal  = "Modelling and Simulation in Materials Science and Engineering",
       volume   = "12",
       number   = "5",
       pages    = "945--957",
       month    = SEP,
       year     = "2004",
       abstract = "We have developed an efficient parallelized program for
        large-scale calculations in the framework of the
        first-principles pseudopotential method. This program uses the
        residual minimization method-direct inversion in the iterative
        subspace method which was originally proposed by Pulay and
        recently implemented by Kresse and Furthm{\"u}ller, to efficiently
        obtain the electronic ground state. This method is more suitable
        for parallel computations with respect to each band than other
        algorithms such as Car-Parrinello or conjugate-gradient schemes.
        We have tested the message passing interface (MPI)-parallel
        computations of the program applied to relatively large systems
        using two kinds of parallel computer, the multinode supercomputer
        and the personal computer cluster. We have found that the
        developed program can attain a very high efficiency of
        parallelization for the two kinds of computers."
}

@Article{WSiri2004,
       author   = "W. Siripunvaraporn and M. Uyeshima and G. Egbert",
       title    = "Three-dimensional inversion for {N}etwork-{M}agnetotelluric data",
       journal  = "Earth Planets and Space",
       volume   = "56",
       number   = "9",
       pages    = "893--902",
       year     = "2004",
       abstract = "Three-dimensional inversion of Network-Magnetotelluric (MT) data
        has been implemented. The program is based on a conventional 3-D
        MT inversion code (Siripunvaraporn et al., 2004), which is a
        data space variant of the OCCAM approach. In addition to
        modifications required for computing Network-MT responses and
        sensitivities, the program makes use of Message Passing
        Interface (MPI) software, with allowing computations for each
        period to be run on separate CPU nodes. Here, we consider
        inversion of synthetic data generated from simple models
        consisting of a 1 $\Omega$-m conductive block buried at varying
        depths in a 100 $\Omega$-m background. We focus in particular on
        inversion of long period (320-40,960 seconds) data, because
        Network-MT data usually have high coherency in these period
        ranges. Even with only long period data the inversion recovers
        shallow and deep structures, as long as these are large enough
        to affect the data significantly. However, resolution of the
        inversion depends greatly on the geometry of the dipole network,
        the range of periods used, and the horizontal size of the
        conductive anomaly."
}

@Article{ZQChe2004,
       author   = "Z. Q. Chen and A. Delis and H. L. Bertoni",
       title    = "Radio-wave propagation prediction using ray-tracing techniques on a network of workstations ({NOW})",
       journal  = "Journal of Parallel and Distributed Computing",
       volume   = "64",
       number   = "10",
       pages    = "1127--1156",
       month    = OCT,
       year     = "2004",
       abstract = "We also address issues regarding main memory consumption,
        intermediate data assembly, and final prediction generation. We
        implement our proposed computational model on a NOW
        configuration by using the message passing interface (MPI)
        standard. Our experiments with real and synthetic building and
        terrain databases show that, when no constraint is imposed on
        the main memory consumption, the proposed prediction model
        performs very well and achieves nearly linear speedups under
        various workload. When main memory consumption is a concern, our
        model still delivers very promising performance rates provided
        that the complexity of the involved computation is high, so that
        the extra computation and communication overhead introduced by
        the proposed model do not dominate the original computation. The
        accuracy of prediction results and the achievable speedup rates
        can be significantly improved when 3D building and terrain
        databases are used and/or diffuse scattering effect is taken
        into account."
}

@Article{FDeis2004,
       author   = "F. Deister and U. Tremel and O. Hassan and N. P. Weatherill",
       title    = "Fully automatic and fast mesh size specification for unstructured mesh generation",
       journal  = "Engineering with Computers",
       volume   = "20",
       number   = "3",
       pages    = "237--248",
       year     = "2004",
       abstract = "A fully automatic surface mesh generation system is presented in
        this paper. The automation is achieved by an automatic
        determination of a consistent mesh size distribution, which is
        based on geometry rasterisation. The user specifies a minimal
        and maximal allowed mesh size, and a maximal allowed curvature
        angle for the complete geometry, or, rather, parts of it. Now,
        these local curvature and local characteristic lengths of the
        geometry are computed, which determine the local mesh size.
        These local mesh sizes are stored and smoothed in a Cartesian
        background mesh. Afterwards, the triangulation is generated by
        an advancing front triangulator: the local resolution of the
        surface triangulation is determined by the mesh sizes stored in
        the Cartesian background mesh. The object-oriented design and
        implementation is described. The complete system is very fast
        due to an efficient parallelisation based on MPI for computer
        systems with distributed memory."
}
