% Papers using MPI
%
% This is a partial list, begun in late October, 1997. It is intended to give
% an example of the range of applications that are known to be using
% MPI.
% Note that some author lists are incomplete; if you have a more
% complete reference, please send it to gropp mcs.anl.gov .



@Article{CooFinTseYor97:mpi-groups,
  author   = {G. Cooperman and L. Finkelstein and M. Tselman and B. York},
  title    = {Constructing permutation representations for matrix groups},
  journal  = {Journal of Symbolic Computation},
  volume   = 24,
  number   = {3--4},
  pages    = {471--488},
  month    = sep # "--" # oct,
  year     = 1997,
  abstract = {The theory has been successfully tested on a representation of the sporadic simple group Ly, discovered by Lyons (1972). With no a priori assumptions, we find a permutation representation of degree 9606125 on a conjugacy class of subgroups of order 3, find the order of the resulting permutation group, and verify simplicity. A Monte Carlo variation of the algorithm was used to achieve better space and time efficiency. The construction of the permutation representation required four CPU days on a SPARC-server 670MP with 64 MB. The permutation representation was used implicitly in the sense that the group element was stored as a matrix, and its permutation action on a ``point'' was determined using a pre-computed data structure. Thus, additional computations required little additional space. The algorithm has also been implemented using the MasPar MP-1 SIMD parallel computer and 8 SPARC-2's running under MPI. The results of those parallel experiments are briefly reviewed.}
}


@Article{AhuLon97:mpi-rk-scattering,
  author   = {V. Ahuja and L. N. Long},
  title    = {A parallel finite-volume {Runge-Kutta} algorithm for electromagnetic scattering},
  journal  = {Journal of Computational Physics},
  volume   = 137,
  number   = 2,
  pages    = {299--320},
  month    = nov,
  year     = 1997,
  abstract = {A 3D explicit finite volume algorithm has been developed to simulate scattering from complex geometries on parallel computers using structured body conformal curvilinear grids. Most simulations for practical 3D geometries require a large number of grid points for adequate spatial resolution making them suitable to parallel computation. The simulations have been carried out using a multi-block/zonal approach in the message passing paradigm on the SP-2. Each zone is placed on a separate processor and interprocessor communication is carried out using the Message Passing Library/Interface (MPL/MPI). Integration of Maxwell's equations is performed using the four-stage Runge-Kutta time integration method on a dual grid. This method of integrating on a staggered grid gives enhanced dissipative and dispersive characteristics. A scattered field formulation has been used and the Liao boundary condition is used at the outer nonreflecting boundary. The far zone transformation has also been implemented efficiently, using specialized MPL functions to evaluate the far zone scattering results. Results show extremely good comparisons for scattering from the sphere and the ogive with the exact solution and standard FDTD type algorithms. Comparisons for nonaxisymmetric targets like the NASA almond with experimental data has also been found to be extremely good.}
}

@Article{GorBi98,
  author   = {S. Gorlatch and H. Bischof},
  title    = {A Generic {MPI} Implementation for a Data-Parallel Skeleton: Formal Derivation and Application to {FFT}},
  journal  = {Parallel Processing Letters},
  volume   = 8,
  number   = 4,
  pages    = {447--458},
  month    = dec,
  year     = 1998,
  abstract = {We derive a provably correct, architecture-independent family of parallel implementations for a class of data-parallel algorithms, called DH (distributable homomorphisms). The implementations are well-structured SPMD programs with group-wise personalized all-to-all exchange, directly realizable in MPI. As a case study, we systematically adjust the mathematical specification of the Fast Fourier Transform (FFT) to the DH format and, thereby, obtain a generic SPMD implementation for FFT. The target program includes FFT solutions used in practice -- the binary-exchange and the 2D- and 3D-transpose -- as special cases.}
}
@Article{YevCinZhu98:mpi-groundwatersim,
  author   = {G. Yevi and P. Cinnella and X. Zhuang},
  title    = {On parallelizing a groundwater pollution simulator},
  journal  = {Applied Mathematics and Computation},
  volume   = 89,
  number   = {1--3},
  pages    = {313--325},
  month    = jan # "--" # feb,
  year     = 1998,
  abstract = {Domain decomposition strategies and computational mesh reordering are discussed for finite difference parallel simulations of groundwater contaminants transport. The parallel performance of point iterative methods traditionally used in groundwater pollution modelling is studied. The algorithms were implemented with red-black and wavefront reordering of the computational mesh. A standard conservative transport equation defined on a two-dimensional grid with Dirichlet boundary conditions was used for the analysis. Completely portable multiple instructions multiple data (MIMD) implementations of the algorithm were performed using message-passing interface (MPI). The runtimes of the algorithms are presented as a function of grid refinement and number of processors, and the communication overhead of the parallel simulation process is investigated, showing that the red-black reordering technique yields the best performance results. The method also provides higher efficiency and scalability when applied to large-scale problems. Optimal parameters are suggested for parallel simulation of groundwater pollution using finite difference schemes.}
}



@Article{Ian97:mpi-reducescatter,
  author   = {G. Iannello},
  title    = {Efficient algorithms for the reduce-scatter operation in {LogGP}},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  volume   = 8,
  number   = 9,
  pages    = {970--982},
  month    = sep,
  year     = 1997,
  abstract = {We consider the problem of efficiently performing a reduce-scatter operation in a message passing system. Reduce-scatter is the composition of an element-wise reduction on vectors of n elements initially held by n processors, with a scatter of the resulting vector among the processors. In this paper, we present two algorithms for the reduce-scatter operation, designed in LogGP. The first algorithm assumes an associative and commutative reduction operator and it is optimal in LogGP within a small constant factor. The second algorithm allows the reduction operator to be noncommutative, and it is asymptotically optimal when values to be combined are large arrays. To achieve these results, we developed a complete analysis of both algorithms in LogGP, including the derivation of lower bounds for the reduce-scatter operation, and the study of the m-item version of the problem, i.e., the case when the initial elements are vectors themselves. Reduce-scatter has been included as a collective operation in the MPI standard message passing library, and can be used, for instance, in parallel matrix-vector multiply when the matrix is decomposed by columns. To model a message passing system, we adopted the LogGP model, an extension of LogP that allows the modeling of messages of different length. While this choice makes the analysis somewhat more complex, it leads to more realistic results in the case of gather/scatter algorithms.}
}



@Article{YuaSalBalMel97:mpi-load-balancing,
  author   = {X. Yuan and G. Salisbury and D. Balsara and R. Melhem},
  title    = {A load balancing package on distributed memory systems and its application to particle-particle particle-mesh ({P3M}) methods},
  journal  = {Parallel Computing},
  volume   = 23,
  number   = 10,
  pages    = {1525--1544},
  month    = nov,
  year     = 1997,
  abstract = {We present a tool, Bisect, for balanced decomposition of spatial domains. In addition to applying a nested bisection algorithm to determine the boundaries of each subdomain, Bisect replicates a user specified zone along the boundaries of the subdomain in order to minimize future interactions between subdomains. Results of running the tool on the Cray T3D system using both shared memory operations and MPI communications are reported and discussed. In addition, Bisect is used in a parallel implementation of a particle-particle/particle-mesh (P3M) simulation program on the Cray T3D system. The performance of the P3M program with different load-balancing criteria is evaluated and compared. The results show that the use of the Bisect package balances the load efficiently and minimizes communication on the T3D massively parallel system.}
}


@Article{FosKohKriCho97:mpi-task-parallel,
  author   = {I. Foster and D. R. Kohr and R. Krishnaiyer and A. Choudhary},
  title    = {A library-based approach to task parallelism in a data-parallel language},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = 45,
  number   = 2,
  pages    = {148--158},
  month    = sep,
  year     = 1997,
  abstract = {Pure data-parallel languages such as High Performance Fortran version 1 (HPF) do not allow efficient expression of mixed task/data-parallel computations or the coupling of separately compiled data-parallel modules. In this paper, we show how these common parallel program structures can be represented, with only minor extensions to the HPF model, by using a coordination library based on the Message Passing Interface (MPI). This library allows data-parallel tasks to exchange distributed data structures using calls to simple communication functions. We present microbenchmark results that characterize the performance of this library and that quantify the impact of optimizations that allow reuse of communication schedules in common situations. In addition, results from two-dimensional FFT, convolution, and multiblock programs demonstrate that the HPF/MPI library can provide performance superior to that of pure HPF. We conclude that this synergistic combination of two parallel programming standards represents a useful approach to task parallelism in a data-parallel framework, increasing the range of problems addressable in HPF without requiring complex compiler technology.}
}


@Article{BruGehRei97:mpi-resource-mgmt,
  author   = {M. Brune and J. Gehring and A. Reinefeld},
  title    = {Heterogeneous message passing and a link to resource management},
  journal  = {Journal of Supercomputing},
  volume   = 11,
  number   = 4,
  pages    = {355--369},
  year     = 1997,
  abstract = {PLUS is a light-weight, extensible and efficient communication interface. With only four commands, PLUS is almost transparent to the application code. Our current implementation supports inter-process communication between PVM, MPI and PARIX, but it can be easily extended to other vendor-specific message passing libraries. As PLUS has been designed for wide area networks, much effort has been spent on portability and on optimizing the communication speed across internet and also intranet links.}
}


@Article{Hor97,
  author   = {K. Hori},
  title    = {Supercomputer {SX-4} multinode system},
  journal  = {NEC Research \& Development},
  volume   = {38},
  number   = {4},
  pages    = {461--473},
  year     = {1997},
  abstract = {The NEC supercomputer SX-4 multinode system series consists of two models, one being HIPPI (High Performance Parallel Interface)-connected model and the other IXS (Internode Crossbar Switch)-connected model. With the IXS, a proprietary high-speed crossbar switch, the HPC (High Performance Computing) up to 1 TFLOPS (Tera Flops) has been enabled by providing the most comprehensive environment for distributed parallel processing. This also means the world's first implementation of a clustered parallel processing. In this paper, we describe the functions of IXS hardware, the new operating system functions, MPI/SX the MPI (Message Passing Interface) processor and NQS/MPI which supports the close cooperation between NQS (Network Queuing System) batch processing system and MPI.}
}


@Article{Fac97:mpi-load-balance,
  author   = {A. Fachat and K. H. Hoffmann},
  title    = {Implementation of ensemble-based simulated annealing with dynamic load balancing under {MPI}},
  journal  = {Computer Physics Communications},
  volume   = {107},
  number   = {1--3},
  pages    = {49--53},
  month    = dec,
  year     = {1997},
  abstract = {This paper describes an implementation of Ensemble Based Simulated Annealing (EBSA) with dynamic load balancing. It is running under the MPI Message Passing Library allowing parallel execution on various types of computers. The load balancing is used to get maximum use of the available processing power, even on heterogeneous workstation clusters where the machines differ a lot in computing power.}
}


@Article{BarHau98:mpi-app,
  author   = {E. Baron and P. H. Hauschildt},
  title    = {Parallel implementation of the {PHOENIX} generalized stellar atmosphere program. {II}. {Wavelength} parallelization},
  journal  = {Astrophysical Journal},
  volume   = 495,
  number   = {1 part 1},
  pages    = {370--376},
  month    = mar,
  year     = 1998,
  abstract = {We describe an important addition to the parallel implementation of our generalized nonlocal thermodynamic equilibrium (NLTE) stellar atmosphere and radiative transfer computer program PHOENIX. In a previous paper in this series we described data and task parallel algorithms we have developed for radiative transfer, spectral line opacity, and NLTE opacity and rate calculations. These algorithms divided the work spatially or by spectral lines, that is, distributing the radial zones, individual spectral lines, or characteristic rays among different processors and employ, in addition, task parallelism for logically independent functions (such as atomic and molecular line opacities). For finite, monotonic velocity fields, the radiative transfer equation is an initial value problem in wavelength, and hence each wavelength point depends upon the previous one. However, for sophisticated NLTE models of both static and moving atmospheres needed to accurately describe, e.g., novae and supernovae, the number of wavelength points is very large (200,000-300,000) and hence parallelization over wavelength can lead both to considerable speedup in calculation time and the ability to make use of the aggregate memory available on massively parallel supercomputers. Here, we describe an implementation of a pipelined design for the wavelength parallelization of PHOENIX, where the necessary data from the processor working on a previous wavelength point is sent to the processor working on the succeeding wavelength point as soon as it is known. Our implementation uses a MIMD design based on a relatively small number of standard message passing interface (MPI) library calls and is fully portable between serial and parallel computers.}
}

@Article{Yas98:complex-flows,
  author   = {O. Yasar},
  title    = {A scalable model for complex flows},
  journal  = {Computers and Mathematics with Applications},
  volume   = 35,
  number   = 7,
  pages    = {117--128},
  month    = apr,
  year     = 1998,
  abstract = {We describe a scalable parallel algorithm for numerical simulations of turbulent, radiative, magnetized, and reactive fluid + particle systems on message-passing distributed-memory computers. Accurate simulation of such complex flows has applications in engine combustion, industrial pulverized coal burners, astrophysics, inertial confinement fusion, nuclear systems, and many other strategically and economically important areas. Our algorithm has been developed based on a widely-used combustion code KIVA-3, a plasma and radiation hydrodynamics code R-MHD, a classical particle dynamics code CMDT, and a discrete ordinates particle transport code TORT. The development is being done on the Intel Paragon with PVM and MPI extensions. We report high levels of parallel efficiency and scalability (up to 1024 nodes) for a baseline engine test case, using our current message-passing reactive and turbulent flow code. The three-dimensional extension of radiation magnetohydrodynamics component is still being worked at and we hope to report further progress in the future.}
}


@Article{LepSchHei98:reactive-flow,
  author   = {J. Lepper and U. Schnell and K. R. G. Hein},
  title    = {Parallelization of a simulation code for reactive flows on the {Intel Paragon}},
  journal  = {Computers and Mathematics with Applications},
  volume   = 35,
  number   = 7,
  pages    = {101--109},
  month    = apr,
  year     = 1998,
  abstract = {The paper shows the implementation of a 3D simulation code for turbulent flow and combustion processes in full-scale utility boilers on an Intel Paragon XP/S computer. For the portable parallelization, an explicit approach is chosen using a domain decomposition method for the static subdivision of the numerical grid together with the SPMD programming model. The measured speedup for the presented case using a coarse grid is good, although some numerical requirements restrict the implemented message passing to strongly synchronized communication. On the Paragon, the NX message passing library is used for the computations. Furthermore, MPI and PVM are applied and their pros and cons on this computer are described. In addition to the basic message passing techniques for local and global communication, other possibilities are investigated. Besides the applicability of the vectorizing capability of the compiler, the influence of the I/O performance during computations is demonstrated. The scalability of the parallel application is presented for a refined discretization.}
}


@Article{Gor98:fft,
  author  = {S. Gorlatch},
  title   = {Programming with divide-and-conquer skeletons: A case study of {FFT}},
  journal = {Journal of Supercomputing},
  volume  = 12,
  number  = {1--2},
  pages   = {85--97},
  year    = 1998,
}

@Article{Hio98:qcd,
  author   = {S. Hioki},
  title    = {{QCDMPI}---pure {QCD} {Monte Carlo} Simulation code with {MPI}},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  volume   = 63,
  pages    = {1000--1002},
  month    = apr,
  year     = 1998,
  abstract = {In this paper, outline of QCDMPI is reported. Comparison of the performances on several parallel machines; AP1000, AP1000+, AP3000, Cenju-3, Paragon, SR2201 and Workstation Cluster, is also reported.}
}


% NOTE(review): the author is believed to be Per Brinch Hansen, whose surname
% is the compound "Brinch Hansen"; the comma form keeps BibTeX from treating
% "Brinch" as a given-name initial. Confirm against the published article.
@Article{Han98:mpi-eval,
  author   = {Brinch Hansen, P.},
  title    = {An evaluation of the message-passing interface},
  journal  = {ACM SIGPLAN Notices},
  volume   = 33,
  number   = 3,
  pages    = {65--72},
  month    = mar,
  year     = 1998,
  abstract = {The Message-Passing Interface (MPI) is evaluated by rewriting message parallel programs for Householder reduction, matrix multiplication, and successive overrelaxation. The author concludes that MPI is a practical programming tool. It does, however, lack the elegance and security that can only be achieved by a parallel programming language.}
}


@Article{Iss98:cfd-precond,
  author   = {E. Issman},
  title    = {Non-overlapping preconditioners for a parallel implicit {Navier-Stokes} solver},
  journal  = {Future Generation Computer Systems},
  volume   = 13,
  number   = {4--5},
  pages    = {303--313},
  month    = mar,
  year     = 1998,
  abstract = {Parallel implicit iterative solution techniques are considered for application to a compressible hypersonic Navier-Stokes solver on unstructured meshes. The construction of parallel preconditioners with quasi-optimal convergence properties with respect to their serial counterpart is a key issue in the design of modern parallel implicit schemes. Two types of non-overlapping preconditioners are presented and compared. The first one is an additive Schwarz preconditioner requiring overlapping of the mesh and the second one is based on a Schur complement formulation. Both are using incomplete LU factorisation at the subdomain level but scale differently. Results are presented for computations on the Cray T3D under the message passing interface MPI.}
}


@Article{Bar98:migration,
  author   = {A. Barak},
  title    = {The {MOSIX} multicomputer operating system for high performance cluster computing},
  journal  = {Future Generation Computer Systems},
  volume   = 13,
  number   = {4--5},
  pages    = {361--372},
  month    = mar,
  year     = 1998,
  abstract = {The scalable computing cluster at Hebrew University consists of 88 Pentium II and Pentium-Pro servers that are connected by fast Ethernet and the Myrinet LANs. It is running the MOSIX operating system, an enhancement of BSD/OS with algorithms for adaptive resource sharing, that are geared for performance scalability in a scalable computing cluster. These algorithms use a preemptive process migration for load-balancing and memory ushering, in order to create a convenient multiuser time-sharing execution environment for HPC, particularly for applications that are written in PVM or MPI. This paper begins with a brief overview of MOSIX and its resource sharing algorithms. Then the paper presents the performance of these algorithms as well as the performance of several large-scale, parallel applications.}
}


@Article{Rei97:interop,
  author   = {A. Reinefeld},
  title    = {Communicating across parallel message-passing environments},
  journal  = {Journal of Systems Architecture},
  volume   = {44},
  number   = {3--4},
  pages    = {261--272},
  month    = dec,
  year     = {1997},
  abstract = {We present a small, extensible interface for the transparent communication between vendor-specific and standard message-passing environments. With only four new commands, existing parallel applications can make use of our PLUS communication interface, thereby allowing inter-process communication with other programming environments. Much effort has been spent in optimizing the communication speed across Internet and Intranet links. Our current implementation supports process communication between PVM, MPI, and PARIX. With only marginal additional effort, the interface can be adapted to support other message-passing environments as well.}
}

@Article{hom97:mpi-maxcup,
  author   = {S. Homer},
  title    = {Design and performance of parallel and distributed approximation algorithms for maxcut},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = 41,
  number   = 1,
  pages    = {48--61},
  month    = oct,
  year     = 1997,
  abstract = {We develop and experiment with a new parallel algorithm to approximate the maximum weight cut in a weighted undirected graph. Our implementation starts with the recent (serial) algorithm of Goemans and Williamson for this problem. We consider several different versions of this algorithm, varying the interior-point part of the algorithm in order to optimize the parallel efficiency of our method. Our work aims for an efficient, practical formulation of the algorithm with close-to-optimal parallelization. We analyze our parallel algorithm in the LogP model and predict linear speedup for a wide range of the parameters. We have implemented the algorithm using the message passing interface (MPI) and run it on several parallel machines. In particular, we present performance measurements on the IBM SP2, the Connection Machine CM5, and a cluster of workstations. We observe that the measured speedups are predicted well by our analysis in the LogP model. Finally, we test our implementation on several large graphs (up to 13,000 vertices), particularly on large instances of the Ising model.}
}

@Article{War:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {{ParaStation}: Efficient parallel computing by clustering workstations: Design and evaluation},
  journal  = {Journal of Systems Architecture},
  volume   = 44,
  number   = {3--4},
  pages    = {241--260},
  month    = dec,
  year     = 1997,
  abstract = {ParaStation is a communications fabric for connecting off-the-shelf workstations into a supercomputer. The fabric employs technology used in massively parallel machines and scales up to 4096 nodes. ParaStation's user-level message passing software preserves the low latency of the fabric by taking the operating system out of the communication path, while still providing full protection in a multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2 mu s and a sustained bandwidth of up to 15 Mbyte/s per channel, even with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.}
}

@Article{War98:mpi-cluster,
  author   = {T. M. Warschko},
  title    = {The {ParaStation} project: Using workstations as building blocks for parallel computing},
  journal  = {Information Sciences},
  volume   = {106},
  number   = {3--4},
  pages    = {277--292},
  month    = may,
  year     = {1998},
  abstract = {The ParaStation communication fabric provides a high-speed communication network with user-level access to enable efficient parallel computing on workstation clusters. The architecture, implemented on off-the-shelf workstations coupled by the ParaStation communication hardware, removes the kernel and common network protocols from the communication path while still providing full protection in a multiuser, multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. This allows porting a wide range of client/server and parallel applications to the ParaStation architecture. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2 mu s and a sustained bandwidth of up to 15 Mbyte/s per channel with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster. }
}

@Article{Dan98:mpi-scheduling,
  author   = {M. A. R. Dantas},
  title    = {Efficient scheduling of {MPI} applications on networks of workstations},
  journal  = {Future Generation Computer Systems},
  volume   = {13},
  number   = {6},
  pages    = {489--499},
  month    = may,
  year     = {1998},
  abstract = {The availability of a large number of workstations connected through a network can represent an attractive option for high-performance computing for many applications. The message-passing interface (MPI) software environment is an effort from many organisations to define a de facto message-passing standard. In other words, the original specification was not designed as a comprehensive parallel programming environment and some researchers agree that the standard should be preserved as simple and clean as possible. Nevertheless, a software environment such as MPI should have somehow a scheduling mechanism for the effective submission of parallel applications on network of workstations. This paper presents an alternative lightweight approach called Selective-MPI (S-MPI), which was designed to enhance the efficiency of the scheduling of applications on an MPI implementation environment.}
}

@Article{Cou98:mpi-c++,
  author   = {O. Coulaud},
  title    = {Para++: A high level {C++} interface for message passing},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = 51,
  number   = 1,
  pages    = {46--62},
  month    = may,
  year     = 1998,
  abstract = {This paper describes a high level C++ interface for message passing applications. Our interface is built on top of PVM and MPI. The two main contributions are to allow a quicker design of parallel applications without any important drop of performances. We introduce two levels of tasks and use C++ streams for communications. We also present a performance study over both PVM and MPI to show the overhead of our implementation. Finally, we detail two applications based on the heat equation to explain how Para++ can be used for SPMD and MPMD applications.}
}


@Article{Sal98:mpi-genetic,
  author   = {A. Salhi},
  title    = {Parallel implementation of a genetic-programming based tool for symbolic regression},
  journal  = {Information Processing Letters},
  volume   = 66,
  number   = 6,
  pages    = {299--307},
  month    = jun,
  year     = 1998,
  abstract = {We report on a parallel implementation of a tool for symbolic regression, the algorithmic mechanism of which is based on genetic programming, and communication is handled using MPI. The implementation relies on a random islands model (RIM), which combines both the conventional islands model where migration of individuals between islands occurs periodically and niching where no migration takes place. The system was designed so that the algorithm is synergistic with parallel/distributed architectures, and works to make use of processor time and minimum use of network bandwidth without complicating the sequential algorithm significantly. Results on an IBM SP2 are included.}
}

% NOTE(review): the original page range was 57--51, which is impossible
% (end before start). It is corrected here to 47--51 on the assumption that
% the first digit was mistyped; confirm against the publisher's record.
@Article{Har98:mpi-application,
  author   = {H. K. Harbury},
  title    = {Parallel computation for electronic waves in quantum corrals},
  journal  = {VLSI Design},
  volume   = 6,
  number   = {1--4},
  pages    = {47--51},
  year     = 1998,
  abstract = {Recent scanning tunneling microscopy (STM) studies on the (111) faces of noble metals have directly imaged electronic surface-confined states and dramatic standing-wave patterns have been observed [1,2]. We solve for the local density of electronic states in these ``leaky'' quantum corral confinement structures using a coherent elastic scattering theory. We seek solutions of the two-dimensional Schr{\"o}dinger equation compatible with non-reflecting boundary conditions which asymptotically satisfy the Sommerfeld radiation condition [11,14]. The large matrices generated by the discretization of realistic quantum corral structures require the use of sparse matrix methods. In addition, a parallel finite element solution was undertaken using the message passing interface standard (MPI) and the Portable, Extensible, Toolkit for Scientific Computation (PETSc) [5] for an efficient computational solution on both distributed and shared memory architectures. Our calculations reveal excellent agreement with the reported experimental dI/dV STM data.}
}

@Article{Jak98:mpi-application,
  author   = {U. Jakobus},
  title    = {Analysis of electromagnetic scattering problems by an iterative combination of {MoM} with {GMT} using {MPI} for the communication},
  journal  = {Microwave and Optical Technology Letters},
  volume   = {19},
  number   = {1},
  pages    = {1--4},
  month    = sep,
  year     = {1998},
  abstract = {A hybrid method is proposed combining the method of moments (MoM) with the generalized multipole technique (GMT) for the efficient analysis of electromagnetic radiation and scattering problems involving metallic as well as dielectric bodies. An iterative coupling scheme is applied so that only some small changes to the MoM and GMT formulations are required, making it very attractive for the combination of already existing MoM and GMT codes. During the iteration, the MoM and GMT processes are executed in parallel, and communication is done using the message-passing interface (MPI).}
}

@Article{Ril98:mpi-application,
  author   = {C. J. Riley},
  title    = {Distributed-memory computing with the {Langley Aerothermodynamic Upwind Relaxation Algorithm} ({LAURA})},
  journal  = {Advances in Engineering Software},
  volume   = 29,
  number   = {3--6},
  pages    = {317--324},
  month    = apr # "--" # jul,
  year     = 1998,
  abstract = {The Langley Aerothermodynamic Upwind Relaxation Algorithm (LAURA), a Navier-Stokes solver, has been modified for use in a parallel, distributed-memory environment using the Message-Passing Interface (MPI) standard. A standard domain decomposition strategy is used in which the computational domain is divided into subdomains with each subdomain assigned to a processor. Performance is examined on dedicated parallel machines and a network of desktop workstations. The effect of domain decomposition and frequency of boundary updates on performance and convergence is also examined for several realistic configurations and conditions typical of large-scale computational fluid dynamic analysis.}
}


% Wang (1998): parallel multigrid finite volume code for 3D thermal convection using MPI.
% The issue spans April--July, so the month is built by concatenating the standard month macros.
@Article{Wan98:mpi-application,
author = {P. Wang},
title = {Massively parallel finite volume computation of three-dimensional thermal convective flows},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {307--315},
month = apr # "--" # jul,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization are solved by a parallel multigrid method. A flexible parallel code has been implemented on distributed-memory systems, by using domain decomposition techniques and the MPI communication software. The code uses one-, two- or three-dimensional partition according to different geometries. It currently runs on the Intel Paragon, the Cray T3D, T3E, the IBM SP2 and the Beowulf systems, which can be ported easily to other parallel systems. A comparison of the wallclock time of the code between these systems is made, and code performances with respect to different numbers of processors are presented.}
}

% Danielson (1998): explicit dynamic finite element code written in FORTRAN 90 with MPI.
% The issue spans April--July, so the month is built by concatenating the standard month macros.
@Article{Dan98:mpi-application,
author = {K. T. Danielson},
title = {Nonlinear dynamic finite element analysis on parallel computers using {FORTRAN} 90 and {MPI}},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {179--186},
month = apr # "--" # jul,
abstract = {A nonlinear explicit dynamic finite element code for use on scalable computers is presented. The code was written entirely in FORTRAN 90, but uses MPI for all interprocessor communication. Although MPI is not formally a standard for FORTRAN 90, the code runs properly in parallel on CRAY T3E, IBM SP, and SGI ORIGIN 2000 computing systems. Issues regarding the installation, portability, and effectiveness of the FORTRAN 90-MPI combination on these machines are discussed. An algorithm that overlaps message passing and computations of the explicit finite element equations is also presented and evaluated. Several large-scale ground-shock analyses demonstrate the varying combined importance of load balance and interprocessor communication among the different computing platforms. The analyses were performed on only a few to hundreds of processors with excellent speedup and scalability.}
}


% Vatsa (1998): port of the TLNS3D CFD code to distributed computing with MPI.
% The issue spans April--July, so the month is built by concatenating the standard month macros.
% NOTE(review): this record has no pages field -- add it when the full reference is available.
@Article{Vat98:mpi-application,
author = {V. N. Vatsa},
title = {Viscous flow computations for complex geometries on parallel computers},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
month = apr # "--" # jul,
abstract = {A widely used computational fluid dynamics (CFD) code known as TLNS3D, which was developed for large, shared-memory computers, is ported to a distributed computing environment. An engineering approach is used here to parallelize this code so that minimal deviation from the original (non-parallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available processors. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified processors. The message passing interface (MPI) protocols are incorporated for data communication. Both synchronous and asynchronous communication modes have been incorporated. As the number of processors is increased, the asynchronous communication mode shows much better scalability and clearly outperforms the synchronous mode of communication.}
}

% Rivera-Gallego (1998): genetic algorithm for circulant Euclidean distance matrices, parallelized with MPI.
% "Euclidean" is brace-protected so sentence-casing styles keep its capital.
% NOTE(review): surname hyphen restored from the published form -- confirm against the journal record.
@Article{Riv98:mpi-application,
author = {W. Rivera-Gallego},
title = {A genetic algorithm for circulant {E}uclidean distance matrices},
journal = {Applied Mathematics and Computation},
year = 1998,
volume = 97,
number = {2--3},
pages = {197--208},
month = DEC,
abstract = {This paper presents a fast genetic algorithm to determine three-dimensional configurations of points that generate circulant Euclidean Distance Matrices (EDMs). A parallel implementation is possible by using the message passing interface (MPI) standard. In addition, theoretical results about the polyhedral structure of both the cone of circulant symmetric positive semidefinite matrices and the cone of circulant EDMs are introduced.}
}

% Adamidis (1998): GRISSLi Coupling Interface for coupled simulation, based on MPI.
% The issue spans August--September, so the month is built by concatenating the standard month macros.
% NOTE(review): volume was recorded as "80-1", read here as the double volume 80--81 -- confirm.
@Article{Ada98:mpi-application,
author = {P. Adamidis},
title = {Steel strip production --- a pilot application for coupled simulation with several calculation systems},
journal = {Journal of Materials Processing Technology},
year = 1998,
volume = {80--81},
pages = {330--336},
month = aug # "--" # sep,
abstract = {For the simulation of technological and natural processes in specific application domains, efficient calculation software solving differential equation systems on grid-based computational models is available, especially in the area of computer-aided engineering (CAE). To handle a so-called 'multiphysics' problem, for example the fluid flow and metal forming process in a twin-roll casting arrangement for steel strip production, several calculation systems usually have to be employed in a high-performance computing environment, e.g. on parallel computers. The GRISSLi Coupling Interface is a software tool facilitating the coupled computation based on the message passing standard MPI.}
}


% Dowd (1998): native ATM API and message passing libraries (including MPI) for cluster-based computing.
@article{Dow98:mpi-implementation,
  author   = {Dowd, P. W.},
  title    = {{BLAST}: broadband lightweight {ATM} secure transport for high-performance distributed computing},
  journal  = {Computer Communications},
  volume   = {21},
  number   = {12},
  pages    = {1040--1057},
  month    = aug,
  year     = {1998},
  abstract = {This paper investigates the use of ATM for cluster-based computing. The need for a native ATM API is discussed as well as the performance of message passing libraries (MPL) that are written to use such an API to exploit the advantages of a high-speed network for cluster-based computing. The MPLs offer a standard interface, such as PVM or MPI, and interoperate with existing TCP/IP- and UDP/IP-based versions in addition to the ATM API environment. The interoperability extensions made to two MPLs, MPI and Prowess, which allow a hybrid environment of both ATM and TCP-based legacy network technology will be described. Shared object space (SOS), an extension to the MPLs, is described that helps support the geographically distributed computing (GDC) environment through latency hiding. It allows a user to develop applications in a shared memory type of environment. The native ATM API which supports cluster-based computing is described in this paper. This API provides a reliable transport interface to the MPL which has been optimized for an ATM environment. The transport protocol is a low-state design that optimizes the performance based on the available bandwidth, buffer constraints, propagation delay characteristics and security requirements of a particular connection.}
}

% Kacsuk (1998): GRADE graphical environment for message passing programs; generates C+PVM, MPI support discussed.
% No month is recorded for this entry.
@article{Kac98:mpi-tool,
  author   = {Kacsuk, P.},
  title    = {{GRADE}: A graphical programming environment for multicomputers},
  journal  = {Computers and Artificial Intelligence},
  volume   = {17},
  number   = {5},
  pages    = {417--427},
  year     = {1998},
  abstract = {To provide high-level graphical support for developing message passing programs, an integrated programming environment (GRADE) is being developed. GRADE currently provides tools to construct, execute, debug, monitor and visualise message-passing based parallel programs. GRADE offers the programmer an integrated graphical user interface during the whole life-cycle of program development and provides high-level graphical programming abstraction mechanisms to construct parallel applications. The current version of GRADE can generate C+PVM code but there is no theoretical obstacle to extend it for supporting MPI [9] and FORTRAN. Those new features of the GRADE graphical environment are described in the paper that enhanced GRADE towards a professional parallel programming environment.}
}

% Rasch (1998): MPI parallelization of a code for general 6-dimensional integrals (6DIME).
@Article{Ras98:mpi-application,
author = {J. Rasch},
title = {6-dimensional integrals and supercomputers},
journal = {Computer Physics Communications},
year = 1998,
volume = 114,
number = {1--3},
pages = {378--384},
month = NOV,
abstract = {Recently, a numerical method has been developed for the evaluation of general 6-dimensional integrals (6DIME), which has been successfully applied to the study of (e,2e) and (gamma,2e) processes. Details of the parallelization of that code are given using MPI and the scaling behaviour with respect to the number of nodes is presented. Almost full load balancing is obtained. The method is extended to include two centre scattering problems.}
}

% Chung (1998): asynchronous load balancing with dynamic task migration, implemented in C and MPI.
@Article{Chu98:mpi-balancing,
author = {Y. Chung},
title = {An asynchronous algorithm for balancing unpredictable workload on distributed-memory machines},
journal = {ETRI Journal},
year = 1998,
volume = 20,
number = 4,
pages = {346--360},
month = DEC,
abstract = {It is challenging to parallelize problems with irregular computation and communication. In this paper, we propose an asynchronous algorithm for balancing unpredictable workload on distributed-memory machines. By using an initial workload estimate, we first partition the computations such that the workload is distributed evenly across the processors. In addition, we perform task migrations dynamically for adapting to the evolving workload. To demonstrate the usefulness of our load balancing strategy, we conducted experiments on an IBM SP2 and a Cray T3D. Experimental results show that our task migration strategy can balance unpredictable workload with little overhead. Our code using C and MPI is portable onto other distributed-memory machines.}
}

% Bertozzi (1999): development tools for the PAPRICA-3 system; the stochastic code optimizer runs in parallel with MPI.
@Article{Ber99:mpi-tools,
author = {M. Bertozzi},
title = {Tools for code optimization and system evaluation of the image processing system {PAPRICA-3}},
journal = {Journal of Systems Architecture},
year = 1999,
volume = 45,
number = {6--7},
pages = {519--542},
month = JAN,
abstract = {This paper presents the complex environment that was built to ease the prototyping of real-time applications on the PAPRICA-3 massively parallel system. Applications are developed in C++ using high level data types and the corresponding Assembly code is automatically created by a code generator. A stochastic code optimizer takes the assembly code and improves it according to a genetic approach; due to the high computational power required by this approach, the stochastic code optimizer was implemented with MPI and runs in parallel on a cluster of workstations. The availability of this complex environment allowed to test the performance of the system and to tune it according to some target applications before the actual development of the hardware. For this purpose a system-level simulator was also built to determine the number of clock cycles required to run a specific segment of code. The whole environment has been used to validate possible solutions for the hardware system and to develop, test, and tune several real-time image processing applications. The hardware system is now completely defined.}
}

% Lee (1999): SPMD parallelization of climate-chemistry modules using MPI on an IBM-SP2.
% NOTE(review): the key suffix is spelled "applicatin"; it is kept unchanged so existing citations still resolve.
@Article{Lee99:mpi-applicatin,
author = {P. C. S. Lee},
title = {On the parallelization of a global climate-chemistry modeling system},
journal = {Atmospheric Environment},
year = 1999,
volume = 33,
number = 4,
pages = {675--681},
month = FEB,
abstract = {Coupled climate-chemistry simulations are computationally intensive owing to the spatial and temporal scope of the problem. In global chemistry models, the time integrations encountered in the chemistry and aerosol modules usually comprise the major CPU consumption. Parallelization of these segments of the code can contribute to multifold CPU speed-ups with minimal modification of the original serial code. This technical note presents a single program-multiple data (SPMD) strategy applied to the time-split chemistry modules of a coupled climate - global tropospheric chemistry model. Latitudinal domain decomposition is adopted along with a dynamic load-balancing technique that uses the previous time-step's load/latitude estimates for distributing the latitude bands amongst the processors. The coupled model is manually parallelized using the Message Passing Interface standard (MPI) on a distributed memory platform (IBM-SP2). Load-balancing efficiencies and the associated MPI overheads are discussed. Overall speed-ups and efficiencies are also calculated for a series of runs employing up to eight processors.}
}

% May (1999): glass melting furnace model whose subdomains are linked through the Message Passing Interface library.
@Article{May99:mpi-application,
author = {F. May},
title = {Mathematical modelling of glass melting furnace design with regard to {NOx} formation},
journal = {Glastechnische Berichte-Glass Science and Technology},
year = 1999,
volume = 72,
number = 1,
pages = {1--6},
month = JAN,
abstract = {A three-dimensional mathematical model for turbulent flow and combustion on the basis of turbulence/chemistry interactions and radiative heat transfer taking into account spectral effects of surrounding walls and combustion gases is described. For this the transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the Message Passing Interface library. Computed distributions of velocity, temperature, and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NOx concentration distributions are presented.}
}

% Reuther (1999): adjoint-based aerodynamic shape optimization parallelized with domain decomposition and MPI.
% The issue spans May--June, so the month is built by concatenating the standard month macros.
@Article{Reu99:mpi-application,
author = {J. Reuther},
title = {Aerodynamic shape optimization of supersonic aircraft configurations via an adjoint formulation on distributed memory parallel computers},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {675--700},
month = may # "--" # jun,
abstract = {This work describes the application of a control theory-based aerodynamic shape optimization method to the problem of supersonic aircraft design. A high fidelity computational fluid dynamics (CFD) algorithm modelling the Euler equations is used to calculate the aerodynamic properties of complex three-dimensional aircraft configurations. The design process is greatly accelerated through the use of both control theory and parallel computing. Control theory is employed to derive the adjoint differential equations whose solution allows for the evaluation of design gradient information at a fraction of the computational cost required by previous design methods. The resulting problem is then implemented in parallel using a domain decomposition approach, an optimized communication schedule, and the Message Passing Interface (MPI) Standard for portability and efficiency. In our earlier studies, the serial implementation of this design method, was shown to be effective for the optimization of airfoils, wings, wing-bodies, and complex aircraft configurations using both the potential equation and the Euler equations. In this work, our concern will be to extend the methodologies such that the combined capabilities of these new technologies can be used routinely and efficiently in an industrial design environment. The aerodynamic optimization of a supersonic transport configuration is presented as a demonstration test case of the capability. A particular difficulty of this test case is posed by the close coupling of the propulsion/airframe integration.}
}

% Vatsa (1999): parallelization of the multiblock Navier-Stokes code TLNS3D using PVM and MPI.
% The issue spans May--June, so the month is built by concatenating the standard month macros.
% NOTE(review): volume set to 28 to agree with the other 1999 "Computers and Fluids" number 4--5 entry in this file (it was 38) -- confirm.
@Article{Vat99:mpi-application,
author = {V. N. Vatsa},
title = {Parallelization of a multiblock flow code: an engineering implementation},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {603--614},
month = may # "--" # jun,
abstract = {Current trends in computer hardware are dictating a gradual shift toward the use of clusters of relatively inexpensive but powerful workstations, or massively parallel processing (MPP) machines, for scientific computing. However, most computational fluid dynamics (CFD) codes in use today were developed for large, shared-memory machines and are not readily portable to the distributed computing environment. One major hurdle in porting CFD codes to distributed computing platforms is the difficulty encountered in partitioning the problem so that the computation-to-communication ratio for each compute node (process) is maximized and the idle time during which one node waits for other nodes to transfer data is minimized. In the present work, pertinent issues involved in the parallelization of a widely used multiblock Navier-Stokes code TLNS3D are discussed. An engineering approach is used here to parallelize this code so that minimal deviation from the original (nonparallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available nodes. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified nodes. Both parallel Virtual machine (PVM) and message passing interface (MPI) protocols are incorporated for data communication to allow maximum portability to a wide range of computer configurations. Results are presented that are comparable with a priori estimates of performance for distributed computing and that are competitive in terms of central processing unit (CPU) time and wall time usage with large, shared-memory supercomputers.}
}

% Dzwinel (1999): particle-based visual clustering tested in PVM, MPI and data parallel environments.
% NOTE(review): "10(4)" in the source record is read as the exponent 10^4 -- confirm against the paper.
@Article{Dzw99:mpi-application,
author = {W. Dzwinel},
title = {Method of particles in visual clustering of multi-dimensional and large data sets},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = 3,
pages = {365--379},
month = APR,
abstract = {A method dedicated for visual clustering of N-dimensional data sets is presented. It is based on the classical feature extraction technique - the Sammon's mapping. This technique empowered by a particle approach used in the Sammon's criterion minimization makes the method more reliable, general and efficient. To show its reliability, the results of tests are presented, which were made to exemplify the algorithm 'immunity' from data errors. The general character of the method is emphasized and its role in multicriterial analysis discussed. Due to inherent parallelism of the methods, which are based on the particle approach, the visual clustering technique can be implemented easily in parallel environment. It is shown that parallel realization of the mapping algorithm enables the visualization of data sets consisting of more than $10^{4}$ multi-dimensional data points. The method was tested in the PVM, MPI and data parallel environments on an HP/Convex SPP/1600. In this paper, the authors compare the parallel algorithm performance for these three interfaces. The approach to visual clustering, presented in the paper, can be used in visualization and analysis of large multi-dimensional data sets.}
}

% Wang (1999): parallel multigrid finite volume code for 3D thermal convection using MPI.
% NOTE(review): "10(7)" in the source record is read as the exponent 10^7 -- confirm against the paper.
@Article{Wan99:mpi-application,
author = {P. Wang},
title = {Parallel multigrid finite volume computation of three-dimensional thermal convection},
journal = {Computers and Mathematics with Applications},
year = 1999,
volume = 37,
number = 9,
pages = {49--60},
month = MAY,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization, including a pressure equation which consumes most of the computation time, are solved by a parallel multigrid method. A flexible parallel code has been implemented on the Intel Paragon, the Cray T3D, and the IBM SP2 by using domain decomposition techniques and the MPI communication software. The code can use 1D, 2D, or 3D partitions as required by different geometries, and is easily ported to other parallel systems. Numerical solutions for air (Prandtl number Pr = 0.733) with various Rayleigh numbers up to $10^{7}$ are discussed.}
}


% Barnard (1999): spai-1.1, an MPI implementation of the SPAI preconditioner, tested on a Cray T3E.
% The month field holds the season label of the issue, stored as a literal string.
@Article{Bar99:mpi-application,
author = {S. T. Barnard},
title = {An {MPI} implementation of the {SPAI} preconditioner on the {T3E}},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 2,
pages = {107--123},
month = {Summer},
abstract = {The authors describe and test spai-1.1, a parallel MPI implementation of the sparse approximate inverse (SPAI) preconditioner. They show that SPAI can be very effective for solving a set of very large and difficult problems on a Cray T3E. The results clearly show the value of SPAI (and approximate inverse methods in general) as the Viable alternative to ILU-type methods when facing very large and difficult problems. The authors strengthen this conclusion by showing that spai-1.1 also has very good scaling behavior.}
}

% Reeve (1999): parallel Householder reduction and QL diagonalisation implemented in C with MPI.
% NOTE(review): the scaling expressions in the abstract ("N-3/P", "N(2)log(2)(P)", "N-2") look like garbled
% super/subscripts; they are left as recorded because the intended formulas cannot be confirmed from this file.
@Article{Ree99:mpi-application,
author = {J. S. Reeve},
title = {An efficient parallel version of the {Householder-QL} matrix diagonalisation algorithm},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 3,
pages = {311--319},
month = MAR,
abstract = {In this paper we report an effective parallelisation of the Householder routine for the reduction of a real symmetric matrix to tri-diagonal form and the QL algorithm for the diagonalisation of the resulting matrix. The Householder algorithm scales like alpha N-3/P + beta N(2)log(2)(P) and the QL algorithm like gamma N-2 + delta N-3/P as the number of processors P is increased for fixed problem size. The constant parameters alpha, beta, gamma and delta are obtained empirically. When the eigenvalues only are required the Householder method scales as above while the QL algorithm remains sequential. The code is implemented in c in conjunction with the message passing interface (MPI) libraries and verified on a sixteen node IBM SP2 and for real matrices that occur in the simulation of properties of crystalline materials.}
}

% Gennaro (1999): parallel Mean Value Analysis on a distributed memory machine using the MPI library.
@Article{Gen99:mpi-application,
author = {C. Gennaro},
title = {Parallelising the Mean Value Analysis algorithm},
journal = {Transactions of the Society for Computer Simulation International},
year = 1999,
volume = 16,
number = 1,
pages = {16--22},
month = MAR,
abstract = {The Mean Value Analysis (MVA) algorithm is one of the most popular for evaluating the performance of separable (or product-form) queueing networks. Although its complexity is modest when jobs are indistinguishable, the introduction of different customer classes rapidly increases its computational cost. The problems of parallelising the algorithm while retaining its conceptual simplicity are examined. In particular, a parallel implementation of MVA on a distributed memory machine is developed using the MPI library for communication.}
}

% Blelloch (1999): parallel Delaunay algorithm implemented in C with an MPI-based toolkit.
% The issue spans July--August, so the month is built by concatenating the standard month macros.
@Article{Ble99:mpi-application,
author = {G. E. Blelloch},
title = {Design and implementation of a practical parallel {D}elaunay algorithm},
journal = {Algorithmica},
year = 1999,
volume = 24,
number = {3--4},
pages = {243--269},
month = jul # "--" # aug,
abstract = {Initial experiments using a variety of distributions showed that our parallel algorithm was within a factor of 2 in work from the best sequential algorithm. Based on these promising results, the algorithm was implemented using C and an MPI-based toolkit. Compared with previous work, the resulting implementation achieves significantly better speedups over good sequential code, does not assume a uniform distribution of points, and is widely portable due to its use of MPI as a communication mechanism. Results are presented for the IBM SP2, Cray T3D, SGI Power Challenge, and DEC AlphaCluster.}
}

% Coelho (1999): utility boiler simulation parallelized with spatial domain decomposition and MPI.
@Article{Coe99:mpi-application,
author = {P. J. Coelho},
title = {Modelling of a utility boiler using parallel computing},
journal = {Journal of Supercomputing},
year = 1999,
volume = 13,
number = 2,
pages = {211--232},
month = MAR,
abstract = {A mathematical model for the simulation of the turbulent reactive flow and heat transfer in a power station boiler has been parallelized. The mathematical model is based on the numerical solution of the governing equations for mass, momentum, energy and transport equations for the scalar quantities. The k-epsilon model and the conserved scalar/prescribed probability density function formalism are employed. Radiative heat transfer is calculated using the discrete ordinates method. The code has been fully parallelized using the spatial domain decomposition approach and MPI. Calculations were performed using an IBM-SP2. It is shown that the computational requirements are reduced and the parallel efficiency increases if the mean temperature and density are calculated a priori, and stored. The role of the different parts of the code on the parallel performance is discussed. A speedup of 5.9 is achieved using 8 processors.}
}

% Russ (1999): the Hector system for running MPI programs on networked workstations.
% NOTE(review): journal name written with its colon ("Concurrency: Practice and Experience") -- confirm against the journal record.
@Article{Rus99:mpi-cluster,
author = {S. H. Russ},
title = {Using {Hector} to run {MPI} programs over networked workstations},
journal = {Concurrency: Practice and Experience},
year = 1999,
volume = 11,
number = 4,
pages = {189--204},
month = APR,
abstract = {Networked workstations represent an increasingly popular distributed platform for running large parallel programs. They can present a low-cost alternative to purchasing supercomputer time or additional usable computational capability. Several capabilities are desirable in order to harness workstations, including support for a widely accepted parallel programming environment, task migration, intelligent resource allocation, fault tolerance, and totally transparent support of these features. The Hector system is designed to provide these capabilities to MPI programs. The structure of the system and experiences using the system on loaded workstations to run scientific codes are described.}
}

% Rossi (1999): PSCR parallel fast direct solver for block tridiagonal systems, implemented with MPI.
% The title and journal values were stored in each other's fields; they are placed correctly here.
@Article{Ros99:mpi-tool,
author = {T. Rossi},
title = {A parallel fast direct solver for block tridiagonal systems with separable matrices of arbitrary dimension},
journal = {SIAM Journal on Scientific Computing},
year = 1999,
volume = 20,
number = 5,
pages = {1778--1796},
month = MAY,
abstract = {A parallel fast direct solution method for linear systems with separable block tridiagonal matrices is considered. Such systems appear, for example, when discretizing the Poisson equation in a rectangular domain using the five-point finite difference scheme or the piecewise linear finite elements on a triangulated, possibly nonuniform rectangular mesh. The method under consideration has the arithmetical complexity O(N log N), and it is closely related to the cyclic reduction method, but instead of using the matrix polynomial factorization, the so-called partial solution technique is employed. Hence, in this paper, the method is called the partial solution variant of the cyclic reduction method (PSCR method). The method is presented and analyzed in a general radix-q framework and, based on this analysis, the radix-4 variant is chosen for parallel implementation using the MPI standard. The generalization of the method to the case of arbitrary block dimension is described. The numerical experiments show the sequential efficiency and numerical stability of the PSCR method compared to the well-known BLKTRI implementation of the generalized cyclic reduction method. The good scalability properties of the parallel PSCR method are demonstrated in a distributed-memory Cray T3E-750 computer.}
}

% Boulet (1999): static tiling for different-speed processors, demonstrated by MPI experiments on a workstation network.
@Article{Bou99:mpi-algorithm,
author = {P. Boulet},
title = {Static tiling for heterogeneous computing platforms},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 5,
pages = {547--568},
month = MAY,
abstract = {In the framework of fully permutable loops, tiling has been extensively studied as a source-to-source program transformation. However, little work has been devoted to the mapping and scheduling of the tiles on physical processors. Moreover, targeting heterogeneous computing platforms has to the best of our knowledge, never been considered. In this paper we extend static tiling techniques to the context of limited computational resources with different-speed processors. In particular, we present efficient scheduling and mapping strategies that are asymptotically optimal. The practical usefulness of these strategies is fully demonstrated by MPI experiments on a heterogeneous network of workstations.}
}

% Rosenblum (1999): MPI-parallel molecular dynamics of diamond using the Brenner potential.
@Article{Ros99:mpi-application,
author = {I. Rosenblum},
title = {Multi-processor molecular dynamics using the {Brenner} potential: Parallelization of an implicit multi-body potential},
journal = {International Journal of Modern Physics C},
year = 1999,
volume = 10,
number = 1,
pages = {189--203},
month = FEB,
abstract = {We present computational aspects of Molecular Dynamics calculations of thermal properties of diamond using the Brenner potential. Parallelization was essential in order to carry out these calculations on samples of suitable sizes. Our implementation uses MPI on a multi-processor machine such as the IBM SP2. Three aspects of parallelization of the Brenner potential are discussed in depth. These are its long-range nature, the need for different parallelization algorithms for forces and neighbors, and the relative expense of force calculations compared to that of data communication. The efficiency of parallelization is presented as a function of different approaches to these issues as well as of cell size and number of processors employed in the calculation. In the calculations presented here, information from almost half of the atoms were needed by each processor even when 16 processors were used. This made it worthwhile to avoid unnecessary complications by making data from all atoms available to all processors. Superlinear speedup was achieved for four processors (by avoiding paging) with 512 atom samples, and 5ps long trajectories were calculated (for 5120 atom samples) in 53 hours using 16 processors; 514 hours would have been needed to complete this calculation using a serial program. Finally, we discuss and make available a set of routines that enable MPI-based codes such as ours to be debugged on scalar machines.}
}

% Luo (1999): COMOPS benchmark comparison of MPI and shared memory on four shared memory platforms.
% NOTE(review): the key suffix is spelled "comparision"; it is kept unchanged so existing citations still resolve.
@article{Luo99:mpi-comparision,
  author   = {Luo, Y.},
  title    = {Shared memory vs. message passing: the {COMOPS} benchmark experiment},
  journal  = {Journal of Supercomputing},
  volume   = {13},
  number   = {3},
  pages    = {283--301},
  month    = may,
  year     = {1999},
  abstract = {This paper presents the comparison of the COMOPS benchmark performance in MPI and shared memory on four different shared memory platforms: the DEC AlphaServer 8400/300, the SGI Power Challenge, the SGI Origin2000, and the HP-Convex Exemplar SPP1600. The paper also qualitatively analyzes the obtained performance data based on an understanding of the corresponding architecture and the MPI implementations. Some conclusions are made for the inter-processor communication performance on these four shared memory platforms.}
}


% Hioki (1999): QCDimMPI, a Fortran77 + MPI simulation code for pure SU(3) gauge theory.
% No issue number is recorded for this entry.
@article{Hio99:mpi-application,
  author   = {Hioki, S.},
  title    = {{QCDimMPI: MPI} code for {QCD} with an improved action},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  volume   = {73},
  pages    = {895--897},
  month    = mar,
  year     = {1999},
  abstract = {QCDimMPI[I] is a simulation code for pure SU(3) gauge theory with an improved action consisting of 1 x 1 and 2 x 1 plaquettes. It uses Fortran77 and the Message Passing Interface Standard, MPI[2]. QCDimMPI is an extended version of QCDMPI. It is portable, allows simulations in any number of dimensions, on any number of processors, and with arbitrary dimensional partitioning. It requires a rather small working area, and yields excellent performance on single processor computers and a wide variety of parallel computers which support MPI. The program provides information on link update time and communications time. In this paper, an outline of QCDimMPI is given, and benchmark results on several parallel computers are reported.}
}


% Goller (1999): SAR image processing algorithms parallelized mainly with MPI.
% No month is recorded for this entry.
@Article{Gol99:mpi-application,
author = {A. Goller},
title = {Parallel processing strategies for large {SAR} image data sets in a distributed environment},
journal = {Computing},
year = 1999,
volume = 62,
number = 4,
pages = {277--291},
abstract = {Key algorithms like image matching and Shape-from-Shading were parallelized mainly using MPI, and ported onto suitable computer architectures. Our experiments showed that all algorithms perform well, and they further proved the concept of CDIP to be beneficial: Usability of all integrated algorithms was significantly improved, mainly due to less user-centered network traffic, simple access to supercomputers, the creation of method sequences, and easy-to-use and well maintained algorithms.}
}

% Chien (1999): HPVM-based Windows NT cluster offering MPI among its standard APIs.
@Article{Chi99:mpi-implementation,
  author   = {A. Chien},
  title    = {Design and evaluation of an {HPVM}-based {Windows NT} supercomputer},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 1999,
  volume   = 13,
  number   = 3,
  pages    = {201--219},
  month    = {Fall},
  abstract = {We describe the design and evaluation of a 192-processor Windows NT cluster for high performance computing based on the High Performance Virtual Machine (HPVM) communication suite. While other clusters have been described in the literature, building a 58 GFlop/s NT cluster to be used as a general-purpose production machine for NCSA required solving new problems. The HPVM software meets the challenges represented by the large number of processors, the peculiarities of the NT operating system, the need for a production-strength job submission facility and the requirement for mainstream programming interfaces. First, HPVM provides users with a collection of standard APIs like MPI, Shmem, Global Arrays with supercomputer class performance (13 $\mu$s minimum latency, 84 MB/s peak bandwidth for MPI), efficiently delivering Myrinet's hardware performance to application programs. Second, HPVM provides cluster management and scheduling (through integration with Platform Computing's LSF). Finally, HPVM addresses Windows NT's remote access problem, providing convenient remote access and job control (through a graphical Java-applet front-end). Given the production nature of the cluster, the performance characterization is largely based on a sample of the NCSA scientific applications the machine will be running. The side-by-side comparison with other present-generation NCSA supercomputers shows the cluster to be within a factor of 2 to 4 of the SGI Origin 2000 and Cray T3E performance at a fraction of the cost. The inherent scalability of the cluster design produces a comparable or better speedup than the Origin 2000 despite a limitation in the HPVM flow control mechanism.}
}


% Rossi (1999): parallel fictitious domain method run on a Cray T3E using MPI.
% The month is built from the standard macros; a bare JAN-FEB token is read as
% a single undefined macro and produces an empty month.
@Article{Ros99:mpi-tools,
  author   = {T. Rossi},
  title    = {Parallel fictitious domain method for a non-linear elliptic {Neumann} boundary value problem},
  journal  = {Numerical Linear Algebra with Applications},
  year     = 1999,
  volume   = 6,
  number   = 1,
  pages    = {51--60},
  month    = jan # "--" # feb,
  abstract = {Parallelization of the algebraic fictitious domain method is considered for solving Neumann boundary value problems with variable coefficients. The resulting method is applied to the parallel solution of the subsonic full potential flow problem which is linearized by the Newton method. Good scalability of the method is demonstrated on a Cray T3E distributed memory parallel computer using MPI in communication.}
}


% Zaki (1999): Jumpshot, a Java performance visualization tool that ships with an MPI profiling library.
@Article{Zak99:mpi-tools,
  author   = {O. Zaki},
  title    = {Toward scalable performance visualization with {Jumpshot}},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 1999,
  volume   = 13,
  number   = 3,
  pages    = {277--288},
  month    = {Fall},
  abstract = {Jumpshot is a graphical tool for understanding the performance of parallel programs. It is in the tradition of the upshot tool but contains a number of extensions and enhancements that make it suitable for large-scale parallel computations. Jumpshot takes as input a new, more flexible logfile format and comes with a library for generating such logfiles. An MPI profiling library is also included, enabling the automatic generation of such logfiles from MPI programs. Jumpshot is written in Java and can easily be integrated as an applet into browser-based computing environments. The most novel feature of Jumpshot is its automatic detection of anomalous durations, drawing the user's attention to problem areas in a parallel execution. This capability is particularly useful in large-scale parallel computations containing many events.}
}

% Bergeron and Vincent (1999): real-time particle transport solver parallelized with MPI on Ethernet-connected workstations.
@Article{BegVin99:transport,
  author   = {S. Bergeron and A. Vincent},
  title    = {Implementation strategies for real-time particle transport solver},
  journal  = {Computer Physics Communications},
  year     = 1999,
  volume   = 120,
  number   = {2--3},
  month    = aug,
  pages    = {177--184},
  abstract = {Many problems in physics and engineering involve the transport of solid particles in a turbulent field. In some cases, it is desirable to study the transport of those particles in ``real time''. The prediction of erosion in the rotating part of hydraulic turbines is such a problem. This paper presents a semi-analytic predictor-corrector scheme adapted to the case of a rotating frame of reference. Simplification, related to the interpolation scheme required, is discussed as well as a parallel implementation using MPI on 10Base-T Ethernet interconnected workstations. The 3D solver is coupled with a high performance visualization software. Performance then shows a quasi-linear speedup.}
}



% Brune, Fagg, and Resch (1999): message-passing environments (PACX-MPI, PVAMPI, PLUS) for metacomputing.
@Article{BruFagRes99:meta,
  author   = {M. A. Brune and G. E. Fagg and M. M. Resch},
  title    = {Message-passing environments for metacomputing},
  journal  = {Future Generation Computer Systems},
  year     = 1999,
  volume   = 15,
  number   = {5--6},
  month    = oct,
  pages    = {699--712},
  abstract = {The PACX-MPI approach offers a transparent interface for the communication between two or more MPI environments. PVAMPI allows the user spawning parallel processes under the MPI environment. The PLUS protocol bridges the gap between vendor-specific (e.g., MPL, NX, and PARIX) and vendor-independent message-passing environments (e.g., PVM and MPI). Moreover, it offers the ability to create and control processes at application runtime.}
}

% Resch, Rantzau, and Stoy (1999): transatlantic metacomputing test-bed using an MPI library that supports metacomputing.
@Article{ResRanSto99:meta,
  author   = {M. M. Resch and D. Rantzau and R. Stoy},
  title    = {Metacomputing experience in a transatlantic wide area application test-bed},
  journal  = {Future Generation Computer Systems},
  year     = 1999,
  volume   = 15,
  number   = {5--6},
  month    = oct,
  pages    = {807--816},
  abstract = {In the frame of a G7 initiative the High Performance Computing Center Stuttgart (HLRS) together with the Pittsburgh Supercomputing Center (PSC) and Sandia National Laboratories (SNL) has set up a transatlantic wide area application test-bed in 1997. A dedicated ATM-Link was installed that connected German research networks to vBNS and ESnet. During 1 year this test-bed was extensively used for metacomputing and collaborative working. Two applications - one from computational fluid dynamics and one from molecular dynamics - were adapted and run on the test-bed. For message-passing an MPI library was implemented that supports metacomputing. An already existing software for collaborative visualization was adapted for that scenario. This article describes the technical background of the cooperation, results that have been achieved for the two applications so far and lessons that have been learned. Special emphasis will be given to future work planned.}
}


% Thomas, Desgagne, and Benoit (1999): MC2 atmospheric model on NEC SX-4 SMP clusters with internode MPI.
@Article{Tho99:mpi-application,
  author   = {S. J. Thomas and M. Desgagne and R. Benoit},
  title    = {A real-time {North American} forecast at 10-km resolution with the {Canadian} {MC2 Meso-LAM}},
  journal  = {Journal of Atmospheric and Oceanic Technology},
  year     = 1999,
  volume   = 16,
  number   = 8,
  pages    = {1092--1101},
  month    = aug,
  abstract = {The next generation of high-performance computers will be based on clusters of shared-memory symmetric multiprocessor (SMP) nodes interconnected by a low-latency, high-bandwidth network. In this paper, the parallel performance of the nonhydrostatic Mesoscale Compressible Community (MC2) limited-area atmospheric model on clusters of NEC SX-4 symmetric multiprocessor (SMP) nodes is presented. Several hybrid parallel-programming approaches are now possible with the SMP cluster SC-MC2 implementation based on internode MPI message-passing and intranode shared-memory tasking or threads. At total sustained execution rates of between 25 and 30 Gflop s$^{-1}$ on single-node or multinode clusters, it is now possible for the first time ever to generate a 24-48-h real-time weather forecast over North America at 10-km resolution.}
}


% Roda, Rodriguez, Morales, and Almeida (1999): BSP-derived models (BSPWB, MPM) oriented to MPI/PVM programming.
@Article{Rod99:mpi-evals,
  author   = {J. L. Roda and C. Rodriguez and D. G. Morales and E. Almeida},
  title    = {Predicting the execution time of message passing models},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 9,
  month    = aug,
  pages    = {461--477},
  abstract = {Recent publications prove that runtime systems oriented to the Bulk Synchronous Parallel Model usually achieve remarkable accuracy in their predictions. That accuracy can be seen in the capacity of the software for packing the messages generated during the superstep and their capability to find a rearrangement of the messages sent at the end of the superstep. Unfortunately, barrier synchronisation imposes some limits both in the range of available algorithms and in their performance. The asynchronous nature of many MPI/PVM programs makes their expression difficult or infeasible using a BSP oriented library. Through the generalisation of the concept of superstep we propose two extensions of the BSP model: the BSP Without Barriers (BSPWB) and the Message Passing Machine (MPM) models. These new models are oriented to MPI/PVM parallel programming. The parameters of the models and their quality are evaluated on four standard parallel platforms. The use of these BSP extensions is illustrated using the Fast Fourier Transform and the Parallel Sorting by Regular Sampling algorithms.}
}

% Lirkov and Margenov (1999): portable MPI code for CBF preconditioning of 3D elasticity problems.
@Article{Lir99:mpi-apps,
  author   = {I. Lirkov and S. Margenov},
  title    = {{MPI} parallel implementation of {CBF} preconditioning for {3D} elasticity problems},
  journal  = {Mathematics and Computers in Simulation},
  year     = 1999,
  volume   = 50,
  number   = {1--4},
  month    = nov,
  pages    = {247--254},
  abstract = {New construction of a parallel algorithm for the discussed preconditioning method is proposed. The theoretical part of this study includes analysis of the execution time on various parallel architectures and asymptotic estimates of the parallel speedup and the parallel efficiency. The parallel performance estimates indicate that the proposed algorithm will be especially efficient on coarse-grain parallel systems, which is also confirmed by the numerical experiments. A portable MPI parallel code is developed. Numerical tests on three symmetric multiprocessor systems: SUN Enterprise 3000, SUN SPARCstation 10 and Origin 2000 are presented. The reported speedup and parallel efficiency illustrate well the features of the proposed method and its implementation.}
}

% Deng and Xie (1999): MCNP Monte Carlo transport code parallelized with PVM and MPI.
% NOTE(review): this entry has no pages field -- TODO supply it from the journal.
@Article{den99:mpi-app,
  author   = {L. Deng and Z. S. Xie},
  title    = {Parallelization of {MCNP} {Monte Carlo} neutron and photon transport code in parallel virtual machine and message passing interface},
  journal  = {Journal of Nuclear Science and Technology},
  year     = 1999,
  volume   = 36,
  number   = 7,
  month    = jul,
  abstract = {The coupled neutron and photon transport Monte Carlo code MCNP (version 3B) has been parallelized in parallel virtual machine (PVM) and message passing interface (MPI) by modifying a previous serial code. The new code has been verified by serving sample problems. The speedup increases linearly with the number of processors and the average efficiency is up to 99\% for 12-processor.}
}

% Arpe and Roeckner (1999): simulations of the hydrological cycle over Europe.
% NOTE(review): "MPI" in this abstract names an atmospheric circulation model and
% may not mean Message Passing Interface -- confirm this entry belongs in the list.
@Article{Arp99:mpi-app,
  author   = {K. Arpe and E. Roeckner},
  title    = {Simulation of the hydrological cycle over {Europe}: Model validation and impacts of increasing greenhouse gases},
  journal  = {Advances in Water Resources},
  year     = 1999,
  volume   = 23,
  number   = 2,
  month    = oct,
  pages    = {105--119},
  abstract = {Different methods of estimating precipitation area means, based on observations, are compared with each other to investigate their usefulness for model validation. For the applications relevant to this study the ECMWF reanalyses provide a good and comprehensive data set for validation. The uncertainties of precipitation analyses, based on observed precipitation or from numerical weather forecasting schemes, are generally in the range of 20\% but regionally much larger. The MPI atmospheric general circulation model is able to reproduce long term means of the main features of the hydrological cycle within the range of uncertainty of observational data, even for relatively small areas such as the Rhine river basin. Simulations with the MPI coupled general circulation model, assuming a further increase of anthropogenic greenhouse gases, show clear trends in temperature and precipitation for the next century which would have significant implications for human activity, e.g. a further increase of the sea level of the Caspian Sea and less water in the Rhine and the Danube. We have gained confidence in these results because trends in the temperature and precipitation in the coupled model simulations up to the present are partly confirmed by an atmospheric model simulation forced with observed SSTs and by observational data. We gained further confidence because the simulations with the same coupled model but using constant greenhouse gases do not show such trends. However, doubts arise from the fact that these trends are strong where the systematic errors of the model are large.}
}

% Yahagi, Mori, and Yoshii (1999): forest-method parallel tree code run with the Message Passing Interface.
@Article{Yah99:mpi-app,
  author   = {Y. Yahagi and M. Mori and Y. Yoshii},
  title    = {The forest method as a new parallel tree method with the sectional {Voronoi} tessellation},
  journal  = {Astrophysical Journal Supplement Series},
  year     = 1999,
  volume   = 124,
  number   = 1,
  month    = sep,
  pages    = {1--9},
  abstract = {We have developed a new parallel tree method which will be called the forest method hereafter. This new method uses the sectional Voronoi tessellation (SVT) for the domain decomposition. The SVT decomposes a whole space into polyhedra and allows their flat borders to move by assigning different weights. The forest method determines these weights based on the load balancing among processors by means of the overload diffusion (OLD). Moreover, since all the borders are flat, before receiving the data from other processors, each processor can collect enough data to calculate the gravity force with precision. Both the SVT and the OLD are coded in a highly vectorizable manner to accommodate on vector parallel processors. The parallel code based on the forest method with the Message Passing Interface is run on various platforms so that a wide portability is guaranteed. Extensive calculations with 15 processors of Fujitsu VPP300/16R indicate that the code can calculate the gravity force exerted on $10^{5}$ particles in each second for some ideal dark halo. This code is found to enable an N-body simulation with $10^{7}$ or more particles for a wide dynamic range and is therefore a very powerful tool for the study of galaxy formation and large-scale structure in the universe.}
}

% Tang, Shen, and Yang (1999): TMPI, compile-time and run-time support for threaded MPI execution.
@Article{tan99:mpi-impl,
  author   = {H. Tang and K. Shen and T. Yang},
  title    = {Compile/run-time support for threaded {MPI} execution on multiprogrammed shared memory machines},
  journal  = {ACM SIGPLAN Notices},
  year     = 1999,
  volume   = 34,
  number   = 8,
  month    = aug,
  pages    = {107--118},
  abstract = {MPI is a message-passing standard widely used for developing high-performance parallel applications. Because of the restriction in the MPI computation model, conventional implementations on shared memory machines map each MPI node to an OS process, which suffers serious performance degradation in the presence of multiprogramming, especially when a space/time sharing policy is employed in OS job scheduling. In this paper, we study compile-time and run-time support for MPI by using threads and demonstrate our optimization techniques for executing a large class of MPI programs written in C. The compile-time transformation adopts thread-specific data structures to eliminate the use of global and static variables in C code. The run-time support includes an efficient point-to-point communication protocol based on a novel lock-free queue management scheme. Our experiments on an SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and it has significant performance advantages with up to a 23-fold improvement in a multiprogrammed environment.}
}

% Kielmann, Hofman, Bal, Plaat, and Bhoedjang (1999): MAGPIE, collective communication operations for wide area systems.
@Article{kie99:mpi-collective,
  author   = {T. Kielmann and R. F. H. Hofman and H. E. Bal and A. Plaat and R. A. F. Bhoedjang},
  title    = {{MAGPIE: MPI}'s collective communication operations for clustered wide area systems},
  journal  = {ACM SIGPLAN Notices},
  year     = 1999,
  volume   = 34,
  number   = 8,
  month    = aug,
  pages    = {131--140},
  abstract = {Writing parallel applications for computational grids is a challenging task. To achieve good performance, algorithms designed for local area networks must be adapted to the differences in link speeds. An important class of algorithms are collective operations, such as broadcast and reduce. We have developed MAGPIE, a library of collective communication operations optimized for wide area systems. MAGPIE's algorithms send the minimal amount of data over the slow wide area links, and only incur a single wide area latency. Using our system, existing MPI applications can be run unmodified on geographically distributed systems. On moderate cluster sizes, using a wide area latency of 10 milliseconds and a bandwidth of 1 MByte/s, MAGPIE executes operations up to 10 times faster than MPICH, a widely used MPI implementation; application kernels improve by up to a factor of 4. Due to the structure of our algorithms, MAGPIE's advantage increases for higher wide area latencies.}
}


% Zhu and Petzold (1999): parallel DAE sensitivity analysis using MPI on an SGI Origin 2000.
@Article{zhu99:mpi-app,
  author   = {W. J. Zhu and L. Petzold},
  title    = {Parallel sensitivity analysis for {DAE}s with many parameters},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 10,
  month    = aug,
  pages    = {571--585},
  abstract = {In this paper, we discuss the parallel computation of the sensitivity analysis of systems of differential-algebraic equations (DAEs) with a moderate number of state variables and a large number of sensitivity parameters. Several parallel implementations based on DASSLSO are explored and their performance when using the Message Passing Interface (MPI) on an SGI Origin 2000 is compared.}
}

% Sundaram-Stukel and Vernon (1999): LogGP model of a wavefront application using MPI on the IBM SP/2.
@Article{Sun99:mpi-perf,
  author   = {D. Sundaram-Stukel and M. K. Vernon},
  title    = {Predictive analysis of a wavefront application using {LogGP}},
  journal  = {ACM SIGPLAN Notices},
  year     = 1999,
  volume   = 34,
  number   = 8,
  month    = aug,
  pages    = {141--150},
  abstract = {This paper develops a highly accurate LogGP model of a complex wavefront application that uses MPI communication on the IBM SP/2. Key features of the model include: (1) elucidation of the principal wavefront synchronization structure, and (2) explicit high-fidelity models of the MPI-send and MPI-receive primitives. The MPI-send/receive models are used to derive L, o, and G from simple two-node micro-benchmarks. Other model parameters are obtained by measuring small application problem sizes on four SP nodes. Results show that the LogGP model predicts, in seconds and with a high degree of accuracy, measured application execution time for large problems running on 128 nodes. Detailed performance projections are provided for very large future processor configurations that are expected to be available to the application developers. These results indicate that scaling beyond one or two thousand nodes yields greatly diminished improvements in execution time, and that synchronization delays are a principal factor limiting the scalability of the application.}
}

% Kimura and Takemiya (1999): fluid-structure coupled simulation across parallel computers using the Stampi library.
@Article{kimura99:mpi-app,
  author   = {T. Kimura and H. Takemiya},
  title    = {Distributed parallel computing for fluid structure coupled simulations on a heterogeneous parallel computer cluster},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 1999,
  volume   = 13,
  number   = 4,
  pages    = {320--333},
  abstract = {Distributed parallel computing for a fluid-structure coupled simulation has been performed on a heterogeneous parallel computer cluster. The fluid and the structure dynamics are simulated on different parallel computers connected by a high-speed local network. These dynamics are coupled by a loose coupling method exchanging the boundary data between the fluid and the structure domains through the network. The data communication among parallel computers is realized by using the new communication library, Stampi, which has been developed to enable communication in a heterogeneous environment. The performance evaluation on a heterogeneous parallel computer cluster has shown that the distributed parallel computing for fluid-structure coupled simulations has the advantage of increasing the performance compared with the parallel computing on a single parallel computer.}
}


% Morrow, Crookes, Brown, McAleese, Roantree, and Spence (1999): MPI-based parallel programming model for image processing.
@Article{morrow99:mpi-app,
  author   = {P. J. Morrow and D. Crookes and J. Brown and G. McAleese and D. Roantree and I. Spence},
  title    = {Efficient implementation of a portable parallel programming model for image processing},
  journal  = {Concurrency-Practice and Experience},
  year     = 1999,
  volume   = 11,
  number   = 11,
  month    = sep,
  pages    = {671--685},
  abstract = {This paper describes a domain specific programming model for execution on parallel and distributed architectures. The model has initially been targeted at the application area of image processing, though the techniques developed may be more generally applicable to other domains where an algebraic or library-based approach is common. Efficiency is achieved by the concept of a self-optimising class library of primitive image processing operations, which allows programs to be written in a high level, algebraic notation and which is automatically parallelised (using an application-specific data parallel approach). The class library is extended automatically with optimised operations, generated by a transformation system, giving improved execution performance. The parallel implementation of the model described here is based on MPI and has been tested on a C40 processor network, a quad-processor Unix workstation, and a network of PCs running Linux. Timings are included to indicate the impact of the automatic optimisation facility (rather than the effect of parallelisation).}
}


% Byrne and Hindmarsh (1999): PVODE, an ODE solver whose distributed version communicates through MPI.
@Article{byrne:mpi-app,
  author   = {G. D. Byrne and A. C. Hindmarsh},
  title    = {{PVODE}, an {ODE} solver for parallel computers},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 1999,
  volume   = 13,
  number   = 4,
  pages    = {354--365},
  abstract = {PVODE is a general-purpose solver for ordinary differential equation (ODE) systems that implements methods for both stiff and nonstiff systems. The code is designed for single-program multiple-data environments. It is written in ANSI standard C, with a highly modular structure. The version being distributed uses the message-passing interface (MPI) system for communication. In the stiff case, PVODE uses a backward differentiation formula method combined with preconditioned GMRES iteration. Parallelism is achieved by distributing the ODE solution vector into user-specified segments and parallelizing a set of vector kernels accordingly. For PDE-based ODE systems, we provide a module that generates a band block-diagonal preconditioner for use with the GMRES iteration. We also provide a set of interfaces to accommodate Fortran applications. The paper includes a stiff example problem and test results on a Cray-T3D with three different message-passing systems. PVODE is publicly available.}
}


% Coelho (1999): Part I of a utility boiler simulation parallelized with domain decomposition and the MPI library.
@Article{Coelho:mpi-app,
  author   = {P. J. Coelho},
  title    = {Parallel simulation of a utility boiler. Part {I}: Mathematical model and numerical solution method},
  journal  = {Communications in Numerical Methods in Engineering},
  year     = 1999,
  volume   = 15,
  number   = 10,
  month    = oct,
  pages    = {717--726},
  abstract = {A computer code for the modelling of turbulent reactive flows with heat transfer has been parallelized and applied to the simulation of a utility boiler. The code is based on the numerical solution of the density-weighted averaged form of the governing equations for mass, momentum and energy conservation, and transport equations for scalars associated with the turbulence and combustion models. The k-epsilon model and the chemical equilibrium approach are used. The turbulent fluctuations are accounted for in the calculation of the mean properties by means of a presumed joint probability density function for the mixture fraction and the fraction of radiative heat loss. The discrete ordinates method is used for radiation modelling. The governing equations are solved using the finite volume method. The parallelization is carried out using the domain decomposition approach and the message-passing MPI library. The paper is divided into two parts. This part is concerned with the description of the model and the parallel implementation, while the model evaluation and the analysis of the parallel performance are presented in Part II (pp. 727--736).}
}


% Torres and Coutsias (1999): pseudospectral 2D Navier-Stokes solver on a disk, parallelized with MPI.
@Article{Torres:mpi-app,
  author   = {D. J. Torres and E. A. Coutsias},
  title    = {Pseudospectral solution of the two-dimensional {N}avier-{S}tokes equations in a disk},
  journal  = {SIAM Journal on Scientific Computing},
  year     = 1999,
  volume   = 21,
  number   = 1,
  month    = sep,
  pages    = {378--403},
  abstract = {An efficient and accurate algorithm for solving the two-dimensional (2D) incompressible Navier-Stokes equations on a disk with no-slip boundary conditions is described. The vorticity-stream function formulation of these equations is used, and spatially the vorticity and stream functions are expressed as Fourier-Chebyshev expansions. The Poisson and Helmholtz equations which arise from the implicit-explicit time marching scheme are solved as banded systems using a post-conditioned spectral tau-method. The polar coordinate singularity is handled by expanding fields radially over the entire diameter using a parity modified Chebyshev series and building partial regularity into the vorticity. The no-slip boundary condition is enforced by transferring one of the two boundary conditions imposed on the stream function onto the vorticity via a solvability constraint. Significant gains in run times were realized by parallelizing the code in message passage interface (MPI).}
}


% Annamalai, Krishnamoorthy, and Kamakoti (1999): adaptive finite element analysis on workstation clusters over PVM and MPI.
@Article{Ann99:mpi-app,
  author   = {V. Annamalai and C. S. Krishnamoorthy and V. Kamakoti},
  title    = {Adaptive finite element analysis on a parallel and distributed environment},
  journal  = {Parallel Computing},
  year     = 1999,
  volume   = 25,
  number   = 12,
  month    = nov,
  pages    = {1413--1434},
  abstract = {Industries in general and automotive industries in particular, use Finite Element Analysis (FEA) for better solutions to the engineering problems they encounter. The reliability of the Finite Element method can be improved to a larger extent by Adaptive Finite Element Analysis (AFEA). As we look towards increasingly accurate solutions, the process becomes computationally intensive and requires parallel and economic high-performance scientific computing environments to solve them. In this paper we present a parallel implementation of AFEA on a cluster of workstations and illustrate its efficiency and scalability with examples. In this process, we have developed a user-friendly environment for Parallel Distributed computing which is portable on top of both Parallel Virtual Machine (PVM) and Message Passing Interface (MPI) message passing layers. We have addressed the issues of the several stages in AFEA from a parallel computing perspective that includes Domain decomposition, Parallel Mesh generation, Parallel Finite Element Analysis using a Substructuring technique and Load balancing.}
}


% Nagar, Banerjee, Sivasubramaniam, and Das (1999): scheduling strategies evaluated on a workstation network platform that provides MPI.
@Article{Nagar99:mpi-impl,
  author   = {S. Nagar and A. Banerjee and A. Sivasubramaniam and C. R. Das},
  title    = {Alternatives to coscheduling a network of workstations},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = 1999,
  volume   = 59,
  number   = 2,
  month    = nov,
  pages    = {302--327},
  abstract = {Efficient scheduling of processes on processors of a Network of Workstations (NOW) is essential for good system performance. However, the design of such schedulers is challenging because of the complex interaction between several system and workload parameters. Coscheduling, though desirable, is impractical for such a loosely coupled environment. Two operations, waiting for a message and arrival of a message, can be used to take remedial actions that can guide the behavior of the system toward coscheduling using local information. We present a taxonomy of three possibilities for each of these two operations, leading to a design space of 3x3 scheduling mechanisms. This paper presents an extensive implementation and evaluation exercise in studying these mechanisms. Adhering to the philosophy that scheduling and communication are intertwined and should be studied in conjunction, a complete communication substrate for UltraSPARC workstations, connected by Myrinet and running Solaris 2.5.1, has been developed. This platform provides the entire Message Passing Interface (MPI) to readily run off-the-shelf MPI applications by employing protected low-latency user-level messaging. Several applications can concurrently use this interface. This platform has been used to design, implement, and uniformly evaluate nine scheduling strategies with a mixture of concurrent real applications with varying communication intensities. This includes five new schemes (Periodic Boost, Periodic Boost with Spin Block, Spin Yield, Periodic Boost with Spin Yield, Dynamic Coscheduling with Spin Yield) that are presented in this paper. In addition to our evaluations of the pros and cons of each mechanism in terms of throughput, response time, CPU utilization, and Fairness, it is shown that Periodic Boost is a promising approach for scheduling processes on a NOW.}
}


% Lappa and Savino (1999): parallel solver for three-dimensional Marangoni flow using MPI for interprocessor communication.
@Article{Lappa99:mpi-app,
  author   = {M. Lappa and R. Savino},
  title    = {Parallel solution of three-dimensional {M}arangoni flow in liquid bridges},
  journal  = {International Journal for Numerical Methods in Fluids},
  year     = 1999,
  volume   = 31,
  number   = 6,
  month    = nov,
  pages    = {911--935},
  abstract = {This paper describes the implementation and performances of a parallel solver for the direct numerical simulation of the three-dimensional and time-dependent Navier-Stokes equations on distributed-memory, massively parallel computers. The feasibility of this approach to study Marangoni flow instability in half zone liquid bridges is examined. The results indicate that the incompressible, non-linear Navier-Stokes problem, governing the Marangoni flows behavior, can effectively be parallelized on a distributed memory parallel machine by remapping the distributed data structure. The numerical code is based on a three-dimensional Simplified Marker and Cell (SMAC) primitive variable method applied to a staggered finite difference grid. Using this method, the problem is split into two problems, one parabolic and the other elliptic. A parallel algorithm, explicit in time, is utilized to solve the parabolic equations. A parallel multisplitting kernel is introduced for the solution of the pseudo pressure elliptic equation, representing the most time-consuming part of the algorithm. A grid-partition strategy is used in the parallel implementations of both the parabolic equations and the multisplitting elliptic kernel. A Message Passing Interface (MPI) is coded for the boundary conditions; this protocol is portable to different systems supporting this interface for interprocessor communications. Numerical experiments illustrate good numerical properties and parallel efficiency. In particular, good scalability on a large number of processors can be achieved as long as the granularity of the parallel application is not too small. However, increasing the number of processors, the Speed-Up is ever smaller than the ideal linear Speed-Up. The communication timings indicate that complex practical calculations, such as the solutions of the Navier-Stokes equations for the numerical simulation of the instability of Marangoni flows, can be expected to run on a massively parallel machine with good efficiency.}
}

% Spectral (Fourier-Chebyshev) flow and heat-transfer solver parallelized with MPI and SHMEM.
@Article{hill99:mpi-app,
author = {R. W. Hill and K. S. Ball},
title = {Parallel implementation of a {F}ourier-{C}hebyshev collocation method for incompressible fluid flow and heat transfer},
journal = {Numerical Heat Transfer Part B},
year = 1999,
volume = 36,
number = 3,
month = OCT # "--" # NOV,
pages = {309--329},
abstract = {A Fourier-Chebyshev collocation spectral method is parallelized to simulate the three-dimensional unsteady flow and heat transfer inside a cylindrical enclosure. Two solution approaches using different techniques for determining the pressure field and enforcing mass conservation are presented for shared memory applications using Cray directives and for distributed memory applications using MPI and SHMEM message passing libraries. Matrix diagonalization is employed for solving the pressure Poisson equation and Helmholtz equations for the velocity components and temperature. The parallelization approach is described and scaling results are presented for both platform types.}
}


% MPOOL: an object-oriented extension layered on the MPI library.
@Article{poggi:mpi-extension,
author = {A. Poggi and G. Destri},
title = {{MPOOL}: an object-oriented library for task composition and co-ordination},
journal = {Concurrency-Practice and Experience},
year = 1999,
volume = 11,
number = 14,
month = DEC,
pages = {835--848},
abstract = {MPOOL is an object-oriented extension to the MPI library, based on three categories of objects, called units, groups and schemes. Units are active objects composed of data (state) and procedures (like traditional passive objects), but with the additional ability to store incoming messages in a queue while they are active and to send messages in parallel to other units; moreover, different units may be active simultaneously. Groups and schemes are passive objects used for the composition of units and the co-ordination of their actions. Groups manage collective communications and synchronization operations such as barriers. Schemes compose units' actions through the use of a set of constructs derived by path expressions.}
}

% PTETRAD: parallel tetrahedral mesh adaptation implemented in C with MPI.
@Article{sel99:mpi-app,
author = {P. M. Selwood and M. Berzins},
title = {Parallel unstructured tetrahedral mesh adaptation: algorithms, implementation and scalability},
journal = {Concurrency-Practice and Experience},
year = 1999,
volume = 11,
number = 14,
month = DEC,
pages = {863--884},
abstract = {The use of unstructured adaptive tetrahedral meshes in the solution of transient flows poses a challenge for parallel computing due to the irregular and frequently changing nature of the data and its distribution. A parallel mesh adaptation algorithm, PTETRAD, for unstructured tetrahedral meshes (based on the serial code TETRAD) is described and analysed. The portable implementation of the parallel code in C with MPI is described and discussed. The scalability of the code is considered, analysed and illustrated by numerical experiments using a shock wave diffraction problem.}
}

% Parallel hierarchical radiosity (computer graphics) built on MPI.
@Article{meme:mpi-graphics-app,
author = {D. Meneveaux and K. Bouatouch},
title = {Synchronisation and load balancing for parallel hierarchical radiosity of complex scenes on a heterogeneous computer network},
journal = {Computer Graphics Forum},
year = 1999,
volume = 18,
number = 4,
month = DEC,
pages = {201--212},
abstract = {In this paper we propose a SPMD parallel hierarchical radiosity algorithm relying on a novel partitioning method which may apply to any kind of architectural scene. This algorithm is based on MPI (Message Passing Interface), a communication library which allows the use of either a heterogeneous set of concurrent computers or a parallel computer or both. The database is stored on a common directory and accessed by all the processors (through NFS in case of a network of computers). As the objective is to handle complex scenes such as building interiors, to cope with the problem of memory size, only a subset of the database resides in memory of each processor. This subset is determined with the help of a partitioning into 3D cells, clustering and visibility calculations. A graph expressing visibility between the resulting clusters is determined, partitioned (with a new method based on classification of K-means type) and distributed amongst all the processors. Each processor is responsible for gathering energy (using the Gauss-Seidel method) only for its subset of clusters. In order to reduce the disk transfers due to downloading these subsets of clusters, we use an ordering strategy based on the traveling salesman algorithm. Dynamic load balancing relies on a task stealing approach while termination is detected by configuring the processors into a ring and moving a token around this ring. The parallel iterative resolution is of group iterative type. Its mathematical convergence is proven in the appendix.}
}

% Semiconductor device simulation (drift-diffusion) in Fortran90 with MPI.
% NOTE(review): year set to 2000 to agree with the citation key and with
% volume 181 of this journal; confirm against the publisher record.
@Article{bova2000:mpi-app,
author = {S. W. Bova and G. F. Carey},
title = {A distributed memory parallel element-by-element scheme for semiconductor device simulation},
journal = {Computer Methods in Applied Mechanics and Engineering},
year = 2000,
volume = 181,
number = 4,
pages = {403--423},
abstract = {A domain decomposition and parallel element-by-element (EBE) scheme is developed for semiconductor device simulation modeled by the drift-diffusion (DD) equations. A classical Gummel iterative decoupling of the potential and carrier transport equations is applied on an unstructured triangulation. The distributed memory EBE scheme is formulated for a Galerkin finite element approximation of the nonlinear Poisson problem, and a modified Scharfetter-Gummel method is used for the carrier transport problem. The resulting sequences of symmetric and nonsymmetric linear systems are solved via preconditioned Krylov methods. Unstructured triangular grids are used to permit grading of the mesh, which is then partitioned to processor subdomains with appropriate data structures for message passing. Details of the parallel algorithm and data structure are provided. The scheme is implemented in Fortran90 with MPI and performance results are presented for a representative MOSFET on an IBM SP, a CRAY T3E, and an SGI/CRAY Origin2000.}
}

% Hybrid MPI + OpenMP conversion of a serial harbor wave response code.
@Article{bova2000:mpi-openmp-app,
author = {S. W. Bova and C. P. Breshears and C. E. Cuicchi and Z. Demirbilek and H. A. Gabb},
title = {Dual-level parallel analysis of harbor wave response using {MPI} and {OpenMP}},
journal = {International Journal of High Performance Computing Applications},
year = 2000,
volume = 14,
number = 1,
pages = {49--64},
abstract = {The authors describe their experiences converting an existing serial production code to a parallel code combining both MPI and OpenMP. Such dual-level parallel codes will be able to take full advantage of the emerging class of high performance computer architectures using small clusters of shared-memory processors connected via a message-passing network. While the focus is restricted to a harbor response simulation code, the techniques presented herein are appropriate for a broad class of applications that explore a parameter space. The code modifications reduced the execution time of one test case from 3100 minutes on a single CPU to just over 12 minutes on 256 CPUs. Results demonstrate that dual-level parallelism allows substantial increases in model resolution combined with improvements in simulation turnaround time but, contrary to conventional wisdom, requires very little source code alteration.}
}

% Block-cyclic array redistribution algorithm, implemented in C and MPI on an IBM SP2.
@Article{park99:mpi-app,
author = {N. Park and V. K. Prasanna and C. S. Raghavendra},
title = {Efficient algorithms for block-cyclic array redistribution between processor sets},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 1999,
volume = 10,
number = 12,
month = DEC,
pages = {1217--1240},
abstract = {Run-time array redistribution is necessary to enhance the performance of parallel programs on distributed memory supercomputers. In this paper, we present an efficient algorithm for array redistribution from cyclic(x) on P processors to cyclic(Kx) on Q processors. The algorithm reduces the overall time for communication by considering the data transfer, communication schedule, and index computation costs. The proposed algorithm is based on a generalized circulant matrix formalism. Our algorithm generates a schedule that minimizes the number of communication steps and eliminates node contention in each communication step. The network bandwidth is fully utilized by ensuring that equal-sized messages are transferred in each communication step. Furthermore, the time to compute the schedule and the index sets is significantly smaller. It takes O(max(P, Q)) time and is less than 1 percent of the data transfer time. In comparison, the schedule computation time using the state-of-the-art scheme (which is based on the bipartite matching scheme) is 10 to 50 percent of the data transfer time for similar problem sizes. Therefore, our proposed algorithm is suitable for run-time array redistribution. To evaluate the performance of our scheme, we have implemented the algorithm using C and MPI on an IBM SP2. Results show that our algorithm performs better than the previous algorithms with respect to the total redistribution time, which includes the time for data transfer, schedule, and index computation.}
}

% Meshless (RKPM) explicit dynamic analysis using explicit MPI message passing.
@Article{dan00:mpi-app,
author = {K. T. Danielson and S. Hao and W. K. Liu and R. A. Uras and S. F. Li},
title = {Parallel computation of meshless methods for explicit dynamic analysis},
journal = {International Journal for Numerical Methods in Engineering},
year = 2000,
volume = 47,
number = 7,
month = MAR,
pages = {1323--1341},
abstract = {A parallel computational implementation of modern meshless methods is presented for explicit dynamic analysis. The procedures are demonstrated by application of the Reproducing Kernel Particle Method (RKPM). Aspects of a coarse grain parallel paradigm are detailed for a Lagrangian formulation using model partitioning. Integration points are uniquely defined on separate processors and particle definitions are duplicated, as necessary, so that all support particles for each point are defined locally on the corresponding processor. Several partitioning schemes are considered and a reduced graph-based procedure is presented. Partitioning issues are discussed and procedures to accommodate essential boundary conditions in parallel are presented. Explicit MPI message passing statements are used for all communications among partitions on different processors. The effectiveness of the procedure is demonstrated by highly deformable inelastic example problems.}
}

% Genetic-algorithm shape optimization coupled to a flow solver, using MPI process groups.
@Article{mar00:mpi-app,
author = {N. Marco and S. Lanteri},
title = {A two-level parallelization strategy for Genetic Algorithms applied to optimum shape design},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = 4,
month = MAR,
pages = {377--397},
abstract = {This paper presents a two-level strategy for the parallelization of a Genetic Algorithm (GA) coupled to a compressible flow solver designed on unstructured triangular meshes. The parallel implementation is based on MPI and makes use of the process group features of this environment. The resulting algorithm is used for the optimum shape design of aerodynamic configurations. Numerical and performance results are presented for the optimization of two-dimensional airfoils for calculations performed on the following systems: an SGI Origin 2000 and an IBM SP-2 MIMD systems; a Pentium Pro (P6/200 MHz) cluster where the interconnection is realized through a FastEthernet (100 Mbits/s) switch.}
}


% Magnet design for magnetic resonance systems with codes written against MPI.
@Article{An00:mpi-app,
author = {R. E. Ansorge and T. A. Carpenter and L. D. Hall and N. R. Shaw and G. B. Williams},
title = {Use of parallel supercomputing to design magnetic resonance systems},
journal = {IEEE Transactions on Applied Superconductivity},
year = 2000,
volume = 10,
number = 1,
month = MAR,
pages = {1368--1371},
abstract = {Historically analytical methods have been the preferred approach to designing magnets and gradient sets for magnetic resonance systems. Such methods are computationally efficient but are approximate, particularly away from the axis of symmetry. Alternative methods, which are much more computationally intensive, for example Genetic Algorithms, are now becoming practical. Such methods have the advantage that they can be used for unconventional designs and for the inclusion of nonanalytical design constraints such as real-world engineering and cost limitations. Gradient coil designs have been published previously [1]-[3]. Now with the availability of more powerful computers, more ambitious designs can be undertaken using parallel computing methods. The use of a Hitachi SR2201 supercomputer and clusters of Linux PCs (Beowulf) to develop a short whole body MRI magnet for clinical applications are reported on. An important feature of these computer codes is that they have been developed to run on parallel computing systems using the MPI message passing standard. MPI is an accepted industry standard, which means that these codes can readily be ported to different parallel computers. Previous success has been achieved in using MPI for a variety of other Medical Imaging problems [4].}
}

% Race detection and deterministic replay for MPI programs (debugging support).
% Accented letters are written as brace-wrapped special characters so that
% BibTeX sorts and labels the names correctly.
@InProceedings{cle95:mpi-debugging,
author = {C. Cl{\'e}men{\c{c}}on and J. Fritscher and M. J. Meehan and R. R{\"u}hl},
title = {An Implementation of Race Detection and Deterministic Replay with {MPI}},
booktitle = {Proceedings of Euro-Par'95},
volume = 966,
series = {LNCS},
year = 1995,
publisher = {Springer-Verlag},
month = AUG,
pages = {155--166},
meetingloc = {Stockholm, Sweden}
}


% Penetration analysis with a meshless target model; MPI handles interprocessor communication.
@Article{danad00:mpi-app,
author = {K. T. Danielson and M. D. Adley},
title = {A meshless treatment of three-dimensional penetrator targets for parallel computation},
journal = {Computational Mechanics},
year = 2000,
volume = 25,
number = 3,
month = MAR,
pages = {267--273},
abstract = {A meshless modeling procedure of three-dimensional targets for penetration analysis on parallel computing systems is described. Buried structures are modeled by arbitrary layers of concrete and geologic materials, and the projectile is modeled by standard finite elements. Penetration resistance of the buried structure is provided by functions derived from principles of dynamic cavity expansion. The resistance functions are influenced by the target material properties and projectile kinematics. Additional capabilities accommodate the varying structural and geometrical characteristics of the target. Coupling between the finite elements and the meshless target model is made by applying resistance loads to elements on the outer surface of the projectile mesh. Penetration experiments verify the approach. In this manner, the target is effectively modeled and the strategy is well suited for parallel processing. The procedure is incorporated into an explicit transient dynamics code, using mesh partitioning for a coarse grain parallel processing paradigm. Message Passing Interface (MPI) is used for all interprocessor communication. Large detailed finite element analyses of projectiles are performed on up to several hundred processors with excellent scalability. The efficiency of the strategy is demonstrated by analyses executed on several types of scalable computing platforms.}
}

% Lattice QCD on a Beowulf cluster; PVM, LAM, and MPICH were tested.
@Article{kim00:mpi-app,
author = {S. Kim},
title = {Lattice {QCD} on a {Beowulf} cluster},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 2000,
volume = 83,
number = 4,
month = APR,
pages = {807--809},
abstract = {Using commodity component personal computers based on Alpha processor and commodity network devices and a switch, we built an 8-node parallel computer. GNU/Linux is chosen as an operating system and message passing libraries such as PVM, LAM, and MPICH have been tested as a parallel programming environment. We discuss our lattice QCD project for a heavy quark system on this computer.}
}

% Density-functional total-energy calculation for clusters, parallelized with MPI.
% NOTE(review): "N-2/P" in the imported abstract is read here as a flattened
% superscript and restored as $N^2/P$; confirm against the published abstract.
@Article{wat00:mpi-app,
author = {N. Watari and S. Ohnishi and H. Onishi and Y. Iwasawa},
title = {Total energy estimation for {Pd/Al} bimetallic surfaces by a parallel computation scheme},
journal = {Japanese Journal of Applied Physics Part 1---Regular Papers Short Notes \& Review Papers},
year = 2000,
volume = 39,
number = {3A},
month = MAR,
pages = {1457--1461},
abstract = {A numerical calculation scheme for the multicenter problem in large molecules and clusters is presented by applying the message-passing interface (MPI) in a massively parallel computer that uses the density functional method. The multicenter problem associated with the Coulomb singularity of an atom is efficiently treated by the parallel processors by allocating several atoms into each processor element (PE). The order $N^2/P$ tuning is obtained for the Coulomb energy calculation by using the MPI which transfers Coulomb potential field between PE's. This method is applied to estimate the total energy of the reconstructed Al/Pd bimetallic surface. The energy estimation by the charge density of a superposition of isolated atomic charge fragments predict a stabilization caused by the reconstruction, being consistent with a self-consistent-field (SCF) cluster calculation of the bimetallic surface.}
}

% BSP-derived performance models that cover asynchronous MPI/PVM programs.
@Article{rod00:mpi-model,
author = {C. Rodriguez and J. L. Roda and F. Sande and D. G. Morales and F. Almeida},
title = {A new parallel model for the analysis of asynchronous algorithms},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = 6,
month = MAY,
pages = {753--767},
abstract = {The BSP model barrier synchronization imposes some limits both in the range of available algorithms and also in their performance. Although BSP programs can be translated to MPI/PVM programs, the counterpart is not true. The asynchronous nature of some MPI/PVM programs does not easily fit inside the BSP model. Through the suppression of barriers and the generalization of the concept of superstep we propose two new models, the BSP-like and the BSP without barriers (BSPWB) models. While the BSP-like extends the BSP* model to programs written using collective operations, the more general BSPWB model admits the MPI/PVM parallel asynchronous programming style. The parameters of the models and their quality are evaluated on four standard parallel platforms: the Cray T3E, the IBM SP2, the Origin 2000 and the Digital Alpha Server 8400. The study shows that the time spent in an h-relation is more independent on the number of processors than on the communication pattern. We illustrate the use of these BSP extensions through two problem-solving paradigms: the Nested Parallel Recursive Divide and Conquer Paradigm and the Virtual Pipeline Dynamic Programming Paradigm. The proposed paradigms explain how nested parallelism and processor virtualization can be introduced in MPI and PVM without having any negative impact in the performance and model accuracy. The prediction of the communication times is robust even for problems, where communication is dominated by small messages.}
}

% Domain-shifting decomposition for molecular dynamics, benchmarked with an MPI-based program.
@Article{Lie00:mpi-app,
author = {C. C. Liew and T. Ikeshoji and N. Saito and H. Inomata},
title = {Domain-shifting algorithm: A new domain-decomposition scheme for molecular dynamics simulations on parallel computers},
journal = {Progress of Theoretical Physics Supplement},
year = 2000,
number = 138,
pages = {205--210},
abstract = {A domain is conventionally defined as a stationary sub-region of the simulated system in a domain-decomposition scheme for molecular dynamics (MD) simulations on parallel computers. We proposed an algorithm where all domains pre-assigned to processors are shifted to a particular direction, beyond the displacement of particles in the system during a time-step or a period of small time-steps; as a result, it allows us to reduce the data transfer partners in the particle re-allocation procedure. We also proposed a systematic link-cell method that allows us to make use of small domain and reduces the amount of data to be transferred for updating the positions and forces of particles, in comparison to the conventional schemes. Benchmark studies of a three-dimensional Lennard-Jones system have been carried out using a parallel MD simulation program implemented via a MPI-based message-passing interface on several parallel computers. A result on a 16-CPU parallel computer system shows that the new scheme allows us to achieve a high parallel efficiency (over 75\%) for MD simulations of a system with relatively small number of particles per processor (N/P $<$ 500).}
}


% Plasma particle-in-cell simulations on a Macintosh cluster using a subset of MPI.
@Article{decyk00:mpi-app,
author = {V. K. Decyk and D. E. Dauger and P. R. Kokelaar},
title = {Plasma physics calculations on a parallel {M}acintosh cluster},
journal = {Physica Scripta},
year = 2000,
volume = {T84},
pages = {85--88},
abstract = {We have constructed a parallel cluster consisting of 16 Apple Macintosh G3 computers running the MacOS, and achieved very good performance on numerically intensive, parallel plasma particle-in-cell simulations. A subset of the MPI message-passing library was implemented in Fortran77 and C. This library enabled us to port code, without modification, from other parallel processors to the Macintosh cluster. For large problems where message packets are large and relatively few in number, performance of 50-150 MFlops/node is possible, depending on the problem. This is fast enough that 3D calculations can be routinely done. Unlike Unix-based clusters, no special expertise in operating systems is required to build and run the cluster. Full details are available on our web site: http://exodus.physics.ucla.edu/appleseed/.}
}

% Comparison of parallel preconditioners on the CRAY-T3E, with MPI for interprocess communication.
@Article{ma99:mpi-app,
author = {S. B. Ma},
title = {Comparisons of the parallel preconditioners on the {CRAY-T3E} for large nonsymmetric linear systems},
journal = {International Journal of High Speed Computing},
year = 1999,
volume = 10,
number = 3,
month = SEP,
pages = {285--300},
abstract = {In this paper we consider five types of parallel preconditioners for solving large sparse nonsymmetric linear systems on the CRAY-T3E. They are ILU(0) in the wavefront ordering, ILU(0) in the multi-coloring ordering, SSOR in the wavefront ordering, the SPAI (SParse Approximate Inverse) preconditioner, and finally Multi-color Block SOR preconditioner. The ILU(0) is known to be robust and the wavefront ordering naturally exploits the parallelism but has a limited speedup due to the nonuniform lengths of the wavefronts. Multi-coloring is an efficient way of introducing the parallelism of order(N), where N is the order of the matrix but the convergence rate often deteriorates. The SPAI type preconditioner is inherently parallel and is gaining popularity. Finally, for the 5-point Laplacian matrix SOR method is known to have a nondeteriorating rate of convergence when the multi-coloring order is adopted. Also, Block SOR is expected to incur less communication overheads in a message-passing machine. Hence, Multi-Color Block SOR method is expected to have a good performance. Experiments were conducted for the Finite Difference discretizations of two problems with various mesh sizes varying up to 1024 x 1024. MPI library was used for interprocess communications. The results show that ILU(0) in the multi-coloring ordering gives the best performance.}
}

% MPI-SIM: a library for parallel simulation that predicts the performance of MPI programs.
@Article{pra00:mpi-sim,
author = {S. Prakash and E. Deelman and R. Bagrodia},
title = {Asynchronous parallel simulation of parallel programs},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 5,
month = MAY,
pages = {385--400},
abstract = {Parallel simulation of parallel programs for large datasets has been shown to offer significant reduction in the execution time of many discrete event models. This paper describes the design and implementation of MPI-SIM, a library for the execution driven parallel simulation of task and data parallel programs. MPI-SIM can be used to predict the performance of existing programs written using MPI for message-passing, or written in UC, a data parallel language, compiled to use message-passing. The simulation models can be executed sequentially or in parallel. Parallel execution of the models are synchronized using a set of asynchronous conservative protocols. This paper demonstrates how protocol performance is improved by the use of application-level, runtime analysis. The analysis targets the communication patterns of the application. We show the application-level analysis for message passing and data parallel languages. We present the validation and performance results for the simulator for a set of applications that include the NAS Parallel Benchmark suite. The application-level optimization described in this paper yielded significant performance improvements in the simulation of parallel programs, and in some cases completely eliminated the synchronizations in the parallel execution of the simulation model.}
}

% Parallel priority queues for device and network simulation; one variant uses MPI.
% NOTE(review): "similar to" in the imported abstract is read here as a
% flattened tilde and restored as $\sim$; confirm against the published abstract.
@Article{gram00:mpi-alg,
author = {M. D. Grammatikakis and S. Liesche},
title = {Priority queues and sorting methods for parallel simulation},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 5,
month = MAY,
pages = {401--422},
abstract = {We examine the design, implementation, and experimental analysis of parallel priority queues for device and network simulation. We consider: (1) distributed splay trees using MPI, (2) concurrent heaps using shared memory atomic locks, and (3) a new, more general concurrent data structure based on distributed sorted lists, which is designed to provide dynamically balanced work allocation (with automatic or manual control) and efficient use of shared memory resources. We evaluate performance for all three data structures on a Cray-T3E900 system at KFA-Julich. Our comparisons are based on simulations of single buffers and a 64 x 64 packet switch which supports multicasting. In all implementations, PEs monitor traffic at their preassigned input/output ports, while priority queue elements are distributed across the Cray-T3E virtual shared memory. Our experiments with up to 60,000 packets and two to 64 PEs indicate that concurrent priority queues perform much better than distributed ones. Both concurrent implementations have comparable performance, while our new data structure uses less memory and has been further optimized. We also consider parallel simulation for symmetric networks by sorting integer conflict functions and implementing an interesting packet indexing scheme. The optimized message passing network simulator can process $\sim$500K packet moves in one second, with an efficiency that exceeds $\sim$50 percent for a few thousands packets on the Cray-T3E with 32 PEs. All developed data structures now form a parallel library. Although our concurrent implementations use the Cray-T3E ShMem library, portability can be derived from Open-MP or MPI-2 standard libraries, which will provide support for one-way communication and shared memory lock mechanisms.}
}

% KeLP2 run-time infrastructure for dual-tier multicomputers, compared against MPI implementations.
@Article{bad00:mpi-app,
author = {S. B. Baden and S. J. Fink},
title = {A programming methodology for dual-tier multicomputers},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 3,
month = MAR,
pages = {212--226},
abstract = {Hierarchically organized ensembles of shared memory multiprocessors possess a richer and more complex model of locality than previous generation multicomputers with single processor nodes. These dual-tier computers introduce many new factors into the programmer's performance model. We present a methodology for implementing block-structured numerical applications on dual-tier computers and a run-time infrastructure, called KeLP2, that implements the methodology. KeLP2 supports two levels of locality and parallelism via hierarchical SPMD control flow, run-time geometric meta-data, and asynchronous collective communication. KeLP applications can effectively overlap communication with computation under conditions where nonblocking point-to-point message passing fails to do so. KeLP's abstractions hide considerable detail without sacrificing performance and dual-tier applications written in KeLP consistently outperform equivalent single-tier implementations written in MPI. We describe the KeLP2 model and show how it facilitates the implementation of five block-structured applications specially formulated to hide communication latency on dual-tiered architectures. We support our arguments with empirical data from applications running on various single- and dual-tier multicomputers. KeLP2 supports a migration path from single-tier to dual-tier platforms and we illustrate this capability with a detailed programming example.}
}

% Formal, transformation-based design of MPI programs built from collective operations.
@Article{gor00:mpi-theory,
author = {S. Gorlatch},
title = {Toward formally-based design of message passing programs},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 3,
month = MAR,
pages = {276--288},
abstract = {We present a systematic approach to the development of message passing programs. Our programming model is SPMD, with communications restricted to collective operations: scan, reduction, gather, etc. The design process in such an architecture-independent language is based on correctness-preserving transformation rules, provable in a formal functional framework. We develop a set of design rules for composition and decomposition. For example, scan followed by reduction is replaced by a single reduction, and global reduction is decomposed into two faster operations. The impact of the design rules on the target performance is estimated analytically and tested in machine experiments. As a case study, we design two provably correct, efficient programs using the Message Passing Interface (MPI) for the famous maximum segment sum problem, starting from an intuitive, but inefficient, algorithm specification.}
}

% Monte Carlo ion implantation simulator parallelized with MPI (master-slave strategy).
% NOTE(review): first author's surname restored with its umlaut; confirm
% against the published article.
@Article{hos00:mpi-app,
author = {A. H{\"o}ssinger and E. Langer and S. Selberherr},
title = {Parallelization of a {M}onte {C}arlo ion implantation simulator},
journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
year = 2000,
volume = 19,
number = 5,
month = MAY,
pages = {560--567},
abstract = {We present a parallelization method based on message passing interface (MPI) for a Monte Carlo program for two-dimensional and three-dimensional (3-D) simulation of ion implantations. We use a master-slave strategy where the master process synchronizes the slaves and performs the input-output operations, while the slaves perform the physical simulation. For this method the simulation domain is geometrically distributed among several CPU's which have to exchange only very little information during the simulation. Thereby, the communication overhead between the CPU's is kept so low that it has almost no influence on the performance gain even if a standard network of workstations is used instead of a massively parallel computer to perform the simulation. We have optimized the performance gain by identifying bottlenecks of this strategy when it is applied to arbitrary geometries consisting of various materials. This requires the application of different physical models within the simulation domain and makes it impossible to determine a reasonable domain distribution before starting the simulation. Due to a feedback between master and slaves by on-line performance measurements, we obtain an almost linear performance gain on a cluster of workstations with just slightly varying processor loads. Besides the increase in performance, the parallelization method also achieves a distribution of the required memory. This allows 3-D simulations on a cluster of workstations, where each single machines would not have enough memory to perform the simulation on its own.}
}


@Article{lee00:mpi-app,
author = {J. Y. Lee and J. Pillardy and C. Czaplewski and Y. Arnautova and D. R. Ripoll and A. Liwo and K. D. Gibson and R. J. Wawak and H. A. Scheraga},
title = {Efficient parallel algorithms in global optimization of potential energy functions for peptides, proteins, and crystals},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {399--411},
abstract = {Global optimization is playing an increasing role in physics, chemistry, and biophysical chemistry. One of the most important applications of global optimization is to find the global minima of the potential energy of molecules or molecular assemblies, such as crystals. The solution of this problem typically requires huge computational effort. Even the fastest processor available is not fast enough to carry out this kind of computation in real time for the problems of real interest, e.g., protein and crystal structure prediction. One way to circumvent this problem is to take advantage of massively parallel computing. In this paper, we provide several examples of parallel implementations of global optimization algorithms developed in our laboratory. All of these examples follow the master/worker approach. Most of the methods are parallelized on the algorithmic (coarse-grain) level and one example of fine-grain parallelism is given, in which the function evaluation itself is computationally expensive. All parallel algorithms were initially implemented on an IBM/SP2 (distributed-memory) machine. In all cases, however, message passing is handled through the standard Message Passing Interface (MPI); consequently the algorithms can also be implemented on any distributed- or shared-memory system that runs MPI. The efficiency of these implementations is discussed.}
}


@Article{Sri00:mpi-app,
author = {J. Srinivasan and Y. L. Volobuev and S. L. Mielke and D. G. Truhlar},
title = {Parallel {F}ourier Path-integral {M}onte {C}arlo calculations of absolute free energies and chemical equilibria},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {446--464},
abstract = {We present a parallel implementation of the Fourier Path Integral Monte Carlo method for calculating the absolute free energies of many-body systems. The implementation adopts the message-passing paradigm for parallelization with the use of the Message Passing Interface (MPI) libraries. A portable computer program, written using Fortran 90, has been developed and tested on a variety of platforms such as the SGI Origin, the IBM SP, and the Cray T3D and T3E. We have used the program to demonstrate the efficacy of importance sampling in configuration space. We have also used the program to calculate the partition function, and hence the absolute free energies, of triatomic molecules and four-body systems.}
}

@article{pra99:mpi-app,
  author   = {B. Prameela and L. M. Patnaik},
  title    = {Parallel implementation of alternate quadrant interlocking factorisation method on star topology},
  journal  = {International Journal of High Speed Computing},
  volume   = {10},
  number   = {4},
  pages    = {361--378},
  month    = dec,
  year     = {1999},
  abstract = {This paper discusses the parallel implementation of the solution of a set of linear equations using the Alternative Quadrant Interlocking Factorisation Methods (AQIF), on a star topology. Both the AQIF and LU decomposition methods are mapped onto star topology on an IBM SP2 system, with MPI as the internode communicator. Performance parameters such as speedup, efficiency have been obtained through experimental and theoretical means. The studies demonstrate (i) a mismatch of 15\% between the theoretical and experimental results, (ii) scalability of the AQIF algorithm, and (iii) faster executing AQIF algorithm.}
}


@Article{Roy00:mpi-app,
author = {S. Roy and R. Y. Jin and V. Chaudhary and W. L. Hase},
title = {Parallel molecular dynamics simulations of alkane/hydroxylated alpha-aluminum oxide interfaces},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {210--218},
abstract = {In this paper we describe a practical implementation of parallel computation for the molecular dynamics (MD) simulation of an alkane/aluminum oxide interface. A serial MD program was converted into a parallel code utilizing the message passing interface (MPI). This code was evaluated on a twelve processor symmetrical multiprocessor as well as on a cluster of four processor SMPs. A maximum speedup of 5.25 was achieved with twelve processors on the large shared memory machine. The cluster performance saturated at a speedup of 4.5 with two nodes. High communication costs and considerable load imbalance in the system were identified as areas that need further investigation for obtaining better performance.}
}


@Article{fur00:mpi-app,
author = {T. R. Furlani and J. Kong and P. M. W. Gill},
title = {Parallelization of {SCF} calculations within {Q-Chem}},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {170--177},
abstract = {We have incorporated MPI based parallelism with dynamic load balance into the Hartree-Fock and DFT modules of Q-Chem. A series of benchmark calculations consisting of both single point energy and gradient calculations were carried out to gauge the performance of the parallel modules. Calculations were carried out on two different parallel computers, namely a shared memory Silicon Graphics Origin2000 and a distributed memory Cray T3E, to show the flexibility of the code and demonstrate the great utility of MPI. Scalability for the DFT and Hartree-Fock modules is demonstrated for up to 64 processors.}
}

@Article{Fle00:mpi-app,
author = {G. D. Fletcher and M. W. Schmidt and M. S. Gordon},
title = {The Distributed Data Interface in {GAMESS}},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {190--200},
abstract = {The Distributed Data Interface to permit storage of large data arrays in the aggregate memory of distributed memory, message passing computer systems is described. The design of this relatively small library is discussed, in regard to its implementation over SHMEM, MPI-1, or socket based message libraries. The good performance of a MP2 program using DDI is demonstrated on both PC and workstation cluster computers, and some details of the resulting message traffic are presented.}
}


@Article{She00:mpi-app,
author = {A. I. Shestakov and M. K. Prasad and J. L. Milovich and N. A. Gentile and J. F. Painter and G. Furnish},
title = {The radiation-hydrodynamic {ICF3D} code},
journal = {Computer Methods in Applied Mechanics and Engineering},
year = 2000,
volume = 187,
number = {1--2},
pages = {181--200},
abstract = {We describe the 3D high temperature plasma simulation computer code ICF3D which is being developed at the Lawrence Livermore National Laboratory. The code is portable; it runs on a variety of platforms: uniprocessors, SMPs, and MPPs. It parallelizes by decomposing physical space into disjoint subdomains and relies on message passing libraries such as MPI. ICF3D is written in the object oriented programming language C++. The mesh is unstructured and consists of a collection of hexahedra, prisms, pyramids, and/or tetrahedra. The hydrodynamics is modeled by the discontinuous finite element method which allows a natural representation of inherently discontinuous phenomena such as shocks. Continuous processes such as diffusion are modeled by conventional finite element methods. ICF3D is modular and consists of separate equation-of-state, hydrodynamic, heat conduction, and multi-group radiation transport (diffusion approximation) packages. We present results on problems relevant to Inertial Confinement Fusion which are obtained on a variety of computers, uniprocessors and MPPs.}
}

% Thanks to Jesper Larsson Traeff of CCRL NEC for the following
%
% Design
%
@inproceedings{Hempel94,
author = {Hempel, Rolf},
title = "The {MPI} Standard for Message Passing",
booktitle = "High--Performance Computing and Networking, International Conference and Exhibition, Proceedings, Volume II: Networking and Tools",
editor = {Gentzsch, Wolfgang and Harms, Uwe},
publisher = sv,
series = lncs,
volume = 797,
pages = {247--252},
year = 1994
}

@inproceedings{Hempel94:uberblick,
author = "Hempel, Rolf",
title = "Der {M}essage {P}assing {I}nterface~--~{S}tandard: ein {{\"U}}berblick",
booktitle = "Praxisorientierte {P}arallelverarbeitung, {B}eitr{{\"a}}ge zum 3. {W}orkshop {{\"u}}ber {W}issenschaftliches Rechnen, {S}chwerpunkt {P}raxisorientierte {P}arallelverarbeitung",
editor = "Horst Langend{{\"o}}rfer",
publisher = "Carl {H}anser {V}erlag",
address = "Braunschweig, Germany",
year = 1994
}

@inproceedings{Hempel96,
author = {Rolf Hempel},
title = "The Status of the {MPI} Message-Passing Standard and Its Relation to {PVM}",
booktitle = "{P}arallel {V}irtual {M}achine -- {E}uro{PVM}'96",
editor = "Bode, Arndt and Dongarra, Jack and Ludwig, Thomas and Sunderam, Vaidy",
publisher = sv,
series = lncs,
volume = 1156,
pages = {14--21},
year = 1996
}

@Article{HempelWalker99,
Author = {Rolf Hempel and David W. Walker},
Title = "The Emergence of the {MPI} Message Passing Standard for Parallel Computing",
Journal = "{C}omputer {S}tandards \& {I}nterfaces",
Publisher = {Elsevier Science},
volume = 21,
year = 1999,
Pages = {51--62}
}

%Implementation
%==============
%
%SX
%--

@inproceedings{Hempel96:mpisx,
author = "Hempel, Rolf",
title = "The {MPI} Message--Passing Standard and its Implementation on the {NEC SX--4}",
booktitle = "Proceedings of the {NEC HPC} Workshop",
editor = "Doi, Shun",
address = "Tokyo, Japan",
year = "1996"
}

@inproceedings{HempelRitzdorfZimmermann97,
Author = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Implementation of {MPI} on {NEC}'s {SX-4} Multi-Node Architecture",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 4th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = 1332,
Year = 1997,
Pages = {185--193},
}

@Article{HempelRitzdorfZimmermann98,
Author = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Efficient Message Passing Interface Implementations for {NEC} Parallel Computers",
Journal = {{NEC} Research \& Development},
Volume = 39,
Number = 4,
Year = 1998,
Pages = {408--413}
}

@inproceedings{TraffHempelRitzdorfZimmermann99,
Author = {Jesper Larsson Tr{\"{a}}ff and Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Flattening on the fly: efficient handling of {MPI} derived datatypes",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 6th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = 1697,
Year = 1999
}

%Cluster etc.
%------------

@inproceedings{GolebiewskiBaumHempel99,
Author = {\fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Markus Baum and Rolf Hempel},
Title = "High Performance Implementation of {MPI} for {Myrinet}",
Booktitle = {Parallel Computation. 4th International Conference of the {ACPC}},
publisher = sv,
Series = lncs,
Volume = 1557,
Year = 1999,
Pages = {510--521}
}

@inproceedings{GolebiewskiHempelTraff99,
Author = {\fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
Title = "Algorithms for collective communication operations on {SMP} clusters",
Booktitle = {The 1999 Workshop on Cluster-Based Computing held in conjunction with 13th {ACM-SIGARCH} International Conference on Supercomputing {(ICS'99)}},
Pages = {11--15},
Year = 1999
}

@inproceedings{BaumGolebiewskiHempelTraff99,
Author = {Markus Baum and \fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
Title = "Dual-device {MPI} Implementation for {PC} Clusters with {SMP} Nodes",
Booktitle = {{MPIDC'99} Message Passing Interface Developer's and User's Conference Journal of Papers and Presentations},
Pages = {53--60},
Year = 1999
}

@inproceedings{GolebiewskiBasermannBaumHempelRitzdorfTraff99,
Author = {\fontencoding{T1}\selectfont M. {Go\symbol{"AA}\symbol{"A6}biewski} and A. Basermann and M. Baum and R. Hempel and H. Ritzdorf and J. L. Tr{\"{a}}ff},
Title = "A {PC} Cluster with Application-Quality {MPI}",
Booktitle = {Euro-Par'99 Parallel Processing},
publisher = sv,
Series = lncs,
Volume = 1685,
Year = 1999,
Pages = {613--623},
}

%Tools
%=====

@inproceedings{HempelZimmermann96,
author = {R. Hempel and F. Zimmermann},
title = "On the automatic {PARMACS-to-MPI} transformation in application programs",
booktitle = "High-performance computing and networking: international conference and exhibition, {HPCN EUROPE} 1996, Brussels, Belgium, April 15--19, 1996: proceedings",
publisher = sv,
series = lncs,
volume = 1067,
year = 1996,
pages = {1033--1034}
}

% NOTE(review): volume/number are reproduced unchanged; they look unusual for a
% 1999 issue of this journal -- confirm against the publisher record.
@article{HempelZimmermann99,
  author  = {Hempel, Rolf and Zimmermann, Falk},
  title   = {Automatic Migration from {PARMACS} to {MPI} in Parallel {F}ortran Applications},
  journal = {{S}cientific {P}rogramming},
  year    = {1999},
  volume  = {20},
  number  = {7},
  pages   = {39--46}
}

@inproceedings{ReussnerTraffHunzelmann00,
Author = {Ralf Reussner and Jesper Larsson Tr{\"{a}}ff and Gunnar Hunzelmann},
Title = "A Benchmark for {MPI} Derived Datatypes",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 7th European {PVM/MPI} Users' Group Meeting},
Series = lncs,
Year = 2000,
Note = {To appear}
}

@inproceedings{FahringerGerndtRileyTraff00,
Author = {Thomas Fahringer and Michael Gerndt and Graham Riley and Jesper Larsson Tr{\"{a}}ff},
Title = "Specification of Performance Problems in {MPI} Programs with {ASL}",
Booktitle = {International Conference on Parallel Processing {(ICPP'00)}},
Year = 2000,
Note = {To appear}
}

%Applications
%============

@inproceedings{Traff98,
Author = {Jesper Larsson Tr{\"{a}}ff},
Title = "Portable Randomized List Ranking on Multiprocessors using {{\sf MPI}}",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 5th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = {1497},
Year = 1998,
Pages = {395--402}
}

%
% End of articles from NEC

@Article{bak00:mpi-app,
author = {J. Baker and M. Shirel},
title = {Ab initio quantum chemistry on {PC}-based parallel supercomputers},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {7--8},
month = JUL,
pages = {1011--1024},
abstract = {The advent of mass-market personal computers (PC) and the associated price reduction in virtually all computer components has brought the cost of parallel, multi-processor computers down to highly affordable levels. Four-, eight-, and even 12-processor machines, constructed from basic, readily available PC components, can be obtained today for the same price as a good-quality single-processor workstation of a few years ago. Together with now well-established parallel tools (such as the message-passing interface (MPI) or parallel virtual machine (PVM) software), state-of-the-art, fully functioning, parallel machines using the Linux operating system and the latest PC microprocessors can deliver unprecedented price/performance ratios. This article reports on the capabilities and performance of a new, fully parallel ab initio program running on commercially available four- and eight-processor PC-based supercomputers.}
}

@Article{nob00:mpi-app,
author = {R. H. Nobes and A. P. Rendell and J. Nieplocha},
title = {Computational chemistry on {F}ujitsu vector-parallel processors: Hardware and programming environment},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {7--8},
month = JUL,
pages = {869--886},
abstract = {In this and the following paper, we provide an introduction to the Fujitsu VPP range of vector-parallel supercomputers and to some of the computational chemistry software available for the VPP. Here, we consider the hardware and the design of software to exploit its capabilities. The VPP employs proprietary vector processors connected via a crossbar switch in a distributed-memory architecture. High single-node performance requires consideration of vector operand lengths, arithmetic pipe utilisation and memory-to-CPU bandwidth. Most parallel chemistry applications use either explicit 'message-passing' or a 'global-memory' paradigm, and benchmark results are presented for the communications performance of MPI, Linda and the Global Arrays.}
}


@Article{fru00:mpi-app,
author = {H. A. Fr{\"u}chtl and R. H. Nobes and A. Bliznyuk},
title = {Performance of {MOPAC} on parallel computers},
journal = {Journal of Molecular Structure-Theochem},
year = 2000,
volume = 506,
number = {spec. SI},
month = JUL,
pages = {87--97},
abstract = {Key parts of the semiempirical MOPAC program package have been ported to parallel computers using the MPI message passing-library. Parallel routines are available for the calculation of vibrational frequencies and electrostatic potentials, as well as for energies of large biomolecules via the linear-scaling MOZYME self-consistent-field method. The parallelisation strategies used are discussed, and performance measurements for benchmark calculations on three different parallel computers are presented. Frequency and ESP calculations show good scaling for up to eight nodes, independent of hardware and communications software. MOZYME calculations scale reasonably well if a fast implementation of MPI is available.}
}

@Article{geo00:mpi-impl,
author = {W. L. George and J. G. Hagedorn and J. E. Devaney},
title = {{IMPI}: Making {MPI} interoperable},
journal = {Journal of Research of the National Institute of Standards and Technology},
year = 2000,
volume = 105,
number = 3,
pages = {343+},
month = {May-June},
abstract = {The Message Passing Interface (MPI) is the de facto standard for writing parallel scientific applications in the message passing programming paradigm. Implementations of MPI were not designed to interoperate, thereby limiting the environments in which parallel jobs could be run. We briefly describe a set of protocols, designed by a steering committee of current implementors of MPI, that enable two or more implementations of MPI to interoperate within a single application. Specifically, we introduce the set of protocols collectively called Interoperable MPI (IMPI). These protocols make use of novel techniques to handle difficult requirements such as maintaining interoperability among all IMPI implementations while also allowing for the independent evolution of the collective communication algorithms used in IMPI. Our contribution to this effort has been as a facilitator for meetings, editor of the IMPI Specification document, and as an early testbed for implementations of IMPI. This testbed is in the form of an IMPI conformance tester, a system that can verify the correct operation of an IMPI-enabled version of MPI.}
}

% NOTE(review): the institution field below is empty; standard styles warn on
% an empty institution for tech reports -- fill in once the issuer is known.
@techreport{kon00:mpi-measurement,
  author      = {Alice E. Koniges and Rolf Rabenseifner and Karl Solchenbach},
  title       = {Benchmark Design for Characterization of Balanced High-Performance Architectures},
  year        = {2000},
  institution = {}
}


@Article{kanTam:mpi-app,
author = {R. Kanapady and K. K. Tamma},
title = {A unified family of generalized integration operators {[GInO]} for non-linear structural dynamics: implementation aspects},
journal = {Advances in Engineering Software},
year = 2000,
volume = 31,
number = {8--9},
pages = {639--647},
month = {Aug-Sep},
abstract = {The present paper proposes recent developments in theoretical and implementation aspects including parallel computations via a single analysis code of a unified family of generalized integration operators [GInO] in time with particular emphasis on non-linear structural dynamics. The focus of this research is on the implementation aspects including the development of coarse-grained parallel computational models for such generalized time integration operators that can be readily ported to a wide range of parallel architectures via a message-passing paradigm (using MPI) and domain decomposition techniques. The implementation aspects are first described followed by an evaluation for a range of problems which exhibit large deformation, elastic, elastic-plastic dynamic behavior. For geometric non-linearity a total Lagrangian formulation and for material non linearity elasto-plastic formulations are employed. Serial and parallel performance issues on the SGI Origin 2000 system are discussed and analyzed for illustration for selected schemes. For illustration, particular forms of [GInO] are investigated and a complete development via a single analysis code is currently underway. Nevertheless, this is the first time that such a capability is plausible and the developments further enhance computational structural dynamics areas.}
}


@Article{Gur00:mpi-app,
author = {G. P. Guruswamy},
title = {{HiMAP}: a portable super modular multilevel parallel multidisciplinary process for large scale analysis},
journal = {Advances in Engineering Software},
year = 2000,
volume = 31,
number = {8--9},
pages = {617--620},
month = {Aug-Sep},
abstract = {An efficient super modular process to simulate aeroelasticity of aerospace vehicles using high fidelity flow equations such as the Euler/Navier-Stokes equations is presented. The process is suitable for both tightly coupled and uncoupled analysis. The process is designed to execute on massively parallel processors (MPP) and work-station clusters based on a multiple-instruction, multiple-data (MIMD) architecture. The fluids discipline is parallelized using a zonal approach whereas the structures discipline is parallelized using the substructures concept. Provision is also made to include controls domain. Computations of each discipline are spread across processors using IEEE standard message passing interface (MPI) for inter processor communications. Disciplines can run in parallel using a macro utility MPIRUN developed based on MPI. In addition to discipline parallelization and coarse-grain parallelization of the disciplines, embarrassingly parallel capability to run multiple parameter cases is implemented using a script system. The combined effect of three levels of parallelization is an almost linear scalability for multiple concurrent analyses that perform efficiently on MPP.}
}

@Article{cfkl00:mpi-java,
author = {B. Carpenter and G. Fox and S. H. Ko and S. Lim},
title = {Object serialization for marshaling data in a {J}ava interface to {MPI}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 7,
pages = {539--553},
month = JUN,
abstract = {Several Java bindings to Message Passing Interface (MPI) software have been developed recently. Message buffers have usually been restricted to arrays with elements of primitive type. We discuss adoption of the Java object serialization model for marshaling general communication data in MPI-like APIs. This approach is compared with a Java transcription of the standard MPI derived datatype mechanism. We describe an implementation of the mpiJava interface to MPI that incorporates automatic object serialization. Benchmark results confirm that current JDK implementations of serialization are not fast enough for high performance messaging applications. Means of solving this problem are discussed, and benchmarks for greatly improved schemes are presented.}
}

@Article{g-l00:mpi-app,
author = {A. J. Garcia-Loureiro and T. F. Pena and J. M. Lopez-Gonzalez and L. Prat},
title = {Parallel finite element method to solve the 3{D} {P}oisson equation and its application to abrupt heterojunction bipolar transistors},
journal = {International Journal for Numerical Methods in Engineering},
year = 2000,
volume = 49,
number = 5,
pages = {639--652},
month = OCT,
abstract = {In this work we present a parallel solver for the Poisson equation for 3D abrupt heterojunction bipolar transistors (HBT). Three-dimensional simulation is essential for studying devices of small geometry as in the case we have studied. We have used an unstructured tetrahedral mesh and we have applied the finite element method (FEM), making a specific formulation for the nodes located on the interface of the regions with different characteristics. For HBT devices, it is necessary to take into account that on both sides of the interface between the different regions exist materials with different properties. Our formulation implies situating pairs of nodes in the same physical positions of the interface, associating each nodes to a region of the HBT. This way, the effects due to thermionic emission and the tunnel effect may be simulated when the Poisson and the electron and hole equations are solved in an abrupt HBT. We have applied domain decomposition methods to solve the associate linear systems. This code has been implemented for distributed memory multicomputers, making use of a message passing standard library, MPI.}
}


@Article{sch00:mpi-app,
author = {W. Schneider and P. J. McCarthy and K. Lackner and O. Gruber and K. Behler and P. Martin and R. Merkel},
title = {{ASDEX} Upgrade {MHD} equilibria reconstruction on distributed workstations},
journal = {Fusion Engineering and Design},
year = 2000,
volume = 48,
number = {1--2},
pages = {127--134},
month = AUG,
abstract = {The identification of MHD equilibrium states on the ASDEX Upgrade tokamak is a prerequisite for interpreting measurements from a wide range of diagnostics which are correlated with the shape of the plasma. The availability in realtime of plasma parameters related to the MHD state is crucial for controlling the experiment. Function Parameterization is used as a standard tool to determine the position, shape, and other global parameters of the plasma as well as the MHD equilibrium flux surfaces. The recently developed interpretive equilibrium code CLISTE now enables the calculation of MHD equilibria on an intershot timescale. These calculations are parallelized by the use of a Message Passing Interface (MPI).}
}

@article{ave00:mpi-app,
  author   = {A. Averbuch and B. Epstein and L. Ioffe and I. Yavneh},
  title    = {Efficient parallelization of a three-dimensional {N}avier-{S}tokes solver on {MIMD} multiprocessors},
  journal  = {Journal of Supercomputing},
  volume   = {17},
  number   = {2},
  pages    = {123--142},
  month    = sep,
  year     = {2000},
  abstract = {The 3-D Navier-Stokes solver was implemented on three MIMD message-passing multiprocessors (a 64-processors IBM SP2, a 20-processors MOSIX, and a 64-processors Origin 2000). The same code written with PVM and MPI software packages was executed on all the above distinct computational platforms. The examples in the paper demonstrate that we can achieve efficiency of about 60\% for as many as 64 processors on Origin 2000 on a full-size 3-D aerodynamic problem which is solved on realistic computational grids.}
}


@Article{vNie00:rmi-grid,
author = {R. van Nieuwpoort and J. Maassen and H. E. Bal and T. Kielmann and R. Veldema},
title = {Wide-area parallel programming using the remote method invocation model},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 8,
pages = {643--666},
month = JUL,
annote = {Special Issue?},
abstract = {Java's support for parallel and distributed processing makes the language attractive for metacomputing applications, such as parallel applications that run on geographically distributed (wide-area) systems. To obtain actual experience with a Java-centric approach to metacomputing, we have built and used a high-performance wide-area Java system, called Manta. Manta implements the Java Remote Method Invocation (RMI) model using different communication protocols (active messages and TCP/IP) for different networks. The paper shows how wide-area parallel applications can be expressed and optimized using Java RMI. Also, it presents performance results of several applications on a wide-area system consisting of four Myrinet-based clusters connected by ATM WANs. We finally discuss alternative programming models, namely object replication, JavaSpaces, and MPI for Java.}
}


@Article{pha00:mpi-app,
author = {S. Phadke and D. Bhardwaj and S. K. Dey},
title = {An explicit predictor-corrector solver with application to seismic wave modelling},
journal = {Computers \& Geosciences},
year = 2000,
volume = 26,
number = {9--10},
pages = {1053--1058},
month = {Nov.-Dec.},
abstract = {Wave-equation-based forward modelling using explicit finite-difference methods is a standard technique for calculating synthetic seismograms. The stability criterion restricts the size of the time step. In this paper a predictor-corrector method for solving the wave equation is described which allows the use of a larger time step. A stability analysis of the method is also carried out. Parallel implementation of the algorithm is described for a distributed computing environment which makes use of MPI and PVM message passing calls for communication between processors.}
}

@Article{oli00:mpi-app-compare,
author = {L. Oliker and R. Biswas},
title = {Parallelization of a dynamic unstructured algorithm using three leading programming paradigms},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2000,
volume = 11,
number = 9,
pages = {931--940},
month = SEP,
abstract = {The success of parallel computing in solving real-life computationally intensive problems relies on their efficient mapping and execution on large-scale multiprocessor architectures. Many important applications are both unstructured and dynamic in nature, making their efficient parallel implementation a daunting task. This paper presents the parallelization of a dynamic unstructured mesh adaptation algorithm using three popular programming paradigms on three leading supercomputers. We examine an MPI message-passing implementation on the Cray T3E and the SGI Origin2000, a shared-memory implementation using the cache coherent nonuniform memory access (CC-NUMA) feature of the Origin2000, and a multithreaded version on the newly released Tera Multithreaded Architecture (MTA). We compare several critical factors of this parallel code development, including runtime, scalability, programmability, portability, and memory overhead. Our overall results demonstrate that multithreaded systems offer tremendous potential for quickly and efficiently solving some of the most challenging real-life problems on parallel computers.}
}


@Article{pro00:mpi-impl,
author = {B. V. Protopopov and A. Skjellum},
title = {Shared-memory communication approaches for an {MPI} message-passing library},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 9,
pages = {799--820},
month = AUG,
abstract = {The contributions of this paper are three-fold. First, the authors present the taxonomy for shared-memory communication devices. Second, they show advantages and potential problems of the devices that belong to different classes of their taxonomy using the formulated design criteria. Third, they analyze communication performance of existing MPICH shared-memory devices, discuss optimizations of their performance, and show the performance gains that these optimizations yield. MPICH is used for comparison, since it is a widely used MPI implementation.}
}


@Article{dec00:mpi-app,
author = {T. Decker},
title = {Virtual data space - load balancing for irregular applications},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {13--14},
pages = {1825--1860},
month = DEC,
abstract = {Load balancing is a key issue in the development of parallel algorithms with irregular structures. Existing load balancing systems each support only one specific programming paradigm and thus are of limited use. The system VDS presented here allows concurrent use of various paradigms such as fork-join, weighted tasks, and static dags (directed acyclic graphs that are known in advance). The system provides visual performance evaluation tools to facilitate the efficient application of the system. VDS supports various communication interfaces including PVM and MPI. Thus, VDS-applications can be run on architectures ranging from workstation clusters to massively parallel systems.}
}

@Article{duan00:mpi-app,
author = {S. Duan and K. S. Anderson},
title = {Parallel implementation of a low order algorithm for dynamics of multibody systems on a distributed memory computing system},
journal = {Engineering with Computers},
year = 2000,
volume = 16,
number = 2,
pages = {96--108},
abstract = {In this paper, a new hybrid parallelisable low order algorithm, developed by the authors for multibody dynamics analysis, is implemented numerically on a distributed memory parallel computing system. The presented implementation can currently accommodate the general spatial motion of chain systems, but key issues for its extension to general tree and closed loop systems are discussed. Explicit algebraic constraints are used to increase coarse grain parallelism, and to study the influence of the dimension of system constraint load equations on the computational efficiency of the algorithm for real parallel implementation using the Message Passing Interface (MPI). The equation formulation parallelism and linear system solution strategies which are used to reduce communication overhead are addressed. Numerical results indicate that the algorithm is scalable, that significant speed-up can be obtained, and that a quasi-logarithmic relation exists between time needed for a function call and numbers of processors used. This result agrees well with theoretical performance predictions. Numerical comparisons with results obtained from independently developed analysis codes have validated the correctness of the new hybrid parallelisable low order algorithm, and demonstrated certain computational advantages.}
}

@article{nam00:mpi-app,
  author   = {A. Namazifard and I. D. Parsons},
  title    = {An {MPI} parallel implementation of {N}ewmark's method},
  journal  = {Computer-Aided Civil and Infrastructure Engineering},
  volume   = {15},
  number   = {3},
  pages    = {189--195},
  month    = may,
  year     = {2000},
  abstract = {The standard message-passing interface (MPI) is used to parallelize Newmark's method. The linear matrix equation encountered at each time step is solved using a preconditioned conjugate gradient algorithm. Data are distributed over the processors of a given parallel computer on a degree-of-freedom basis; this produces effective load balance between the processors and leads to a highly parallelized code. The portability of the implementation of this scheme is tested by solving some simple problems on two different machines: an SGI Origin2000 and an IBM SP2. The measured times demonstrate the efficiency of the approach and highlight the maintenance advantages that arise from using a standard parallel library such as MPI.}
}

@article{chp00:prgm-devlp,
  author   = {B. Chapman and J. Merlin and D. Pritchard and F. Bodin and Y. Mevel and T. Sorevik and L. Hill},
  title    = {Program development tools for clusters of shared memory multiprocessors},
  journal  = {Journal of Supercomputing},
  volume   = {17},
  number   = {3},
  pages    = {311--322},
  month    = nov,
  year     = {2000},
  abstract = {Applications are increasingly being executed on computational systems that have hierarchical parallelism. There are several programming paradigms which may be used to adapt a program for execution in such an environment. In this paper, we outline some of the challenges in porting codes to such systems, and describe a programming environment that we are creating to support the migration of sequential and MPI code to a cluster of shared memory parallel systems, where the target program may include MPI, OpenMP or both. As part of this effort, we are evaluating several experimental approaches to aiding in this complex application development task.}
}

@Article{getov00:mpi-java,
author = {V. S. Getov and P. A. Gray and V. S. Sunderam},
title = {Aspects of portability and distributed execution for {JNI}-wrapped message passing libraries},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1039--1050},
month = SEP,
abstract = {This paper discusses an approach which aims to provide legacy message passing libraries with Java-like portability in a heterogeneous, metacomputing environment. The results of such portability permit distributed computing components to be 'soft loaded' or 'soft-installed' in a dynamic fashion, onto cooperating resources for concurrent, synchronized parallel execution. This capability provides researchers with the ability to tap into a much larger resource pool and to utilize highly tuned codes for achieving performance. Necessarily, the Java programming language is a significant component. The Java Native Interface (JNI) is used to wrap message passing libraries written in other languages, and the bytecode which is generated for the front-end may be analyzed in order to completely determine the needs of the code which it wraps. This characterization allows the pre-configuration of a remote environment so as to be able to support execution. The usefulness of the portability gained by our approach is illustrated through examples showing the soft-installation of a process using an MPI computational substrate and the soft-installation of a process which requires a C-based communication library based upon the efficient multi-cast communication package, CCTL. The examples show that significant gains in performance can be achieved while allowing message passing execution to still exhibit high levels of portability.}
}

@Article{smith00:mpi-openmp,
author = {L. Smith and P. Kent},
title = {Development and performance of a mixed {OpenMP/MPI} quantum {M}onte {C}arlo code},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 12,
pages = {1121--1129},
month = OCT,
abstract = {The code has been rewritten to allow for an arbitrary mix of OpenMP and MPI parallelism. The various issues which arose during the parallelization are discussed. The performance of the mixed OpenMP/MPI code has been assessed on an SGI Origin 2000 system and the results compared and contrasted to the original MPI version.}
}

@Article{hotta00:mpi-app,
author = {A. Hotta and H. Ninokata and A. J. Baratta},
title = {Development of parallel coupling system between three-dimensional nodal kinetic code {ENTREE} and two-fluid plant simulator {TRAC/BF1}},
journal = {Journal of Nuclear Science and Technology},
year = 2000,
volume = 37,
number = 10,
pages = {840--854},
month = OCT,
abstract = {The high-speed three-dimensional neutron kinetic code ENTREE was developed based on the polynomial and semi-analytical nonlinear iterative nodal methods (PNLM and SANLM) with also introducing the discontinuity factor. In order to enhance the efficiency of transient calculation, the nonlinear correction-coupling coefficients are intermittently updated based on the changing rate of core state variables. By giving the analytical form for two-node problem matrix elements, the additional computing time in SANLM was minimized. A fast algorithm was developed for the multi table macro-cross section rebuilding process. The reactivity component model was implemented based on the variation of the neutron production and destruction terms. The code was coupled with the two-fluid thermal hydraulic plant simulator TRAC/BF1 through PVM or MPI protocols. Two codes are executed in parallel with exchanging the feedback parameters explicitly. Based on the LMW PWR transient benchmark, it was shown that both PNLM and SANLM spend less than 20\% excess computing time in comparison with the coarse mesh finite difference method (CFDM). The implementation of the discontinuity factor was verified based on the DVP problem. Adequacy and parallel efficiency of the coupling system TRAC/BF1-ENTREE was demonstrated based on the BWR cold water injection transient proposed by NEA/CRP.}
}

@Article{silva00:mpi-java,
author = {L. M. Silva and P. Martins and J. G. Silva},
title = {Heterogeneous parallel computing using {Java} and {WMPI}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1077--1091},
month = SEP,
abstract = {In this paper, we present briefly the implementation of a Java interface for WMPI, a Windows-based implementation of MPI. Then, we describe a system that is oriented for Web-based computing and present a solution to integrate WMPI with this tool by making use of a Java bridge component and the Java bindings for WMPI. This solution allows the execution of meta-applications over a mixed configuration of platforms, execution models and programming languages. The resulting system provides a way to solve the problem of heterogeneity and to unleash the potential of diverse computational resources and programming tools.}
}


% NOTE: this record describes the same article as the entry keyed
% thir00:mpi-java that follows it (identical author, title, journal, volume,
% number, pages, and month); that entry additionally carries an abstract.
% NOTE(review): neither key is removed here because documents may cite
% either one -- confirm before merging the two records.
@Article{thir00:mpi-impl,
author = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
title = {Java on networks of workstations ({JavaNOW}): a parallel computing framework inspired by {Linda} and the {M}essage {P}assing {I}nterface ({MPI})},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1093--1116},
month = SEP
}

@Article{thir00:mpi-java,
author = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
title = {Java on networks of workstations ({JavaNOW}): a parallel computing framework inspired by {Linda} and the {M}essage {P}assing {I}nterface ({MPI})},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1093--1116},
month = SEP,
abstract = {JavaNOW provides a simple yet powerful framework for performing computation on networks of workstations. In addition to the Linda memory model, it provides for shared objects, implicit multithreading, implicit synchronization, object dataflow, and collective communications similar to those defined in MPI. JavaNOW is also a component of the Computational Neighborhood, a Java enabled suite of services for desktop computational sharing. The intent of JavaNOW is to present an environment for parallel computing that is both expressive and reliable and ultimately can deliver good to excellent performance. As JavaNOW is a work in progress, this article emphasizes the expressive potential of the JavaNOW environment and presents preliminary performance results only.}
}

@Article{carp00:mpi-java,
author = {B. Carpenter and V. Getov and G. Judd and A. Skjellum and G. Fox},
title = {{MPJ: MPI}-like message passing for {Java}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1019--1038},
month = SEP,
abstract = {Recently, there has been a lot of interest in using Java for parallel programming. Efforts have been hindered by lack of standard Java parallel programming APIs. To alleviate this problem, various groups started projects to develop Java message passing systems modelled on the successful Message Passing Interface (MPI). Official MPI bindings are currently defined only for C, Fortran, and C++, so early MPI-like environments for Java have been divergent. This paper relates an effort undertaken by a working group of the Java Grande Forum, seeking a consensus on an MPI-like API, to enhance the viability of parallel programming using Java.}
}


@Article{wall00:mpi-openmp,
author = {A. J. Wallcraft},
title = {{SPMD OpenMP} versus {MPI} for ocean models},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 12,
pages = {1155--1164},
month = OCT,
abstract = {OpenMP can be used in Single Program Multiple Data (SPMD) mode by spawning N threads in the main program and having each thread act from then on similarly to a process in MPI. The initial port of one ocean model to SPMD OpenMP revealed several incompatibilities between thread-based and process-based SPMD coding styles. Adding support for threaded I/O was particularly painful, requiring modification to hundreds of lines of code. Several relatively minor additions to the OpenMP API were identified that would greatly simplify SPMD programming. Meanwhile, an alternative Fortran compiler-based SPMD API, Co-Array Fortran, became available on the Cray T3E. There is a simple mapping from SHMEM put/get library calls onto co-array assignment statements, so adding Co-Array Fortran support to the ocean models was straightforward. To extend Co-Array Fortran to machines other than the Cray T3E, a subset of the language is automatically translated into SPMD OpenMP via a nawk script. The performance of the 'native' OpenMP and translated Co-Array Fortran versions of the ocean model was virtually identical, so the former has been replaced by the latter (which is much easier to maintain).}
}

@Article{qia00:mpi-app,
author = {J. Qiang and R. D. Ryne and S. Habib},
title = {Fortran implementation of object-oriented design in parallel beam dynamics simulations},
journal = {Computer Physics Communications},
year = 2000,
volume = 133,
number = 1,
pages = {18--33},
month = DEC,
abstract = {In this paper, an object-oriented design for parallel beam transport simulations in accelerators is implemented using Fortran 90 (F90) with Message Passing Interface (MPI) and High Performance Fortran (HPF). This improves the maintainability, reusability, and extensibility of software, combined with the high performance of using MPI and the ease of parallel programming provided by HPF. The overhead associated with the object-oriented implementation has only a minor effect on performance.}
}


@Article{hu00:openmp,
author = {Y. C. Hu and H. H. Lu and A. L. Cox and W. Zwaenepoel},
title = {{OpenMP} for networks of {SMP}s},
journal = {Journal of Parallel and Distributed Computing},
year = 2000,
volume = 60,
number = 12,
pages = {1512--1530},
month = DEC,
abstract = {We present performance results for seven applications (Barnes-Hut, CLU, and Water from SPLASH-2, 3D-FFT from NAS, Red-Black SOR, TSP, and MGS) running on an SP2 with four four-processor SMP nodes. A comparison between the thread implementation and the original implementation of TreadMarks shows that using the hardware shared memory within an SMP node significantly reduces the amount of data and the number of messages transmitted between nodes and consequently achieves speedups that are up to 30\% better than the original versions. We also compare SDSM against message passing. Overall, the speedups of multithreaded TreadMarks programs are within 7-30\% of the MPI versions.}
}

@article{kry01:mpi-app,
  author   = {P. Krysl and Z. Bittnar},
  title    = {Parallel explicit finite element solid dynamics with domain decomposition and message passing: dual partitioning scalability},
  journal  = {Computers and Structures},
  volume   = {79},
  number   = {3},
  pages    = {345--360},
  month    = jan,
  year     = {2001},
  abstract = {We document not only the high-level algorithms but also the relevant communication code fragments of the message passing implementation using the MPI library, so as to empower the reader to fully verify our numerical experiments.}
}

@article{leg00:mpi-applibs,
  author  = {P. F. Leggett and S. P. Johnson and M. Cross},
  title   = {{CAPLib} - a `thin layer' message passing library to support computational mechanics codes on distributed memory parallel systems},
  journal = {Advances in Engineering Software},
  volume  = {32},
  number  = {1},
  pages   = {61--83},
  month   = dec,
  year    = {2000}
}

@Article{sad01:mpi-app,
author = {M. Sadeghi and F. Liu},
title = {Computation of mistuning effects on cascade flutter},
journal = {AIAA Journal},
year = 2001,
volume = 39,
number = 1,
pages = {22--28},
month = JAN,
abstract = {A computational method is described for predicting flutter of turbomachinery cascades with mistuned blades. The method solves the unsteady Euler/Navier-Stokes equations for multiple-blade passages on a parallel computer using the message passing interface. A second-order implicit scheme with dual time-stepping and multigrid is used. Each individual blade is capable of moving with its own independent frequency and phase angle, thus modeling a cascade with mistuned blades. Flutter predictions are performed through the energy method. Both phase-angle and frequency mistuning are studied. It is found that phase-angle mistuning has little effect on stability, whereas frequency mistuning significantly changes the aerodynamic damping. The important effect of frequency mistuning is to average out the aerodynamic damping of the tuned blade row over the whole range of interblade phase angles (IBPA). If a tuned blade row is stable over most of the IBPA range, the blades can be stabilized for the complete IBPA range through appropriate frequency mistuning.}
}


@Article{gull01:mpi-app,
author = {A. S. Gullerud and R. H. Dodds},
title = {{MPI}-based implementation of a {PCG} solver using an {EBE} architecture and preconditioner for implicit, 3-{D} finite element analysis},
journal = {Computers and Structures},
year = 2001,
volume = 79,
number = 5,
pages = {553--575},
month = FEB,
abstract = {This work describes a coarse-grain parallel implementation of a linear preconditioned conjugate gradient solver using an element-by-element architecture and preconditioner for computation. The solver, implemented within a nonlinear, implicit finite element code, uses an MPI-based message-passing approach to provide portable parallel execution on shared, distributed, and distributed-shared memory computers. The flexibility of the element-by-element approach permits a dual-level mesh decomposition; a coarse, domain-level decomposition creates a load-balanced domain for each processor for parallel computation, while a second level decomposition breaks each domain into blocks of similar elements (same constitutive model, order of integration, element type) for fine-grained parallel computation on each processor. The key contribution here is a new parallel implementation of the Hughes-Winget (HW) element-by-element preconditioner suitable for arbitrary, unstructured meshes. The implementation couples an unstructured dependency graph with a new balanced graph-coloring algorithm to schedule parallel computations within and across domains. The code also includes the diagonal preconditioner and a modern parallel (threaded) sparse direct solver for comparison. Three example problems with up to 158,000 elements and 180,000 nodes analyzed on an SGI/Cray Origin 2000 illustrate the parallel performance of the algorithms and preconditioners. Analyses with varying block sizes illustrate that the two-level decomposition improves overall execution speed with the block size tuned for the cache memory architecture of the executing platform. This implementation of the HW preconditioner shows reasonable parallel efficiency - typically 80\%, on 48 processors. Efficiency for the diagonal preconditioner is also high, with total speedups reaching 86\% on 48 CPUs. Calculation of the tangent element stiffnesses shows superlinear speedups for each of the test problems, while the computation of strains/stresses/residual forces shows 80\% parallel efficiency on 48 processors.}
}



@Article{scot01:mpi-app,
author = {J. A. Scott},
title = {A parallel frontal solver for finite element applications},
journal = {International Journal for Numerical Methods in Engineering},
year = 2001,
volume = 50,
number = 5,
pages = {1131--1144},
month = FEB,
abstract = {In finite element simulations, the overall computing time is dominated by the time needed to solve large sparse linear systems of equations. We report on the design and development of a parallel frontal code that can significantly reduce the wallclock time needed for the solution of these systems. The algorithm used is based on dividing the finite element domain into subdomains and applying the frontal method to each subdomain in parallel. The so-called multiple front approach is shown to reduce the amount of work and memory required compared with the frontal method and, when run on a small number of processes, achieves good speedups. The code, HSL\_MP42, has been developed for the Harwell Subroutine Library (http://www.numerical.rl.ac.uk/hsl). It is written in Fortran 90 and, by using MPI for message passing, achieves portability across a wide range of modern computer architectures.}
}

@Article{alta01:mpi-eval,
author = {K. Al-Tawil and C. A. Moritz},
title = {Performance modeling and evaluation of {MPI}},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 2,
pages = {202--223},
abstract = {Users of parallel machines need to have a good grasp for how different communication patterns and styles affect the performance of message-passing applications. LogGP is a simple performance model that reflects the most important parameters required to estimate the communication performance of parallel computers. The message passing interface (MPI) standard provides new opportunities for developing high performance parallel and distributed applications. In this paper, we use LogGP as a conceptual framework for evaluating the performance of MPI communications on three platforms: Cray Research T3D, Convex Exemplar 1600SP, and a network of workstations (NOW). We develop a simple set of communication benchmarks to extract the LogGP parameters. Our objective in this is to compare the performance of MPI communication on several platforms and to identify a performance model suitable for MPI performance characterization. In particular, two problems are addressed: how LogGP quantifies MPI performance and what extra features are required for modeling MPI, and how MPI performance compare on the three computing platforms: Cray Research T3D, Convex Exemplar 1600SP, and workstations clusters.}
}

@Article{grif00:mpi-app,
author = {L. W. Griffin and D. J. Dorney},
title = {Simulations of the unsteady flow through the {F}astrac supersonic turbine},
journal = {Journal of Turbomachinery-Transactions of the ASME},
year = 2000,
volume = 122,
number = 2,
pages = {225--233},
month = APR,
abstract = {Analysis of the unsteady aerodynamic environment in the Fastrac supersonic turbine is presented. Model analysis of the turbine blades indicated possible resonance in crucial operating ranges of the turbopump. Unsteady computational fluid dynamics (CFD) analysis was conducted to support the aerodynamic and structural dynamic assessments of the turbine. Before beginning the analysis, two major problems with current unsteady analytical capabilities had to be addressed: modeling a straight centerline nozzle with the turbine blades and exit guide vanes (EGVs), and reducing run times significantly while maintaining physical accuracy. Modifications were made to the CFD code used in this study to allow the coupled nozzle/blade/EGV analysis and to incorporate Message Passing Interface (MPI) software. Because unsteadiness is a key issue for the Fastrac turbine [and future rocket engine turbines such as the Reusable Launch Vehicle (RLV)], calculations were performed for two nozzle-to-blade axial gaps. Calculations were also performed for the nozzle alone, and the results were imposed as an inlet boundary condition for a blade/EGV calculation for the large gap case. These results are compared to the nozzle/blade/EGV results.}
}

@Article{des01:mpi-app,
author = {J. C. Desplat and I. Pagonabarraga and P. Bladon},
title = {{LUDWIG: A} parallel {L}attice-{B}oltzmann code for complex fluids},
journal = {Computer Physics Communications},
year = 2001,
volume = 134,
number = 3,
pages = {273--290},
month = MAR,
abstract = {This paper describes Ludwig, a versatile code for the simulation of Lattice-Boltzmann (LB) models in 3D on cubic lattices. In fact, Ludwig is not a single code, but a set of codes that share certain common routines, such as I/O and communications. If Ludwig is used as intended, a variety of complex fluid models with different equilibrium free energies are simple to code, so that the user may concentrate on the physics of the problem, rather than on parallel computing issues. Thus far, Ludwig's main application has been to symmetric binary fluid mixtures. We first explain the philosophy and structure of Ludwig which is argued to be a very effective way of developing large codes for academic consortia. Next we elaborate on some parallel implementation issues such as parallel I/O, and the use of MPI to achieve full portability and good efficiency on both MPP and SMP systems. Finally, we describe how to implement generic solid boundaries, and look in detail at the particular case of a symmetric binary fluid mixture near a solid wall. We present a novel scheme for the thermodynamically consistent simulation of wetting phenomena, in the presence of static and moving solid boundaries, and check its performance.}
}

@Article{tan00:mpi-impl,
author = {H. Tang and K. Shen and T. Yang},
title = {Program transformation and runtime support for threaded {MPI} execution on shared-memory machines},
journal = {ACM Transactions on Programming Languages and Systems},
year = 2000,
volume = 22,
number = 4,
pages = {673--700},
month = JUL,
abstract = {Parallel programs written in MPI have been widely used for developing high-performance applications on various platforms. Because of a restriction of the MPI computation model, conventional MPI implementations on shared-memory machines map each MPI node to an OS process, which can suffer serious performance degradation in the presence of multiprogramming. This paper studies compile-time and runtime techniques for enhancing performance portability of MPI code running on multiprogrammed shared-memory machines. The proposed techniques allow MPI nodes to be executed safely and efficiently as threads. Compile-time transformation eliminates global and static variables in C code using node-specific data. The runtime support includes an efficient and provably correct communication protocol that uses lock-free data structure and takes advantage of address space sharing among threads. The experiments on SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and that it has significant performance advantages in a multiprogrammed environment.}
}

@Article{dim01:mpi-app,
author = {I. Dimov and V. Alexandrov and A. Karaivanova},
title = {Parallel resolvent {M}onte {C}arlo algorithms for linear algebra problems},
journal = {Mathematics and Computers in Simulation},
year = 2001,
volume = 55,
number = {1--3},
pages = {25--35},
month = FEB,
abstract = {In this paper, we consider Monte Carlo (MC) algorithms based on the use of the resolvent matrix for solving linear algebraic problems. Estimates for the speedup and efficiency of the algorithms are presented. Some numerical examples performed on cluster of workstations using MPI are given.}
}

@article{luCai01:mpi-app,
  author   = {Q. M. Lu and D. S. Cai},
  title    = {Implementation of parallel plasma particle-in-cell codes on {PC} cluster},
  journal  = {Computer Physics Communications},
  volume   = {135},
  number   = {1},
  pages    = {93--104},
  month    = mar,
  year     = {2001},
  abstract = {Plasma particle-in-cell (PIC) codes model the interaction of charged particles with the surrounding fields, and they have been implemented on many advanced parallel computers. Recently, many PC clusters which consist of inexpensive PCs have been developed to do parallel computing, and we also build such a PC cluster. In this paper, we present the implementation of a parallel plasma PIC code on our PC cluster using MPI, PGHPF and JavaMPI.}
}

@Article{yas01:mpi-app,
author = {O. Yasar},
title = {A new ignition model for spark-ignited engine simulations},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = {1--2},
pages = {179--200},
month = JAN,
abstract = {The amount of spark energy deposited into the combustion chamber is key to an optimum ignition as one can end up with misfires when this energy is low or with other undesired effects on engine performance and byproducts when it is high. Experimentally, up to now, no one has been able to correlate the combustion outcome accurately to the spark parameters in a controllable way. Theoretical investigation and computer modeling is leading to a better understanding of how spark flames propagate. A new computational approach to ignition dynamics is presented here for spark-ignited (SI) engine combustion simulations. Our computational model, using the MPI communication library, attempts to solve temporal and spatial equations of the electromagnetic (EM) equations in conjunction with the well-known Navier-Stokes equations of the standard KIVA-3 engine code. The interaction between the gas and the flame (plasma) kernel in the spark region is computed through the momentum and energy exchange between these two fields. Preliminary results show a distinct spatial distribution of physical quantities at the flame front and within the inflammation zone. A slight change in the spark discharge current has significant impact on the combustion and emissions. Enhanced accuracy of spark ignition modeling might help us better compute the early flame propagation and its influence on the cyclic variability of engines, potentially leading to design of new spark plugs.}
}


@Article{lin01:mpi-graphics,
author = {W. S. Lin and R. W. H. Lau and K. Hwang and X. L. Lin and P. Y. S. Cheung},
title = {Adaptive parallel rendering on multiprocessors and workstation clusters},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 3,
pages = {241--258},
month = MAR,
abstract = {This paper presents the design and performance of a new parallel graphics renderer for 3D images. This renderer is based on an adaptive supersampling approach that works for time/space-efficient execution on two classes of parallel computers. Our rendering scheme takes subpixel supersamples only along polygon edges. This leads to a significant reduction in rendering time and in buffer memory requirements. Furthermore, we offer a balanced rasterization of all transformed polygons. Experimental results prove these advantages on both a shared-memory SGI multiprocessor server and a Unix cluster of Sun workstations. We reveal performance effects of the new rendering scheme on subpixel resolution, polygon number, scene complexity, and memory requirements. The balanced parallel renderer demonstrates scalable performance with respect to increase in graphic complexity and in machine size. Our parallel renderer outperforms Crow's scheme in benchmark experiments performed. The improvements are made in three fronts: 1) reduction in rendering time, 2) higher efficiency with balanced workload, and 3) adaptive to available buffer memory size. The balanced renderer can be more cost-effectively embedded within many 3D graphics algorithms, such as those for edge smoothing and 3D visualization. Our parallel renderer is MPI-coded, offering high portability and cross-platform performance. These advantages can greatly improve the QoS in 3D imaging and in real-time interactive graphics.}
}

@Article{got01:mpi-openmp-app,
author = {S. Gottlieb and S. Tamhankar},
title = {Benchmarking {MILC} code with {OpenMP} and {MPI}},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 2001,
volume = 94,
pages = {841--845},
month = MAR,
abstract = {A trend in high performance computers that is becoming increasingly popular is the use of symmetric multiprocessing (SMP) rather than the older paradigm of MPP. MPI codes that ran and scaled well on MPP machines can often be run on an SMP machine using the vendor's version of MPI. However, this approach may not make optimal use of the (expensive) SMP hardware. More significantly, there are machines like Blue Horizon, an IBM SP with 8-way SMP nodes at the San Diego Supercomputer Center that can only support 4 MPI processes per node (with the current switch). On such a machine it is imperative to be able to use OpenMP parallelism on the node, and MPI between nodes. We describe the challenges of converting MILC MPI code to using a second level of OpenMP parallelism, and benchmarks on IBM and Sun computers.}
}


@Article{cha00:mpi-app,
author = {T. Chan and V. Eijkhout},
title = {Design of a library of parallel preconditioners},
journal = {International Journal of High Performance Computing Applications},
year = 2000,
volume = 14,
number = 2,
pages = {91--101},
month = {Summer},
abstract = {The authors outline the design principles underlying the ParPre library of parallel preconditioners. ParPre is a message-passing library of distributed preconditioners for linear systems, written using MPI and Petsc. It comprises Schwarz methods, Schur system domain decompositioning, various parallel incomplete factorizations, and multilevel methods.}
}


@Article{gro00:mpi-app,
author = {W. Gropp and D. Keyes and L. C. McInnes and M. D. Tidriri},
title = {Globalized {N}ewton-{K}rylov-{S}chwarz algorithms and software for parallel implicit {CFD}},
journal = {International Journal of High Performance Computing Applications},
year = 2000,
volume = 14,
number = 2,
pages = {102--136},
month = {Summer},
abstract = {Implicit solution methods are important in applications modeled by PDEs with disparate temporal and spatial scales. Because such applications require high resolution with reasonable turnaround, parallelization is essential. The pseudo-transient matrix-free Newton-Krylov-Schwarz (Psi NKS) algorithmic framework is presented as a widely applicable answer. This article shows that for the classical problem of three-dimensional transonic Euler flow about an M6 wing, Psi NKS can simultaneously deliver globalized, asymptotically rapid convergence through adaptive pseudo-transient continuation and Newton's method; reasonable parallelizability for an implicit method through deferred synchronization and favorable communication-to-computation scaling in the Krylov linear solver; and high per processor performance through attention to distributed memory and cache locality, especially through the Schwarz preconditioner. Two discouraging features of Psi NKS methods are their sensitivity to the coding of the underlying PDE discretization and the large number of parameters that must be selected to govern convergence. The authors therefore distill several recommendations from their experience and reading of the literature on various algorithmic components of Psi NKS, and they describe a freely available MPI-based portable parallel software implementation of the solver employed here.}
}


@Article{man01:mpi-app-perf,
author = {J. W. Manke and G. D. Kerlick and D. Levine and S. Banerjee and E. Dillon},
title = {Parallel performance of two applications in the {B}oeing high performance computing benchmark suite},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 4,
pages = {457--475},
month = MAR,
abstract = {We describe our work to evaluate the performance of the parallel versions of two floating-point-intensive engineering applications from Boeing's high performance computing benchmark suite (BHPCBS) on emerging RISC parallel systems and PC clusters. The first application is a computational fluid dynamics (CFD) code, OVERFLOW, developed by NASA and used by Boeing for analysis and design of advanced aircraft. The second application is a prototype of a computational electromagnetics (CEM) code, developed by Boeing and used for radar cross-section studies. The distributed memory parallel versions of both applications use the message passing interface (MPI) standard for message passing. The goal of our work was to determine whether RISC parallel systems and PC clusters, which offer high performance at low cost, may be able to meet Boeing's computing requirements in the future. We describe the test environments for the studies, discuss parallelization issues and strategies and present performance data for the two applications.}
}


@Article{bag01:mpi-perf,
author = {R. Bagrodia and E. Deelman and T. Phan},
title = {Parallel simulation of large-scale parallel applications},
journal = {International Journal of High Performance Computing Applications},
year = 2001,
volume = 15,
number = 1,
pages = {3--12},
month = {Spring},
abstract = {Accurate and efficient simulation of large parallel applications can be facilitated with the use of direct execution and parallel discrete-event simulation. This paper describes MPI-SIM, a direct execution-driven parallel simulator designed to predict the performance of existing MPI and MPI-IO applications. MPI-SIM can be used to predict the performance of these programs as a function of architectural characteristics, including number of processors, message communication latencies, caching algorithms, and alternative implementations of collective I/O operations. Results are presented, which show the use of MPI-SIM in performing a scalability study of real-world applications. The benchmarks chosen for the study include Sweep3D, one of the ASCI benchmarks, and BTIO, an I/O-intensive benchmark from the NAS Parallel Benchmark suite. MPI-SIM is shown to accurately and efficiently predict the performance of Sweep3D running on an Origin 2000. It is also used to demonstrate the impact of the number of I/O nodes on BTIO's performance.}
}


@Article{hoe01:mpi-openmp,
author = {J. Hoeflinger and P. Alavilli and T. Jackson and B. Kuhn},
title = {Producing scalable performance with {OpenMP}: {E}xperiments with two {CFD} applications},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 4,
pages = {391--413},
month = MAR,
abstract = {OpenMP is a relatively new programming paradigm, which can easily deliver good parallel performance for small numbers ($<16$) of processors. Success with more processors is more difficult to produce. MPI is a relatively mature programming paradigm, and there have been many reports of highly scalable MPI codes for large numbers (hundreds, even thousands) of processors. In this paper, we explore the causes of poor scalability with OpenMP from two points of view. First, we incrementally transform the loops in a combustion application until we achieve reasonably good parallel scalability, and chronicle the effect of each step. Then, we approach scalability from the other direction by transforming a highly scalable program simulating the core flow of a solid-fuel rocket engine (originally written with MPI calls) directly to OpenMP, and report the barriers to scalability that were detected. The list of incremental transformations includes well-known techniques such as loop interchange and loop fusion, plus new ones which make use of the unique features of OpenMP, such as barrier removal and the use of ordered serial loops. The list of barriers to scalability includes the use of the ALLOCATE statement within a parallel region, as well as the lack of a reduction clause for a PARALLEL region in OpenMP. We conclude with a list of key issues which need to be addressed to make OpenMP a more easily scalable paradigm. Some of these are OpenMP implementation issues; some are language issues.}
}

@Article{wal01:mpi-app,
author = {R. L. Walker},
title = {Search engine case study: searching the web using genetic programming and {MPI}},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 1,
pages = {71--89},
month = JAN,
abstract = {The generation of a Web page follows distinct sources for the incorporation of information. The earliest format of these sources was an organized display of known information determined by the page designers' interest and/or design parameters. The sources may have been published in books or other printed literature, or disseminated as general information about the page designer. Due to a growth in Web pages, several new search engines have been developed in addition to the refinement of the already existing ones. The use of the refined search engines, however, still produces an array of diverse information when the same set of keywords are used in a Web search. Some degree of consistency in the search results can be achieved over a period of time when the same search engine is used, yet, most initial Web searches on a given topic are treated as final after some form of refinement/adjustment of the keywords used in the search process. To determine the applicability of a genetic programming (GP) model for the diverse set of Web documents, search strategies behind the current search engines for the World Wide Web were studied. The development of a GP model resulted in a parallel implementation of a pseudosearch engine indexer simulator. The training sets used in this study provided a small snapshot of the computational effort required to index Web documents accurately and efficiently. Future results will be used to develop and implement Web crawler mechanisms that are capable of assessing the scope of this research effort. The GP model results were generated on a network of SUN workstations and an IBM SP2.}
}


@Article{dij01:mpi-app,
author = {F. Dijkstra and J. H. van Lenthe},
title = {Software news and updates - Parallel valence bond},
journal = {Journal of Computational Chemistry},
year = 2001,
volume = 22,
number = 6,
pages = {665--672},
month = APR,
abstract = {A parallel version of the valence bond program TURTLE has been developed. In this version the calculation of matrix elements is distributed over the processors. The implementation has been done using the message-passing interface (MPI), and is, therefore, portable. The parallel version of the program is shown to be quite efficient with a speed-up of 55 at 64 processors.}
}


@Article{den01:mpi-sys,
author = {Y. F. Deng and A. Korobka},
title = {The performance of a supercomputer built with commodity components},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = {1--2},
pages = {91--108},
month = JAN,
abstract = {We built a supercomputer called Galaxy by connecting Intel Pentium-based computer nodes with Fast and Gigabit Ethernet switches. Each node has two processors at clock speeds varying from 300 to 600 MHz, up to 512 MB of memory, and small 2 Gb local disk. All nodes run the standard RedHat Linux and inter-node communication is handled by a message passing interface called MPI. Local tools are written to visualize the system performance and to balance loads. We have benchmarked a sub-Galaxy with 72 processors by NAS and Parallel LINPACK benchmark suites. We achieved 16.9 Gflops in a standard single precision LU decomposition for 46848 x 46838 matrix parallel LINPACK benchmark. A Galaxy with 128 processors costs approximately \$250 000 and it delivers 40 Gflops of performance. This leads to a cost-performance ratio of 160 Kflops-per-dollar, which is to improve further due to increase in processor speeds and network bandwidth at similar cost. Our final system with 512 processors is expected to reach several Tflops. This article first describes the Galaxy architectural details, and then present and analyze its performance in terms of floating point number crunching, network bandwidth, and IO throughput.}
}


@Article{cap01:mpi-smp-perf,
author = {F. Cappello and O. Richard and D. Etiemble},
title = {Understanding performance of {SMP} clusters running {MPI} programs},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 17,
number = 6,
pages = {711--720},
month = APR,
abstract = {Clusters of multiprocessors (CLUMPs) have an hybrid memory model, with message passing between nodes and shared memory inside nodes. We examine the performance of Myrinet clusters of SMP PCs when using a single memory model (SMM) based on the MPICH-PM/CLUMP library of the RWCP, which can directly use the MPI programs written for a cluster of uniprocessors. The specificities of the communication patterns with the SMM approach are detailed. PC clusters with 2-way and 4-way nodes are considered and compared.}
}


@Article{he01:mpi-app,
author = {Y. He and C. H. Q. Ding},
title = {Using accurate arithmetics to improve numerical reproducibility and stability in parallel applications},
journal = {Journal of Supercomputing},
year = 2001,
volume = 18,
number = 3,
pages = {259--277},
month = MAR,
abstract = {Numerical reproducibility and stability of large scale scientific simulations, especially climate modeling, on distributed memory parallel computers are becoming critical issues. In particular, global summation of distributed arrays is most susceptible to rounding errors, and their propagation and accumulation cause uncertainty in final simulation results. We analyzed several accurate summation methods and found that two methods are particularly effective to improve (ensure) reproducibility and stability: Kahan's self-compensated summation and Bailey's double-double precision summation. We provide an MPI operator MPI\_SUMDD to work with MPI collective operations to ensure a scalable implementation on large number of processors. The final methods are particularly simple to adopt in practical codes: not only global summations, but also vector-vector dot products and matrix-vector or matrix-matrix operations.}
}

@Article{pro01:mpi-impl,
author = {B. V. Protopopov and A. Skjellum},
title = {A multithreaded message passing interface ({MPI}) architecture: Performance and program issues},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 4,
pages = {449--466},
month = APR,
Abstract = {This paper discusses a multithreaded software architecture for message-passing interface (MPI) software specification. The architecture is thread-safe, allows for concurrent communication over several communications media (multifabric communication), efficiently utilizes available hardware concurrency over a wide range of target platforms, and allows for concurrent communication and computation within the limits imposed by the hardware. The architecture is developed in the framework of the MPICH software architecture, a well-known MPI implementation used worldwide. The proposed architecture adopts wide portability of the MPICH design and remedies some of its deficiencies such as inefficient multifabric communication and non-thread-safety. The paper also considers the issues concerning development of high-performance portable message-passing systems for general-purpose architectures. The contributions of the paper are improving architecture and addressing thread safety of modern reliable messaging software, as well as identifying and taking advantage of inherent concurrency in the message-passing software itself.}
}


@Article{cun01:mpi-app,
author = {R. D. da Cunha and A. L. de Bortoli},
title = {A parallel {N}avier-{S}tokes solver for the rotating flow problem},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 3,
pages = {163--180},
month = MAR,
abstract = {In this paper, we investigate the parallel solution of rotating internal flow problems, using the Navier-Stokes equations as proposed by Speziale and Thangam (in 1983) and Speziale (in 1985). A Runge-Kutta time-stepping scheme was applied to the equations and both sequential and message-passing implementations were developed, the latter using MPI, and were tested on a four-processor SGI Origin200 distributed, global shared memory parallel computer and on a 32-processor IBM 9076 SP/2 distributed memory parallel computer. The results show that our approach to parallelize the sequential implementation requires little effort whilst providing good results even for medium-sized problems.}
}

@Article{swan01:mpi-app,
author = {C. A. Swann},
title = {Software for parallel computing: The {LAM} implementation of {MPI}},
journal = {Journal of Applied Econometrics},
year = 2001,
volume = 16,
number = 2,
pages = {185--194},
month = {Mar-Apr},
abstract = {Many econometric problems can benefit from the application of parallel computing techniques, and recent advances in hardware and software have made such application feasible. There are a number of freely available software libraries that make it possible to write message passing parallel programs using personal computers or Unix workstations. This review discusses one of these---the LAM (Local Area Multiprocessor) implementation of MPI (the Message Passing Interface).}
}


@Article{reis01:mpi-app,
author = {T. G. Reisin and S. C. Wurzler},
title = {Implementation of a numerical solution of the multicomponent kinetic collection equation ({MKCE}) on parallel computers},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 5,
pages = {641--661},
month = MAY,
abstract = {Two different numerical solutions of the two-component kinetic collection equation were implemented on parallel computers. The parallelization approach included domain decomposition and MPI commands for communications. Four different parallel codes were tested. A dynamic decomposition based on an occupancy function provided the optimum balance between time performance and flexibility for any number of processors. The occupancy function was defined according to the number of calculations required at each grid point in the domain. Speed-up performance depended very much on the parallel code used and in some cases very good results were obtained for up to 32 processors.}
}

@Article{guif01:mpi-app,
author = {C. Guiffaut and K. Mahdjoubi},
title = {A parallel {FDTD} algorithm using the {MPI} library},
journal = {IEEE Antennas and Propagation Magazine},
year = 2001,
volume = 43,
number = 2,
pages = {94--103},
month = APR,
abstract = {In this paper, we describe the essential elements of a parallel algorithm for the FDTD method using the MPI (Message Passing Interface) library. To simplify and accelerate the algorithm, an MPI Cartesian 2D topology is used. The inter-process communications are optimized by the use of derived data types. A general approach is also explained for parallelizing the auxiliary tools, such as far-field computation, thin-wire treatment, etc. For PMLs, we have used a new method that makes it unnecessary to split the field components. This considerably simplifies the computer programming, and is compatible with the parallel algorithm.}
}

@Article{yao01:mpi-app,
author = {J. X. Yao and A. Jameson and J. J. Alonso and F. Liu},
title = {Development and validation of a massively parallel flow solver for turbomachinery flows},
journal = {Journal of Propulsion and Power},
year = 2001,
volume = 17,
number = 3,
pages = {659--668},
month = {May-June},
abstract = {The development and validation of the unsteady, three-dimensional, multiblock, parallel turbomachinery flow solver TFLO is presented. The unsteady Reynolds-averaged Navier-Stokes equations are solved using a cell-centered discretization on arbitrary multiblock meshes. The solution procedure is based on efficient explicit Runge-Kutta methods with several convergence acceleration techniques such as multigrid, implicit residual smoothing, and local time stepping. The solver is parallelized using domain decomposition, a single program multiple data strategy, and the message passing interface standard. Details of the communication scheme and load balancing algorithms are discussed. A general and efficient procedure for parallel interblade row interfacing is developed. The dual-time stepping technique is used to advance unsteady computations in time. The focus is on improving the parallel efficiency and scalability of the flow solver, as well as on its initial validation of steady-state calculations in multiblade row environment. The result of this careful implementation is a solver with demonstrated scalability up to 1024 processors. For validation and verification purposes, results from TFLO are compared with both existing experimental data and computational results from other computational fluid dynamics codes used in aircraft engine industry.}
}


@Article{kom01:mpi-app,
author = {Y. Komeiji and M. Haraguchi and U. Nagashima},
title = {Parallel molecular dynamics simulation of a protein},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 8,
pages = {977--987},
month = JUL,
abstract = { Program for energetic analysis of biochemical molecules (PEACH) is a software package for molecular dynamics (MD) simulation of biological molecules. The subroutines for the nonbonded interactions were modified to allow parallel computation by using the MPI library. The parallel efficiencies of the modified subroutines were close to 90\% or better when using 32 processors of an IBM SP computer. The total performance was comparable to that of the special-purpose computer MD-GRAPE with 8 LSI chips.
}
}


@Article{mar01:mpi-style,
author = {B. Di Martino and A. Mazzeo and N. Mazzocca and U. Villano},
title = {Parallel program analysis and restructuring by detection of point-to-point interaction patterns and their transformation into collective communication constructs},
journal = {Science of Computer Programming},
year = 2001,
volume = 40,
number = {2--3},
pages = {235--263},
month = JUL,
abstract = { After the presentation of the basic program analysis technique, several examples involving the detection of common communication patterns are shown. Then the structure of PPAR, a prototype tool that allows the analysis of parallel programs written in Fortran 77 with calls to PVM or MPI unstructured communication primitives is outlined, and conclusions are drawn.}
}


@Article{lee01:mpi-app,
author = {M. Lee and W. Liu and V. K. Prasanna},
title = {Parallel implementation of a class of adaptive signal processing applications},
journal = {Algorithmica},
year = 2001,
volume = 30,
number = 4,
pages = {645--684},
month = AUG,
abstract = {In this paper we present a methodology for mapping a class of adaptive signal processing applications onto HPC platforms such that the throughput performance is optimized. We first define a new task model using the salient computational characteristics of a class of adaptive signal processing applications. Based on this task model, we propose a new execution model. In the earlier linear pipelined execution model, the task mapping choices were restricted. The new model permits flexible task mapping choices, leading to improved throughput performance compared with the previous model. Using the new model, a three-step task mapping methodology is developed. It consists of (1) a data remapping step, (2) a coarse resource allocation step, and (3) a fine performance tuning step. The methodology is demonstrated by designing parallel algorithms for modern radar and sonar signal processing applications. These are implemented on IBM SP2 and Cray T3E, state-of-the-art HPC platforms, to show the effectiveness of our approach. Experimental results show significant performance improvement over those obtained by previous approaches. Our code is written using C and the Message Passing Interface (MPI). Thus, it is portable across various HPC platforms.}
}


@Article{take01:mpi-eval,
author = {K. Takeda and N. K. Allsopp and J. C. Hardwick and P. C. Macey and D. A. Nicole and S. J. Cox and D. J. Lancaster},
title = {An assessment of {MPI} environments for windows {NT}},
journal = {Journal of Supercomputing},
year = 2001,
volume = 19,
number = 3,
pages = {315--323},
Abstract = {In this paper we evaluate the MPI environments currently available for Windows NT on the Intel IA32 and Compaq/DEC Alpha architectures. We present benchmark results for low-level communication and for the NAS Parallel Benchmarks to allow comparison with other systems, but our primary interest is determining real application performance and robustness in production cluster environments. For this we use PAFEC-FE, a large FORTRAN code for finite-element analysis. We present results from three MPI implementations, two architectures, and three networking technologies (10 and 100 Mbit/s Ethernet and 1 Gbit/s Myrinet).}
}


@Article{chun01:mpi-app,
author = {S. H. Chung and H. C. Kwon and K. R. Ryu and Y. Chung and H. Jang and C. A. Choi},
title = {Information retrieval on an {SCI}-based {PC} cluster},
journal = {Journal of Supercomputing},
year = 2001,
volume = 19,
number = 3,
pages = {251--265},
Abstract = {This article presents an efficient parallel information retrieval (IR) system which provides fast information service for the Internet users on low-cost high-performance PC-NOW environment. The IR system is implemented on a PC cluster based on the scalable coherent interface (SCI), a powerful interconnecting mechanism for both shared memory models and message-passing models. In the IR system, the inverted-index file (IIF) is partitioned into pieces using a greedy declustering algorithm and distributed to the cluster nodes to be stored on each node's hard disk. For each incoming user's query with multiple terms, terms are sent to the corresponding nodes which contain the relevant pieces of the IIF to be evaluated in parallel. The IR system is developed using a distributed-shared memory (DSM) programming technique based on the SCI. According to the experiments, the IR system outperforms an MPI-based IR system using Fast Ethernet as an interconnect. Speed-up of up to 5.0 was obtained with an 8-node cluster in processing each query on a 500,000-document IIF.}
}


@Article{pic01:mpi-app,
author = {S. M. Pickles and J. M. Brooke and F. C. Costen and E. Gabriel and M. Muller and M. Resch and S. M. Ord},
title = {Metacomputing across intercontinental networks},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 17,
number = 8,
pages = {911--918},
month = JUN,
Abstract = {An intercontinental network of supercomputers spanning more than 10 000 miles and running challenging scientific applications was realized at the Supercomputing '99 (SC99) conference in Portland, OR using PACX-MPI and ATM PVCs. In this paper, we describe how we constructed the heterogeneous cluster of supercomputers, the problems we confronted in terms of multi-architecture and the way several applications handled the specific requirements of a metacomputer.}
}


@Article{sha01:mpi-model,
author = {H. Z. Shan and J. P. Singh},
title = {A comparison of {MPI}, {SHMEM} and cache-coherent shared address space programming models on a tightly-coupled multiprocessors},
journal = {International Journal of Parallel Programming},
year = 2001,
volume = 29,
number = 3,
pages = {283--318},
month = JUN,
Abstract = {We compare the performance of three major programming models on a modern, 64-processor hardware cache-coherent machine, one of the two major types of platforms upon which high-performance computing is converging. We focus on applications that are either regular, predictable or at least do not require fine-grained dynamic replication of irregularly accessed data. Within this class, we use programs with a range of important communication patterns. We examine whether the basic parallel algorithm and communication structuring approaches needed for best performance are similar or different among the models, whether some models have substantial performance advantages over others as problem size and number of processors change, what the sources of these performance differences are, where the programs spend their time, and whether substantial improvements can be obtained by modifying either the application programming interfaces or the implementations of the programming models on this type of tightly-coupled multiprocessor platform.}
}


@Article{dem01:mpi-extension,
author = {E. D. Demaine and I. Foster and C. Kesselman and M. Snir},
title = {Generalized communicators in the message passing interface},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 6,
pages = {610--616},
month = JUN,
abstract = {We propose extensions to the Message Passing Interface (MPI) that generalize the MPI communicator concept to allow multiple communication endpoints per process, dynamic creation of endpoints, and the transfer of endpoints between processes. The generalized communicator construct can be used to express a wide range of interesting communication structures, including collective communication operations involving multiple threads per process, communications between dynamically created threads or processes, and object-oriented applications in which communications are directed to specific objects. Furthermore, this enriched functionality can be provided in a manner that preserves backward compatibility with MPI. We describe the proposed extensions, illustrate their use with examples, and describe a prototype implementation in the popular MPI implementation MPICH.}
}


@Article{tro01:mpi-app,
author = {R. Trobec and M. Sterk and M. Praprotnik and D. Janezic},
title = {Implementation and evaluation of {MPI}-based parallel {MD} program},
journal = {International Journal of Quantum Chemistry},
year = 2001,
volume = 84,
number = 1,
pages = {23--31},
month = JUL,
abstract = {The message-passing interface (MPI)-based object-oriented particle-particle interactions (PPI) library is implemented and evaluated. The library can be used in the n-particle simulation algorithm designed for a ring of p interconnected processors. The parallel simulation is scalable with the number of processors, and has the time requirement proportional to $n^2/p$ if $n/p$ is large enough, which guarantees optimal speedup. In a certain range of problem sizes, the speedup becomes superlinear because enough cache memory is available in the system. The library is used in a simple way by any potential user, even with no deep programming knowledge. Different simulations using particles can be implemented on a wide spectrum of different computer platforms. The main purpose of this article is to test the PPI library on well-known methods, e.g., the parallel molecular dynamics (MD) simulation of the monoatomic system by the second-order leapfrog Verlet algorithm. The performances of the parallel simulation program implemented with the proposed library are competitive with a custom-designed simulation code. Also, the implementation of the split integration symplectic method, based on the analytical calculation of the harmonic part of the particle interactions, is shown, and its expected performances are predicted.}
}


@Article{ahm01:mpi-alg,
author = {I. Ahmad},
title = {A distributed algorithm for finding prime compatibles on network of workstations},
journal = {Microprocessors and Microsystems},
year = 2001,
volume = 25,
number = 4,
pages = {195--202},
month = JUN,
abstract = {State minimization of incompletely specified finite state machines (FSMs) is an important step of FSM synthesis. Generation of prime compatibles is one of the core steps in state minimization of incompletely specified FSMs. It is guaranteed that a minimal solution exists, consisting of prime compatibles only. But the generation of prime compatibles is both a compute-intensive and a memory-intensive problem. In this paper, we have developed and implemented a distributed algorithm, designated as D\_Prime, to find prime compatibles on network of workstations (NOWs) under message passing interface (MPI) environment to handle the large complexity of VLSI designs in future. With the advent of high-speed networks and availability of powerful high-performance workstations, NOW has emerged as the most cost-effective platform for compute-intensive and memory-intensive applications. Comparison of results on a number of MCNC benchmarks for state minimization of incompletely specified FSMs showed that a considerable speedup can be achieved by the proposed distributed algorithm as compared with the existing sequential counterparts.}
}


@Article{ino01:mpi-model,
author = {F. Ino and N. Fujimoto and K. Hagihara},
title = {{LogGPS}: A parallel computational model for synchronization analysis},
journal = {ACM SIGPLAN Notices},
year = 2001,
volume = 36,
number = 7,
pages = {133--142},
month = JUL,
abstract = {We also present some experimental results using both models. The results include (1) a verification of the LogGPS model, (2) an example of synchronization analysis using an MPI program and (3) a comparison of the models. The results indicate that the LogGPS model is more accurate than the LogGP model, and analyzing synchronization costs is important when improving parallel program performance.}
}


@Article{zha01:mpi-app,
author = {W. S. Zhang and G. Q. Zhang},
title = {Prestack depth migration by hybrid method with high precision and its parallel implementation},
journal = {Chinese Journal of Geophysics-Chinese Edition},
year = 2001,
volume = 44,
number = 4,
pages = {542--551},
month = JUL,
abstract = {Prestack depth migration is an important imaging method for complex geological structures. In this paper a generalized system of wavefield continuation is presented based on the wavefield splitting theory. The system is coupled by downgoing and upgoing waves, and the commonly used equation of wavefield continuation is only a special case of the coupled system. Based on the approximation of square root operator, a new hybrid migration method with high precision is derived. The method can be implemented numerically through splitting technique. Finally, two numerical migration examples are given, one is the poststack depth migration for a model with large lateral velocity contrasts, another is the prestack depth migration for Marmousi model with complex structures. The numerical results show the effectiveness and high precision of the method. The MPI parallel calculation is adopted in order to raise computational efficiency. The method can be used to obtain precise images for complex structures with large lateral velocity variations.}
}



% NOTE(review): the key's category suffix "mpi-xxx" looks like a placeholder;
% the key is left unchanged so existing citations keep resolving.
@Article{ant01:mpi-xxx,
author = {G. Antoniu and L. Boug{\'e} and P. Hatcher and M. MacBeth and K. McGuigan and R. Namyst},
title = {The {H}yperion system: Compiling multithreaded {J}ava bytecode for distributed execution},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 10,
pages = {1279--1297},
month = SEP,
abstract = {Our work combines Java compilation to native code with a run-time library that executes Java threads in a distributed memory environment. This allows a Java programmer to view a cluster of processors as executing a single JAVA virtual machine. The separate processors are simply resources for executing Java threads with true parallelism, and the run-time system provides the illusion of a shared memory on top of the private memories of the processors. The environment we present is available on top of several UNIX systems and can use a large variety of communication interfaces thanks to the high portability of its run-time system. To evaluate our approach, we compare serial C, serial Java, and multithreaded Java implementations of a branch-and-bound solution to the minimal-cost map-coloring problem. All measurements have been carried out on two platforms using two different communication interfaces: SISCI/SCI and MPI-BIP/Myrinet.}
}

@Article{sar01:mpi-app,
author = {K. C. Sarma and H. Adeli},
title = {Bilevel parallel genetic algorithms for optimization of large steel structures},
journal = {Computer-Aided Civil and Infrastructure Engineering},
year = 2001,
volume = 16,
number = 5,
pages = {295--304},
month = SEP,
abstract = {This article is concerned with optimization of very large steel structures subjected to the actual constraints of the American Institute of Steel Construction ASD and LRFD specifications on high-performance multiprocessor machines using biologically inspired genetic algorithms. First, parallel fuzzy genetic algorithms (GAs) are presented for optimization of steel structures using a distributed memory Message Passing Interface (MPI) with two different schemes: the processor farming scheme and the migration scheme. Next, two bilevel parallel GAs are presented for large-scale structural optimization through judicious combination of shared memory data parallel processing using the OpenMP Application Programming Interface (API) and distributed memory message passing parallel processing using MPI. Speedup results are presented for parallel algorithms.}
}

@Article{yil01:mpi-app,
author = {E. Yilmaz and M. S. Kavsaoglu and H. U. Akay and I. S. Akmandor},
title = {Cell-vertex based parallel and adaptive explicit 3{D} flow solution on unstructured grids},
journal = {International Journal of Computational Fluid Dynamics},
year = 2001,
volume = 14,
number = 4,
pages = {271--286},
abstract = {A parallel adaptive Euler flow solution algorithm is developed for 3D applications on distributed memory computers. Significant contribution of this research is the development and implementation of a parallel grid adaptation scheme together with an explicit cell vertex-based finite volume 3D flow solver on unstructured tetrahedral grids. Parallel adaptation of grids is based on grid-regeneration philosophy by using an existing serial grid generation program. Then, a general partitioner repartitions the grid. An adaptive sensor value, which is a measure to refine or coarsen grids, is calculated considering the pressure gradients in all partitioned blocks of grids. The parallel performance of the present study was tested. Parallel computations were performed on Unix workstations and a Linux cluster using MPI communication library. The present results show that overall adaptation scheme developed in this study is applicable to any pair of a flow solver and grid generator with affordable cost. It is also proved that parallel adaptation is necessary for accurate and efficient flow solutions.}
}


@Article{cha01:mpi-app,
author = {H. Y. Chang and K. C. Huang and C. Y. Shen and S. C. Tcheng and C. Y. Chou},
title = {Parallel computation of a weather model in a cluster environment},
journal = {Computer-Aided Civil and Infrastructure Engineering},
year = 2001,
volume = 16,
number = 5,
pages = {365--373},
month = SEP,
abstract = {Recently, the superior and continuously improving cost-performance ratio of commodity hardware and software has made PC clustering a popular alternative for high-performance computing in both academic institutes and industrial organizations. The purpose of this work is to use PC clusters to solve a weather-prediction model in parallel mode, and the result also will be compared with those obtained on some conventional parallel platforms such as the Fujitsu VPP300, IBM SP2 (160 and 120 MHz), and HP SPP2200. Techniques of domain decomposition and data communication are used to exploit parallelism of the model. Interprocessor data communication is done by the Message Passing Interface communication library routines. Two versions of the parallel codes, one with longitude decomposition and the other with latitude decomposition, are tested and compared. Speedups of the parallel weather model on these machines with various numbers of processors show that substantial reductions in computation time can be achieved as compared with sequential runs.}
}

% NOTE(review): the key says "00" but the year field is 2001; the key is
% left unchanged so existing citations keep resolving.
@Article{bgl00:mpi-impl,
author = {Ralph Butler and William Gropp and Ewing Lusk},
title = {Components and Interfaces of a Process Management System for Parallel Programs},
journal = {Parallel Computing},
month = OCT,
year = 2001,
volume = 27,
number = 11,
pages = {1417--1429},
abstract = {Parallel jobs are different from sequential jobs and require a different type of process management. We present here a process management system for parallel programs such as those written using MPI. A primary goal of the system, which we call MPD (for multipurpose daemon), is to be scalable. By this we mean that startup of interactive parallel jobs comprising thousands of processes is quick, that signals can be quickly delivered to processes, and that stdin, stdout, and stderr are managed intuitively. Our primary target is parallel machines made up of clusters of SMPs, but the system is also useful in more tightly integrated environments. We describe how MPD enables fast startup and convenient runtime management of parallel jobs. We show how close control of stdio can support the easy implementation of a number of convenient system utilities, even a parallel debugger. We describe a simple but general interface that can be used to separate any process manager from a parallel library, which we use to keep MPD separate from MPICH.}
}


% NOTE(review): the complexity bounds and the "40 to 70 percent" range in the
% abstract were reconstructed from garbled export text; confirm against the paper.
@Article{myl01:mpi2-impl,
author = {S. Moh and C. S. Yu and B. Lee and H. Y. Youn and D. S. Han and D. Lee},
title = {Four-ary tree-based barrier synchronization for 2{D} meshes without nonmember involvement},
journal = {IEEE Transactions on Computers},
year = 2001,
volume = 50,
number = 8,
pages = {811--823},
month = AUG,
abstract = {This paper proposes a Barrier Tree for Meshes (BTM) to minimize the barrier synchronization latency for two-dimensional (2D) meshes. The proposed BTM scheme has two distinguishing features. First, the synchronization tree is 4-ary. The synchronization latency of the BTM scheme is asymptotically $\Theta(\log_4 n)$, while that of the fastest scheme reported in the literature is bounded between $\Omega(\log_3 n)$ and $O(n^{1/2})$, where $n$ is the number of member nodes. Second, nonmember nodes are neither involved in the construction of a BTM nor actively participate in the synchronization operations, which avoids interference among different process groups during synchronization. This not only results in low setup overhead, but also reduces the synchronization latency. The low setup overhead is particularly effective for the dynamic process model provided in MPI-2. Extensive simulation study shows that, for up to $64 \times 64$ meshes, the BTM scheme results in about 40 to 70 percent shorter synchronization latency and is more scalable than conventional schemes.}
}


@Article{fbd01:mpi-impl,
author = {G. E. Fagg and A. Bukovsky and J. J. Dongarra},
title = {{HARNESS} and fault tolerant {MPI}},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 11,
pages = {1479--1495},
month = OCT,
abstract = {Initial versions of MPI were designed to work efficiently on multi-processors which had very little job control and thus static process models. Subsequently forcing them to support a dynamic process model would have affected their performance. As current HPC systems increase in size with greater potential levels of individual node failure, the need arises for new fault tolerant systems to be developed. Here we present a new implementation of MPI called fault tolerant MPI (FT-MPI) that allows the semantics and associated modes of failures to be explicitly controlled by an application via a modified MPI API. Given is an overview of the FT-MPI semantics, design, example applications, debugging tools and some performance issues. Also discussed is the experimental HARNESS core (G\_HCORE) implementation that FT-MPI is built to operate upon.}
}


@Article{kbg01:mpi-impl,
author = {T. Kielmann and H. E. Bal and S. Gorlatch and K. Verstoep and R. F. H. Hofman},
title = {Network performance-aware collective communication for clustered wide-area systems},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 11,
pages = {1431--1456},
month = OCT,
abstract = {Metacomputing infrastructures couple multiple clusters (or MPPs) via wide-area networks. A major problem in programming parallel applications for such platforms is their hierarchical network structure: latency and bandwidth of WANs often are orders of magnitude worse than those of local networks. Our goal is to optimize MPI's collective operations for such platforms. We use two techniques: selecting suitable communication graph shapes, and splitting messages into multiple segments that are sent in parallel over different WAN links. To optimize graph shape and segment size at runtime, we introduce a performance model called Parameterized LogP (P-LogP), a hierarchical extension of the LogP model that covers messages of arbitrary length. An experimental performance evaluation shows that the newly implemented collective operations have significantly improved performance for large messages, and that there is a close match between the theoretical model and the measured completion times.}
}


@Article{ll01:mpi-openmp,
author = {G. R. Luecke and W. H. Lin},
title = {Scalability and performance of {OpenMP} and {MPI} on a 128-processor {SGI} {Origin2000}},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 10,
pages = {905--928},
month = AUG,
abstract = {The purpose of this paper is to investigate the scalability and performance of seven, simple OpenMP test programs and to compare their performance with equivalent MPI programs on an SGI Origin 2000. Data distribution directives were used to make sure that the OpenMP implementation had the same data distribution as the MPI implementation. For the matrix-times-vector (test 5) and the matrix-times-matrix (test 7) tests, the syntax allowed in OpenMP 1.1 does not allow OpenMP compilers to be able to generate efficient code since the reduction clause is not currently allowed for arrays. (This problem is corrected in OpenMP 2.0.) For the remaining five tests, the OpenMP version performed and scaled significantly better than the corresponding MPI implementation, except for the right shift test (test 2) for a small message.}
}


@Article{pas01:mpi-app,
author = {G. Passoni and P. Cremonesi and G. Alfonsi},
title = {Analysis and implementation of a parallelization strategy on a {N}avier-{S}tokes solver for shear flow simulations},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 13,
pages = {1665--1685},
month = DEC,
abstract = {A parallel computational solver for the unsteady incompressible three-dimensional Navier-Stokes equations implemented for the numerical simulation of shear flow cases is presented. The computational algorithms include Fourier expansions in the streamwise and spanwise directions, second-order centered finite differences in the direction orthogonal to the solid walls, third-order Runge-Kutta procedure in time in which both convective and diffusive terms are treated explicitly; the fractional step method is used for time marching. Based on the numerical algorithms implemented within the computational solver, three different (MPI based) parallelization strategies are devised. The three schemes are evaluated with particular attention to the impact of the communications onto the whole computational procedure, and one of them is implemented. Computations are executed on two different parallel machines and results are shown in terms of parallel performance. Processes using different number of processors combined with different number of computational grid points are tested.}
}


@Article{ber01:mpi-openmp,
author = {J. Y. Berthou and E. Fayolle},
title = {Comparing {OpenMP}, {HPF}, and {MPI} programming: A study case},
journal = {International Journal of High Performance Computing Applications},
year = 2001,
volume = 15,
number = 3,
pages = {297--309},
abstract = {This paper presents a comparison of three programming models---OpenMP, HPF, and MPI---applied to a diphasic compressible fluid mechanics code. The parallelization analysis is conducted, and the authors also present the experimental results obtained on various platforms: a Compaq Proliant 6000 (4 processors), a Cray T3E-750 (300 processors), an HP Class V (16 processors), an SGI Origin 2000 (32 processors), a cluster of PCs, and a COMPAQ SC 232 (232 processors). These experimental results will be discussed according to the following criteria: efficiency, scalability, maintainability, developing costs, and portability. As a conclusion, the authors present the parallelization strategy recommended for codes comparable to ECOSS.}
}


% NOTE(review): the formulas in the abstract were rebuilt as LaTeX from a
% flattened export (subscripts/superscripts had become parenthesised runs);
% confirm against the published abstract.
@Article{ber01:mpi-alg,
author = {L. Bergamaschi and I. Moret and G. Zilli},
title = {Inexact {Q}uasi-{N}ewton methods for sparse systems of nonlinear equations},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 18,
number = 1,
pages = {41--53},
month = SEP,
abstract = {In this paper, we present the results obtained by solving consistent sparse systems of $n$ nonlinear equations $F(x) = 0$, by a Quasi-Newton method combined with a $p$ block iterative row-projection linear solver of Cimmino type, $1 \le p \ll n$. Under weak regularity conditions for $F$, it is proved that this Inexact Quasi-Newton method has a local, linear convergence in the energy norm induced by the preconditioned matrix $HA$, where $A$ is an initial guess of the Jacobian matrix, and it may converge too superlinearly. The matrix $H = [A_1^+, \ldots, A_i^+, \ldots, A_p^+]$, where $A_i^+ = A_i^T (A_i A_i^T)^{-1}$ is the Moore-Penrose pseudo-inverse of the $m_i \times n$ block $A_i$, is the preconditioner. A simple partitioning of the Jacobian matrix was used for solving a set of nonlinear test problems with sizes ranging from 1024 to 131 072 on the CRAY T3E under the MPI environment.}
}



@Article{neo01:mpi-tool,
author = {N. Neophytou and P. Evripidou},
title = {{Net-dbx}: A web-based debugger of {MPI} programs over low-bandwidth lines},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 9,
pages = {986--995},
month = SEP,
abstract = {This paper describes Net-dbx, a tool that utilizes Java and other World Wide Web tools for the debugging of MPI programs from anywhere in the Internet. Net-dbx is a source-level interactive debugger with the full power of gdb (the GNU Debugger) augmented with the debug functionality of the public-domain MPI implementation environments. The main effort was on a low overhead, yet powerful, graphical interface supported by low-bandwidth connections. The portability of the tool is of great importance as well because it enables the tool to be used on heterogeneous nodes that participate in an MPI multicomputer. Both needs are satisfied a great deal by the use of WWW browsing tools and the Java programming language. The user of our system simply points his/her browser to the Net-dbx page, logs in to the destination system, and starts debugging by interacting with the tool, just as with any GUI environment. The user can dynamically select which MPI processes to view/debug. A special WWW-based environment has been designed and implemented to host the system prototype.}
}


@Article{ree01:mpi-alg,
author = {J. S. Reeve and A. D. Scurr and J. H. Merlin},
title = {Parallel versions of {S}tone's strongly implicit algorithm},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 12,
pages = {1049--1062},
month = OCT,
abstract = {In this paper, we describe various methods of deriving a parallel version of Stone's Strongly Implicit Procedure (SIP) for solving sparse linear equations arising from finite difference approximation to partial differential equations (PDEs). Sequential versions of this algorithm have been very successful in solving semi-conductor, heat conduction and flow simulation problems and an efficient parallel version would enable much larger simulations to be run. An initial investigation of various parallelizing strategies was undertaken using a version of high performance Fortran (HPF) and the best methods were reprogrammed using the MPI message passing libraries for increased efficiency. Early attempts concentrated on developing a parallel version of the characteristic wavefront computation pattern of the existing sequential SIP code. However, a red-black ordering of grid points, similar to that used in parallel versions of the Gauss-Seidel algorithm, is shown to be far more efficient. The results of both the wavefront and red-black MPI based algorithms are reported for various size problems and number of processors on a sixteen node IBM SP2.}
}


% NOTE(review): the abstract's "NO," (twice) was read as a garbled NOx
% subscript and restored as NO$_x$; confirm against the published abstract.
@Article{kre01:mpi-app,
author = {H. Kremer and F. May and S. Wirtz},
title = {The influence of furnace design on the {NO} formation in high temperature processes},
journal = {Energy Conversion and Management},
year = 2001,
volume = 42,
number = {15--17},
pages = {1937--1952},
month = {Oct-Nov},
abstract = {High temperature processes produce high NO$_x$ emissions due to their elevated working temperatures. Strong regulations for emissions of pollutants [1] from industrial plants lead the operators to optimize their furnaces. In this paper a three-dimensional mathematical model for turbulent flow and combustion on the basis of turbulence-chemistry interactions and radiative heat transfer taking into account spectral effects of surrounding walls and combustion gases is described. The transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the message passing interface (MPI) library. Computed distributions of velocity, temperature, species distribution and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NO$_x$ concentration distributions are presented.}
}


% NOTE(review): "for" was added to the title, which read "algorithm all
% nearest smaller values problem"; confirm against the journal record.
@Article{he01:mpi-alg,
author = {X. He and C. H. Huang},
title = {Communication efficient {BSP} algorithm for all nearest smaller values problem},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 10,
pages = {1425--1438},
month = OCT,
abstract = {We present a BSP (Bulk Synchronous Parallel) algorithm for solving the All Nearest Smaller Values Problem (ANSVP), a fundamental problem in both graph theory and computational geometry. Our algorithm achieves optimal sequential computation time and uses only three communication supersteps. In the worst case, each communication phase takes no more than an (n/p + p)-relation, where p is the number of the processors. In addition, our average-case analysis shows that, on random inputs, the expected communication requirements for all three steps are bounded above by a p-relation, which is independent of the problem size n. Experiments have been carried out on an SGI Origin 2000 with 32 R10000 processors and a SUN Enterprise 4000 multiprocessing server supporting 8 UltraSPARC processors, using the MPI libraries. The results clearly demonstrate the communication efficiency and load balancing for computation.}
}


@Article{bea01:mpi-app,
author = {O. Beaumont and V. Boudet and F. Rastello and Y. Robert},
title = {Matrix multiplication on heterogeneous platforms},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 10,
pages = {1033--1051},
month = OCT,
abstract = {In this paper, we address the issue of implementing matrix multiplication on heterogeneous platforms. We target two different classes of heterogeneous computing resources: heterogeneous networks of workstations and collections of heterogeneous clusters. Intuitively, the problem is to load balance the work with different speed resources while minimizing the communication volume. We formally state this problem in a geometric framework and prove its NP-completeness. Next, we introduce a (polynomial) column-based heuristic, which turns out to be very satisfactory: We derive a theoretical performance guarantee for the heuristic and we assess its practical usefulness through MPI experiments.}
}


@Article{ban01:mpi-impl,
author = {M. Banikazemi and R. K. Govindaraju and R. Blackmore and D. K. Panda},
title = {{MPI-LAPI}: An efficient implementation of {MPI} for {IBM} {RS/6000} {SP} systems},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 10,
pages = {1081--1093},
month = OCT,
abstract = {The IBM RS/6000 SP system is one of the most cost-effective commercially available high performance machines. IBM RS/6000 SP systems support the Message Passing Interface standard (MPI) and LAPI. LAPI is a low level, reliable, and efficient one-sided communication API library implemented on IBM RS/6000 SP systems. This paper explains how the high performance of the LAPI library has been exploited in order to implement the MPI standard more efficiently than the existing MPI. It describes how to avoid unnecessary data copies at both the sending and receiving sides for such an implementation. The resolution of problems arising from the mismatches between the requirements of the MPI standard and the features of LAPI is discussed. As a result of this exercise, certain enhancements to LAPI are identified to enable an efficient implementation of MPI on LAPI. The performance of the new implementation of MPI is compared with that of the underlying LAPI itself. The latency (in polling and interrupt modes) and bandwidth of our new implementation is compared with that of the native MPI implementation on RS/6000 SP systems. The results indicate that the MPI implementation on LAPI performs comparably to or better than the original MPI implementation in most cases. Improvements of up to 17.3 percent in polling mode latency, 35.8 percent in interrupt mode latency, and 20.9 percent in bandwidth are obtained for certain message sizes. The implementation of MPI on top of LAPI also outperforms the native MPI implementation for the NAS Parallel Benchmarks.}
}


@Article{liRa01:mpi-app,
author = {M. Z. Li and O. F. Rana and D. W. Walker},
title = {Wrapping {MPI}-based legacy codes as {Java/CORBA} components},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 18,
number = 2,
pages = {213--223},
month = OCT,
abstract = {Techniques for wrapping an MPI-based molecular dynamics (MD) simulation code as Java/CORBA components, for use within a distributed component based problem solving environment (CB-PSE), is presented. A legacy code for simulating a Lennard-Jones fluid is first wrapped as a single CORBA object, followed by division of the code into computational sub-units, where each sub-unit is wrapped as a CORBA object containing MPI calls, and run on a cluster of workstations - enabling different MPI implementations to inter-operate. Using a Java implementation, users can submit simulation tasks through a Web-based interface, without needing to know implementation details of the legacy code, or the exact interaction between sub-units within the code. We provide performance comparisons of wrapping the entire MD code as a single object versus wrapping sub-units within it, and offer a simple performance model to explain our findings.}
}

@Article{beau01:mpi-app,
author = {O. Beaumont and V. Boudet and A. Petitet and F. Rastello and Y. Robert},
title = {A proposal for a heterogeneous cluster {ScaLAPACK} (dense linear solvers)},
journal = {IEEE Transactions on Computers},
year = 2001,
volume = 50,
number = 10,
pages = {1052--1070},
month = OCT,
abstract = {In this paper, we study the implementation of dense linear algebra kernels, such as matrix multiplication or linear system solvers, on heterogeneous networks of workstations. The uniform block-cyclic data distribution scheme commonly used for homogeneous collections of processors limits the performance of these linear algebra kernels on heterogeneous grids to the speed of the slowest processor. We present and study more sophisticated data allocation strategies that balance the load on heterogeneous platforms with respect to the performance of the processors. When targeting unidimensional grids, the load-balancing problem can be solved rather easily. When targeting two-dimensional grids, which are the key to scalability and efficiency for numerical kernels, the problem turns out to be surprisingly difficult. We formally state the 2D load-balancing problem and prove its NP-completeness. Next, we introduce a data allocation heuristic, which turns out to be very satisfactory: Its practical usefulness is demonstrated by MPI experiments conducted with a heterogeneous network of workstations.}
}

@Article{corn01:mpi-app,
author = {C. F. Cornwell and L. T. Wille and Y. G. Wu and F. H. Sklar},
title = {Parallelization of an ecological landscape model by functional decomposition},
journal = {Ecological Modelling},
year = 2001,
volume = 144,
pages = {13--20},
month = OCT,
abstract = {A functional scheme is described to parallelize computer simulations of grid-based ecological landscape models. The method is implemented using the Message Passing Interface protocol and is applied to the Everglades Landscape Vegetation Model. On a two-processor system, the speed-up is satisfactory and the overall performance of the program is competitive with traditional parallelization techniques such as geometrical decomposition. The method is discussed, timing information is provided for three different parallel machines, and some further developments are indicated.}
}


@Article{sama01:mpi-app,
author = {M. Y. Saman and D. J. Evans},
title = {Distributed computing on cluster systems},
journal = {International Journal of Computer Mathematics},
year = 2001,
volume = 78,
number = 3,
pages = {383--397},
abstract = {Message Passing Interface (MPI) allows a group of computers in a network to be specified as a cluster system. It provides the routines for task activation and communication. Writing programs for a cluster system is a difficult job. In this paper, the Message Passing Interface is presented. Parallel programs using the WMPI, a version of MPI, to solve the $\pi$ (pi) calculation, the quick sort algorithm and the Torsion problem are presented. The programs are written and compiled in Microsoft Visual C++.}
}


@Article{raas01:mpi-app,
author = {S. Raasch and M. Schr{\"o}ter},
title = {{PALM}---{A} large-eddy simulation model performing on massively parallel computers},
journal = {Meteorologische Zeitschrift},
year = 2001,
volume = 10,
number = 5,
pages = {363--372},
abstract = {An existing code of a large-eddy simulation (LES) model for the study of turbulent processes in the atmospheric and oceanic boundary layer has been completely recoded for use on massively parallel systems with distributed memory. Parallelization is achieved by two-dimensional domain decomposition and communication is realized by the message passing interface (MPI). Periodic boundary conditions, which are used in both horizontal directions, helped to minimize the parallelization effort. The performance of the new PArallelized LES Model (PALM) is excellent on SGI/Cray-T3E systems and an almost linear speed-up is achieved up to very large numbers of processors. Parallelization strategy and model performance is discussed and validation experiments as well as future applications are presented.}
}

@Article{lu01:mpi-app,
author = {P. Lu},
title = {Integrating bulk-data transfer into the {A}urora distributed shared data system},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 11,
pages = {1609--1632},
month = NOV,
abstract = {The Aurora distributed shared data system implements a shared-data abstraction on distributed-memory platforms, such as clusters, using abstract data types. Aurora programs are written in C++ and instantiate shared-data objects whose data-sharing behaviour can be optimized using a novel technique called scoped behaviour. Each object and each phase of the computation (i.e., use-context) can be independently optimized with per-object and per-context flexibility. Within the scoped behaviour framework, optimizations such as bulk-data transfer can be implemented and made available to the application programmer. Scoped behaviour carries semantic information regarding the specific data-sharing pattern through various layers of software. We describe how the optimizations are integrated from the uppermost application-programmer layers down to the lowest UDP-based layers of the Aurora system. A bulk-data transfer network protocol bypasses some bottlenecks associated with TCP/IP and achieves higher performance on an ATM network than either TreadMarks (distributed shared memory) or MPICH (message passing) for matrix multiplication and parallel sorting.}
}

@Article{brig01:mpi-impl,
author = {R. Brightwell and S. Plimpton},
title = {Scalability and performance of two large {L}inux clusters},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 11,
pages = {1546--1569},
month = NOV,
abstract = {In this paper, we present performance results from several parallel benchmarks and applications on two large Linux clusters at Sandia National Laboratories. We compare the results on the Linux clusters to performance obtained on a traditional distributed-memory massively parallel processing machine, the Intel TeraFLOPS. We discuss the characteristics of these machines that influence the performance results and identify the key components of the system that are important to allow for scalability of commodity-based PC clusters to hundreds and possibly thousands of processors.}
}


@Article{diPi01:mpi-app,
author = {M. Di Pierro},
title = {Matrix distributed processing: a set of {C++} tools for implementing generic lattice computations on parallel systems},
journal = {Computer Physics Communications},
year = 2001,
volume = 141,
number = 1,
pages = {98--148},
month = NOV,
abstract = {We present a set of programming tools (classes and functions written in C++ and based on Message Passing Interface) for fast development of generic parallel (and non-parallel) lattice simulations. They are collectively called MDP 1.2. These programming tools include classes and algorithms for matrices, random number generators, distributed lattices (with arbitrary topology), fields and parallel iterations. No previous knowledge of MPI is required in order to use them. Some applications in electromagnetism, electronics, condensed matter and lattice QCD are presented.}
}

% Parallel IAP 9-layer atmospheric general circulation model (MPI domain decomposition).
% NOTE(review): the key prefix "ahan" does not match the first author (Zhang);
% the key is left untouched so that existing citations keep resolving.
@article{ahan01:mpi-app,
  author   = {X. Zhang and B. Wang and Z. Z. Ji},
  title    = {Performance of a parallel finite difference atmospheric general circulation model},
  journal  = {Advances in Atmospheric Sciences},
  volume   = {18},
  number   = {6},
  pages    = {1175--1184},
  year     = {2001},
  abstract = {A new version of the Institute of Atmospheric Physics (IAP) 9-Layer (9L) atmospheric general circulation model (AGCM) suitable for Massively Parallel Processor (MPP) has been developed. This paper presents the principles of the parallel code design and examines its performance on a variety of state-of-the-art parallel computers in China. Domain decomposition strategy is used to achieve parallelism that is implemented by Message Passing Interface (MPI). Only the one dimensional domain decomposition algorithm is shown to scale favorably as the number of processors is increased.}
}

@Article{boul01:mpi-app,
author = {C. Bouldin and J. Sims and H. Hung and J. J. Rehr and A. L. Ankudinov},
title = {Rapid calculation of x-ray absorption near edge structure using parallel computation},
journal = {X-Ray Spectrometry},
year = 2001,
volume = 30,
number = 6,
pages = {431--434},
month = {Nov.-Dec.},
abstract = {Modeling x-ray absorption near edge structure (XANES) requires computationally intensive calculations. We show that parallel processing can reduce the time required for XANES calculations by a factor of up to 50 over standard desktop computers. Parallel processing is implemented in our codes using the Message Passing Interface (MPI) and is portable across most hardware and operating systems. We demonstrate the inverse scaling of the parallel algorithm with the number of processors, and discuss how this approach to parallel processing could be implemented in other multiple-scattering calculations. Faster calculations should improve the applicability of ab initio XANES studies to many materials science problems.}
}

@Article{behr01:mpi-app,
author = {M. Behr},
title = {Stabilized space-time finite element formulations for free-surface flows},
journal = {Communications in Numerical Methods in Engineering},
year = 2001,
volume = 17,
number = 11,
pages = {813--819},
month = NOV,
abstract = {Aspects of a method for 3D finite element computation of unsteady, incompressible free-surface flow are presented. The approach is based on the deformable-spatial-domain/stabilized space-time (DSD/SST) finite element formulation, which takes automatically into account the deformation of the elements in response to the motion of the free surface. The free-surface elevation is governed by a kinematic free-surface condition, which is also solved with a stabilized formulation. A new governing equation and stabilized formulation is derived for cases where the channel walls are not vertical. The parallel implementation based on MPI message-passing standard is fully portable, and have been demonstrated to be scalable on a range of architectures. A 3D computation of a flow past a spillway of a dam is shown as an example application.}
}


@Article{he02:mpi-app,
author = {F. S. He and H. Wu},
title = {An efficient parallel implementation of the {E}verglades {L}andscape {F}ire {M}odel using checkpointing},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {65--82},
month = JAN,
abstract = {This paper presents a low-communication overhead and high-performance data parallelism implementation of the Everglades Landscape Fire Model (ELFM) in a network of workstations (NOWs). ELFM is parallelized under Message Passing Interface (MPI). Checkpointing and rollback technologies are used to handle the spread of fire which is a dynamic and irregular component of the model. A parallel application model with the mixture of a variety of asynchronous and synchronous computation is developed. In this model, the asynchronous computation is dominant and synchronous computation is intermittent. The length of each synchronous computation also varies. Based on the developed model, a synchronous check-pointing mechanism is used in the parallel ELFM code under MPI. A simulation is conducted and results show that the performance of the ELFM under MPI is significantly enhanced by the application of checkpointing and rollback.}
}

@Article{soda02:mpi-app,
author = {A. C. Sodan},
title = {Applications on a multithreaded architecture: A case study with {EARTH-MANNA}},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {3--33},
month = JAN,
abstract = {Multithreading offers benefits with respect to the formulation of irregular dynamic programs and their dynamic scheduling, load balancing and interaction. Furthermore, low-cost communication on distributed-memory machines by remote-memory access is provided by some systems for efficient communication. EARTH is one of the few systems which combines both, while most other systems either focus on communication or provide multithreading in shared-memory environments. Dynamic irregular applications are often awkward to parallelize on distributed memory when using SPMD style programming via MPI and show different requirements for formulation. In addition, dynamic irregular applications also may show a fairly tight data coupling. Systems like EARTH are beneficial then, because they specifically support large number of small data exchanges by providing short startup times and the tolerance of even small latencies (offering very fine-grain threads). However, static regular applications with tight data coupling are supported too. On the example of EARTH, this paper investigates the benefits of low-cost communication and multithreading, parallelizing three AI applications with medium to high communication intensity. We present experimental results obtained on the MANNA machine.}
}

@Article{wang02:mpi-app,
author = {P. Wang and K. Y. Liu and T. Cwik and R. Green},
title = {{MODTRAN} on supercomputers and parallel computers},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {53--64},
month = JAN,
abstract = {To enable efficient reduction of large data sets such as is done in the Airborne Visible/Infrared Imaging Spectrometer (AVIRIS) project at the Jet Propulsion Laboratory (JPL), a high performance version of MODTRAN is essential. One means to accomplish this is to apply the computational resources of parallel computer systems. In our present work, a flexible, parallel version of MODTRAN has been implemented on the Cray T3E, the HP SPP2000, and a Beowulf-class cluster computer using domain decomposition techniques and the Message Passing Interface (MPI) library. In this paper, porting the sequential MODTRAN to various platforms is discussed; strategies of designing a parallel version of MODTRAN are developed; detailed implementation for a parallel MODTRAN is reported, and performance data of the parallel code on various computers are presented. Near linear scaling performance of parallel MODTRAN has been obtained, and comparisons of wallclock time are made among various supercomputers and parallel computers. The parallel version of MODTRAN gives excellent speedup, which dramatically reduces total data processing time for many applications such as the AVIRIS project at JPL.}
}

@Article{acac02:mpi-impl,
author = {M. Acacio and O. Canovas and J. M. Garcia and P. E. Lopez-de-Teruel},
title = {{MPI-Delphi}: an {MPI} implementation for visual programming environments and heterogeneous computing},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 3,
pages = {317--333},
month = JAN,
abstract = {The goal of a parallel program is to reduce the execution time, compared to the fastest sequential program solving the same problem. Parallel programming is growing due to the widespread use of network of workstations (NOWs) or powerful PCs in high-performance computing. Because the hardware components are all commodity devices, NOWs are much more cost-effective than custom machines with similar technology. In this environment, the typical programming model used has been message-passing and the MPI library has become the standard in the distributed-memory computing model. On the other hand, visual programming environments try to simplify the task of developing applications. They provide programmers with several standard components for creating programs. Delphi constitutes one of the most popular visual programming environments nowadays in the Windows market place. In this paper, we present MPI-Delphi, an implementation of MPI for writing parallel applications using Delphi visual programming environment. We show how MPI-Delphi has been developed, and how it makes possible to manage a cluster of homogeneous/heterogeneous PCs. Two examples of use of MPI-Delphi in a heterogeneous cluster of workstations with a mixture of Windows and Linux operating systems are also included. The MPI-Delphi interface is suitable for some specific kinds of problems, such as monitoring parallel programs of long execution time, or computationally intensive graphical simulations. In addition, MPI-Delphi has proven to be a good tool for research, as the development of new algorithms can be carried out quickly and, therefore, time spent on the debugging of such algorithms is reduced. Finally, we conclude by explaining some of the tasks we think MPI-Delphi is suitable for.}
}

@Article{thak02:mpi-impl,
author = {R. Thakur and W. Gropp and E. Lusk},
title = {Optimizing noncontiguous accesses in {MPI-IO}},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {83--105},
month = JAN,
abstract = {The I/O access patterns of many parallel applications consist of accesses to a large number of small, noncontiguous pieces of data. If an application's I/O needs are met by making many small, distinct I/O requests, however, the I/O performance degrades drastically. To avoid this problem, MPI-IO allows users to access noncontiguous data with a single I/O function call, unlike in Unix I/O. In this paper, we explain how critical this feature of MPI-IO is for high performance and how it enables implementations to perform optimizations. We first provide a classification of the different ways of expressing an application's I/O needs in MPI-IO - we classify them into four levels, called levels 0-3. We demonstrate that, for applications with noncontiguous access patterns, the I/O performance improves dramatically if users write their applications to make level-3 requests (noncontiguous, collective) rather than level-0 requests (Unix style). We then describe how our MPI-IO implementation, ROMIO, delivers high performance for noncontiguous requests. We explain in detail the two key optimizations ROMIO performs: data sieving for noncontiguous requests from one process and collective I/O for noncontiguous requests from multiple processes. We describe how we have implemented these optimizations portably on multiple machines and file systems, controlled their memory requirements, and also achieved high performance. We demonstrate the performance and portability with performance results for three applications - an astrophysics-application template (DIST3D), the NAS BTIO benchmark, and an unstructured code (UNSTRUC) - on five different parallel machines: HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, and SGI Origin2000.}
}

@Article{hell02:mpi-impl,
author = {H. Hellwagner and M. Ohlenroth},
title = {{VI} architecture communication features and performance on the {G}iganet cluster {LAN}},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 3,
pages = {421--433},
month = JAN,
abstract = {The virtual interface (VI) architecture standard was developed to satisfy the need for a high throughput, low latency communication system required for cluster computing. VI architecture aims to close the performance gap between the bandwidths and latencies provided by the communication hardware and visible to the application, respectively, by minimizing the software overhead on the critical path of the communication. This paper presents the results of a performance study of one VI architecture hardware implementation, the Giganet cLAN (cluster LAN). The focus of the study is to assess and compare the performance of different VI architecture data transfer modes and specific features that are available to higher-level communication software like MPI in order to aid the implementor to decide which VI architecture options to employ for various communication scenarios. Examples of such options include the use of send/receive vs. RDMA data transfers, polling vs. blocking to check completion of communication operations, multiple VIs, completion queues and scatter capabilities of VI architecture.}
}

@Article{liLi01:mpi-app,
author = {Y. M. Li and J. L. Liu and T. S. Chao and S. M. Sze},
title = {A new parallel adaptive finite volume method for the numerical simulation of semiconductor devices},
journal = {Computer Physics Communications},
year = 2001,
volume = 142,
number = {1--3},
pages = {285--289},
month = DEC,
abstract = {Based on adaptive finite volume approximation, a posteriori error estimation, and monotone iteration, a novel system is proposed for parallel simulations of semiconductor devices. The system has two distinct parallel algorithms to perform a complete set of I-V simulations for any specific device model. The first algorithm is a domain decomposition on 1-irregular unstructured meshes whereas the second is a parallelization of multiple I-V points. Implemented on a Linux cluster using message passing interface libraries, both algorithms are shown to have excellent balances on dynamic loading and hence result in efficient speedup. Compared with measurement data, computational results of sub-micron MOSFET devices are given to demonstrate the accuracy and efficiency of the system.}
}

@Article{iovi01:mpi-app,
author = {M. Iovieno and C. Cavazzoni and D. Tordella},
title = {A new technique for a parallel dealiased pseudospectral {N}avier-{S}tokes code},
journal = {Computer Physics Communications},
year = 2001,
volume = 141,
number = 3,
pages = {365--374},
month = DEC,
abstract = {A novel aspect of a parallel procedure for the numerical simulation of the solution of the Navier-Stokes equations through the Fourier-Galerkin pseudospectral method is presented. It consists of a dealiased ("3/2" rule) transposition of the data that organizes the computations in the distributed direction in such a way that whenever a Fast Fourier Transform must be calculated, the algorithm will employ data stored solely on the proper memory of the processor which is computing it. This provides for the employment of standard routines for the computations of the Fourier transform. The aliasing removal procedure has been directly inserted into the transposition algorithm. The code is written for distributed memory computers, but not specifically for a peculiar architecture. The use on a variety of machines is allowed by the adoption of the Message Passing Interface library. The portability of the code is demonstrated by the similar performances, in particular the high efficiency, that all the machines tested show up to a number of parallel processors equal to 1/2 the truncation parameter N/2. Explicit time integration is used. The present code organization is relevant to physical and mathematical problems which require a three dimensional spectral treatment.}
}

@Article{kepk01:mpi-app,
author = {A. Kepkep and U. Ravaioli and B. Winstead},
title = {Cluster-based parallel 3-{D} {M}onte {C}arlo device simulation},
journal = {VLSI Design},
year = 2001,
volume = 13,
number = {1--4},
pages = {51--56},
abstract = {The recent improvements in the performance of commodity computer have created very favorable conditions for building high performance parallel machines from computer clusters. These are very attractive for 3-D device simulation, necessary to model properly carrier-carrier interaction and granular doping effects in deeply scaled silicon devices. We have developed a parallel 3-D Monte Carlo simulation environment customized for clusters using the Message Passing Library (MPI). The code has been tested on the supercluster of NCSA at the University of Illinois. We present here test results for an n-i-n diode structure, along with an analysis of performance for two different domain decomposition schemes.}
}

@Article{beck02:mpi-app,
author = {M. Be{\v{c}}ka and G. Ok{\v{s}}a and M. Vajter{\v{s}}ic},
title = {Dynamic ordering for a parallel block-{J}acobi {SVD} algorithm},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {243--262},
month = FEB,
abstract = {A new approach for the parallel computation of singular value decomposition (SVD) of matrix $A \in C^{m \times n}$ is proposed. Contrary to the known algorithms that use a static cyclic ordering of subproblems simultaneously solved in one iteration step, the proposed implementation of the two-sided block-Jacobi method uses a dynamic ordering of subproblems. The dynamic ordering takes into account the actual status of matrix A. In each iteration step, a set of the off-diagonal blocks is determined that reduces the Frobenius norm of the off-diagonal elements of A as much as possible and, at the same time, can be annihilated concurrently. The solution of this task is equivalent to the solution of the maximum-weight perfect matching problem. The greedy algorithm for the efficient solution of this problem is presented. The computational experiments with both types of ordering, incorporated into the two-sided block-Jacobi method, were performed on an SGI - Cray Origin 2000 parallel computer using the Message Passing Interface (MPI). The results confirm that the dynamic ordering is much more efficient with regard to the amount of work required for the computation of SVD of a given accuracy than the static cyclic ordering.}
}

@Article{lian02:mpi-app,
author = {Y. Liang and J. Weston and M. Szularz},
title = {Generalized least-squares polynomial preconditioners for symmetric indefinite linear equations},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {323--341},
month = FEB,
abstract = {Polynomial preconditioners are frequently used in a parallel environment for the computation of the solution of large-scale sparse linear equations (Ax = b) because of their easy implementation and trivial parallelization. With respect to symmetrical indefinite (SID) linear systems, the use of generalized least-squares (GLS) polynomial preconditioning is preferable to other polynomial preconditioning methods because of the ability to use a three-term recurrence relationship and the low implementation costs. The GLS preconditioning polynomial and its influence on the flexible generalized minimized residual (FGMRES) solver are discussed in this paper. The orthogonal polynomials required in the solution of the least-squares approximation problem are constructed using the Stieltjes procedure in multiple disjoint intervals which exclude the origin. The time-consuming numerical integration associated with this procedure is computed efficiently using Chebyshev polynomials of the first kind and the GLS polynomial preconditioned FGMRES algorithm is implemented using MPI in a highly parallel IBM SP2 environment. Experimental results using classical benchmark systems are presented and compared with those obtained using the recently developed SPAI preconditioned Bi-CGSTAB iterative method. The performance of the GLS preconditioned FGMRES solver is critically assessed.}
}

@Article{beka02:mpi-app,
author = {C. Bekas and E. Gallopoulos},
title = {Parallel computation of pseudospectra by fast descent},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {223--242},
month = FEB,
abstract = {The pseudospectrum descent method (PsDM) is proposed, a new parallel method for the computation of pseudospectra. The idea behind the method is to use points from an already existing pseudospectrum level curve $\partial A(\epsilon)$, to generate in parallel the points of a new level curve $\partial A(\delta)$ such that $\delta < \epsilon$. This process can be continued for several steps to approximate several pseudospectrum level curves lying inside the original curve. It is showed via theoretical analysis and experimental evidence that PsDM is embarrassingly parallel, like GRID, and that it adjusts to the geometric characteristics of the pseudospectrum; in particular it captures disconnected components. Results obtained on a parallel system using MPI validate the theoretical analysis and demonstrate interesting load-balancing issues.}
}

@Article{jian02:mpi-app,
author = {D. Jiang and W. Meleis and M. El-Shenawee and E. Mizan and A. Ashouei and C. Rappaport},
title = {Parallel implementation of the steepest descent fast multipole method ({SDFMM}) on a {B}eowulf cluster for subsurface sensing applications},
journal = {IEEE Microwave and Wireless Components Letters},
year = 2002,
volume = 12,
number = 1,
pages = {24--26},
month = JAN,
abstract = {We present the parallel, MPI-based implementation of the SDFMM computer code using a thirty two-node Intel Pentium-based Beowulf cluster. The SDFMM is a fast algorithm that is a hybridization of the method of moments (MoMs), the fast multipole method (FMM), and the steepest descent integration path (SDP), which is used to solve large-scale linear systems of equations produced in electromagnetic scattering problems. An overall speedup of 7.2 has been achieved on the 32-processor Beowulf cluster and a significant reduced runtime is achieved on the 4-processor 667 MHz Alpha workstation.}
}


% Parallel top-down data cube construction, implemented in C++ with MPI and LEDA.
@article{dehn02:mpi-app,
  author   = {F. Dehne and T. Eavis and S. Hambrusch and A. Rau-Chaplin},
  title    = {Parallelizing the data cube},
  journal  = {Distributed and Parallel Databases},
  volume   = {11},
  number   = {2},
  pages    = {181--201},
  month    = mar,
  year     = {2002},
  abstract = {We have implemented our parallel top-down data cube construction method in C++ with the MPI message passing library for communication and the LEDA library for the required graph algorithms. We tested our code on an eight processor cluster, using a variety of different data sets with a range of sizes, dimensions, density, and skew. Comparison tests were performed on a SunFire 6800. The tests show that our partitioning strategies generate a close to optimal load balance between processors. The actual run times observed show an optimal speedup of p.}
}

@Article{dewa02:mpi-app,
author = {Y. K. Dewaraja and M. Ljungberg and A. Majumdar and A. Bose and K. F. Koral},
title = {A parallel {M}onte {C}arlo code for planar and {SPECT} imaging: implementation, verification and applications in {I-131 SPECT}},
journal = {Computer Methods and Programs in Biomedicine},
year = 2002,
volume = 67,
number = 2,
pages = {115--124},
month = FEB,
abstract = {This paper reports the implementation of the SIMIND Monte Carlo code on an IBM SP2 distributed memory parallel computer. Basic aspects of running Monte Carlo particle transport calculations on parallel architectures are described. Our parallelization is based on equally partitioning photons among the processors and uses the Message Passing Interface (MPI) library for interprocessor communication and the Scalable Parallel Random Number Generator (SPRNG) to generate uncorrelated random number streams. These parallelization techniques are also applicable to other distributed memory architectures. A linear increase in computing speed with the number of processors is demonstrated for up to 32 processors. This speed-up is especially significant in Single Photon Emission Computed Tomography (SPECT) simulations involving higher energy photon emitters, where explicit modeling of the phantom and collimator is required. For I-131, the accuracy of the parallel code is demonstrated by comparing simulated and experimental SPECT images from a heart/thorax phantom. Clinically realistic SPECT simulations using the voxel-man phantom are carried out to assess scatter and attenuation correction.}
}

% Instrumentation and tuning study of the networked dataView application (geometry computed with MPI).
@article{slot02:mpi-app,
  author   = {J. Slottow and A. Shahriari and M. Stein and X. Chen and C. Thomas and P. B. Ender},
  title    = {Instrumenting and tuning {dataView} - a networked application for navigating through large scientific datasets},
  journal  = {Software-Practice \& Experience},
  volume   = {32},
  number   = {2},
  pages    = {165--190},
  month    = feb,
  year     = {2002},
  abstract = {This paper describes how we instrumented and tuned the code for improved performance in a networked environment. We report on how we measured network performance, first by inducing network delay and then by running the dataView client component in Washington DC and the compute components in Los Angeles. We report on the effect that tile size, level of detail, and client CPU speed have on performance. We analyze what happens when the geometry computation is performed in parallel using MPI (Message Passing Interface) vs. in serial, and discuss the effect on performance of adding additional computational nodes. }
}

@Article{shan02:mpi-openmp,
author = {H. Z. Shan and J. P. Singh and L. Oliker and R. Biswas},
title = {A comparison of three programming models for adaptive applications on the {Origin2000}},
journal = {Journal of Parallel and Distributed Computing},
year = 2002,
volume = 62,
number = 2,
pages = {241--266},
month = FEB,
abstract = {Adaptive applications have computational workloads and communication patterns that change unpredictably at runtime, requiring dynamic load balancing to achieve scalable performance on parallel machines. Efficient parallel implementations of such adaptive applications is therefore a challenging task. In this paper, we compare the performance of and the programming effort required for two major classes of adaptive applications under three leading parallel programming models on an SGI Origin2000 system, a machine that supports all three models efficiently. Results indicate that the three models deliver comparable performance; however, the implementations differ significantly beyond merely using explicit messages versus implicit loads/stores even though the basic parallel algorithms are similar. Compared with the message-passing (using MPI) and SHMEM programming models, the cache-coherent shared address space (CC-SAS) model provides substantial ease of programming at both the conceptual and program orchestration levels, often accompanied by performance gains. However, CC-SAS currently has portability limitations and may suffer from poor spatial locality of physically distributed shared data on large numbers of processors.}
}

@Article{tan02:mpi-app,
author = {C. J. K. Tan},
title = {Solving systems of linear equations with relaxed {M}onte {C}arlo method},
journal = {Journal of Supercomputing},
year = 2002,
volume = 22,
number = 1,
pages = {111--123},
month = MAY,
abstract = {The problem of solving systems of linear algebraic equations by parallel Monte Carlo numerical methods is considered. A parallel Monte Carlo method with relaxation is presented. This is a report of a research in progress, showing the effectiveness of this algorithm. Theoretical justification of this algorithm and numerical experiments are presented. The algorithms were implemented on a cluster of workstations using MPI.}
}

@Article{chen01:mpi-app,
author = {D. Chen and T. Aoki and N. Homma and T. Higuchi},
title = {Pragmatic method for the design of fast constant-coefficient combinational multipliers},
journal = {IEE Proceedings-Computers and Digital Techniques},
year = 2001,
volume = 148,
number = 6,
pages = {196--206},
month = NOV,
abstract = {To characterise and analyse the performance of evolutionary graph generation (EGG) on a cluster of PCs, a parallel version of the EGG system, called the distributed EGG (DEGG) system, has been developed using a message-passing interface (MPI). To demonstrate the capability of DEGG, it is applied to find the optimal design of various multipliers. Experimental results substantially clarify that the DEGG system consistently performs better than the EGG system. Moreover, the ability and solution quality of the DEGG system's search can be further enhanced by the use of the self-adaptation mechanism of operator probabilities.}
}

% NOTE(review): the initials "C. D." / "R. D." with surname "Marcos" look like a
% database abbreviation of a compound surname; verify the author names against
% the published paper before relying on them.
@Article{marc02:mpi-app,
author = {C. D. Marcos and P. Barge and R. D. Marcos},
title = {Dust dynamics in protoplanetary disks: Parallel computing with {PVM}},
journal = {Journal of Computational Physics},
year = 2002,
volume = 176,
number = 2,
pages = {276--294},
month = MAR,
abstract = {We describe a parallel version of our high-order-accuracy particle-mesh code for the simulation of collisionless protoplanetary disks. We use this code to carry out a massively parallel, two-dimensional, time-dependent, numerical simulation, which includes dust particles, to study the potential role of large-scale, gaseous vortices in protoplanetary disks. This noncollisional problem is easy to parallelize on message-passing multicomputer architectures. We performed the simulations on a cache-coherent nonuniform memory access Origin 2000 machine, using both the parallel virtual machine (PVM) and message-passing interface (MPI) message-passing libraries. Our performance analysis suggests that, for our problem, PVM is about 25\% faster than MPI. Using PVM and MPI made it possible to reduce CPU time and increase code performance. This allows for simulations with a large number of particles ($N \sim 10^{5}$--$10^{6}$) in reasonable CPU times. The performances of our implementation of the parallel code on an Origin 2000 supercomputer are presented and discussed. They exhibit very good speedup behavior and low load unbalancing. Our results confirm that giant gaseous vortices can play a dominant role in giant planet formation.}
}

@Article{ozyo02:mpi-app,
author = {Y. Ozyoruk},
title = {Parallel computation of forward radiated noise of ducted fans with acoustic treatment},
journal = {AIAA Journal},
year = 2002,
volume = 40,
number = 3,
pages = {450--455},
month = MAR,
abstract = {Forward radiated noise of ducted fans is computed numerically on parallel processors solving the three-dimensional, time-dependent Euler equations in body-conformed coordinates with a fourth-order-accurate, finite-difference, Runge-Kutta time-integration scheme. Sound attenuation effects of inlet wall acoustic treatment are included in computations employing a time-discrete form of the standard impedance condition. A distributed computing approach with domain decomposition is used for integrating the equations in parallel using the message passing interface library routines. The abilities of the method are demonstrated with hard- and soft-wall simulations of the JT15D inlet, including flow effects.}
}

% MPI implementation paper from IPPS'97: low-latency MPI on Meiko CS/2 and ATM clusters.
% The address field holds the location given with the proceedings, as in the original entry.
@InProceedings{Jones97,
author = "Chris R. Jones and Ambuj K. Singh and Divyakant Agrawal",
title = "Low Latency {MPI} for Meiko {CS}/2 and {ATM} Clusters",
booktitle = "Proceedings of the 11th International Parallel Processing Symposium (IPPS'97)",
publisher = "The Institute of Electrical and Electronics Engineers",
address = "Geneva, Switzerland",
month = apr,
year = "1997",
keywords = "CD-ROM, I/O and Message Passing",
abstract = "Contains a good overview of existing MPI implementations. Uses a Direct Memory Access method. In order to minimize latency: overlap the transfer of data and send envelope. And this only if the message size is above a certain threshold. First, sending and match envelopes, then DMA.",
}

% IPPS'96 paper on a native ATM application programmer interface testbed for cluster computing.
% The address field holds the location given with the proceedings (Honolulu, Hawaii).
@InProceedings{Dowd96,
author = "P. W. Dowd and T. M. Carrozzi and F. A. Pellegrino and A. X. Chen",
title = "Native {ATM} Application Programmer Interface Testbed for Cluster-Based Computing",
booktitle = "Proc. 10th Int. Parallel Processing Symp. (IPPS'96) CD-ROM",
publisher = "IEEE",
address = "Honolulu, HI",
month = apr,
year = "1996",
keywords = "Clusters and Domain Decomposition",
}

% Cotronis, message-passing applications on MPICH under Ensemble (LNCS volume 1497).
% NOTE(review): the ending page is recorded as unknown ("??"); fill it in once verified.
% NOTE(review): ack-nhfb is a string macro, presumably defined elsewhere in this file -- confirm.
@Article{Cotronis:1998:DMA,
author = {Y. Cotronis},
title = {Developing Message-Passing Applications on {MPICH} under Ensemble},
journal = {Lecture Notes in Computer Science},
year = 1998,
volume = 1497,
pages = {145--??},
ISSN = {0302-9743},
coden = {LNCSD9},
bibdate = {Tue Jan 5 08:21:58 MST 1999},
acknowledgement = ack-nhfb,
}

@InProceedings{Roy:2000:MGQ,
author = "Alain J. Roy and Ian Foster and William Gropp and Nicholas Karonis and Volker Sander and Brian Toonen",
title = "{MPICH-GQ}: Quality-of-Service for Message Passing Programs",
editor = "{ACM}",
booktitle = "{SC2000}: High Performance Networking and Computing. Dallas Convention Center, Dallas, {TX}, {USA}, November 4--10, 2000",
publisher = "ACM Press and IEEE Computer Society Press",
address = "New York, NY 10036, USA and 1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA",
pages = "54--54",
year = "2000",
bibdate = "Mon Feb 12 11:57:43 2001",
url = "