% Papers using MPI
%
% This is a partial list, begun in late October, 1997. It is intended to give
% an example of the range of applications that are known to be using
% MPI.
% Note that some author lists are incomplete; if you have a more
% complete reference, please send it to gropp mcs.anl.gov .



% Month range is built from the standard month macros so the style controls its rendering.
@Article{CooFinTseYor97:mpi-groups,
  author   = {Cooperman, G. and Finkelstein, L. and Tselman, M. and York, B.},
  title    = {Constructing permutation representations for matrix groups},
  journal  = {Journal of Symbolic Computation},
  year     = {1997},
  volume   = {24},
  number   = {3--4},
  month    = sep # "--" # oct,
  pages    = {471--488},
  abstract = {The theory has been successfully tested on a representation of the sporadic simple group Ly, discovered by Lyons (1972). With no a priori assumptions, we find a permutation representation of degree 9606125 on a conjugacy class of subgroups of order 3, find the order of the resulting permutation group, and verify simplicity. A Monte Carlo variation of the algorithm was used to achieve better space and time efficiency. The construction of the permutation representation required four CPU days on a SPARC-server 670MP with 64 MB. The permutation representation was used implicitly in the sense that the group element was stored as a matrix, and its permutation action on a ``point'' was determined using a pre-computed data structure. Thus, additional computations required little additional space. The algorithm has also been implemented using the MasPar MP-1 SIMD parallel computer and 8 SPARC-2's running under MPI. The results of those parallel experiments are briefly reviewed.}
}


@Article{AhuLon97:mpi-rk-scattering,
  author   = {Ahuja, V. and Long, L. N.},
  title    = {A parallel finite-volume {Runge-Kutta} algorithm for electromagnetic scattering},
  journal  = {Journal of Computational Physics},
  year     = {1997},
  volume   = {137},
  number   = {2},
  month    = nov,
  pages    = {299--320},
  abstract = {A 3D explicit finite volume algorithm has been developed to simulate scattering from complex geometries on parallel computers using structured body conformal curvilinear grids. Most simulations for practical 3D geometries require a large number of grid points for adequate spatial resolution making them suitable to parallel computation. The simulations have been carried out using a multi-block/zonal approach in the message passing paradigm on the SP-2. Each zone is placed on a separate processor and interprocessor communication is carried out using the Message Passing Library/Interface (MPL/MPI). Integration of Maxwell's equations is performed using the four-stage Runge-Kutta time integration method on a dual grid. This method of integrating on a staggered grid gives enhanced dissipative and dispersive characteristics. A scattered field formulation has been used and the Liao boundary condition is used at the outer nonreflecting boundary. The far zone transformation has also been implemented efficiently, using specialized MPL functions to evaluate the far zone scattering results. Results show extremely good comparisons for scattering from the sphere and the ogive with the exact solution and standard FDTD type algorithms. Comparisons for nonaxisymmetric targets like the NASA almond with experimental data has also been found to be extremely good.}
}

@Article{GorBi98,
  author   = {Gorlatch, S. and Bischof, H.},
  title    = {A Generic {MPI} Implementation for a Data-Parallel Skeleton: Formal Derivation and Application to {FFT}},
  journal  = {Parallel Processing Letters},
  year     = {1998},
  volume   = {8},
  number   = {4},
  month    = dec,
  pages    = {447--458},
  abstract = {We derive a provably correct, architecture-independent family of parallel implementations for a class of data-parallel algorithms, called DH (distributable homomorphisms). The implementations are well-structured SPMD programs with group-wise personalized all-to-all exchange, directly realizable in MPI. As a case study, we systematically adjust the mathematical specification of the Fast Fourier Transform (FFT) to the DH format and, thereby, obtain a generic SPMD implementation for FFT. The target program includes FFT solutions used in practice -- the binary-exchange and the 2D- and 3D-transpose -- as special cases.}
}
% Month range is built from the standard month macros so the style controls its rendering.
@Article{YevCinZhu98:mpi-groundwatersim,
  author   = {Yevi, G. and Cinnella, P. and Zhuang, X.},
  title    = {On parallelizing a groundwater pollution simulator},
  journal  = {Applied Mathematics and Computation},
  year     = {1998},
  volume   = {89},
  number   = {1--3},
  month    = jan # "--" # feb,
  pages    = {313--325},
  abstract = {Domain decomposition strategies and computational mesh reordering are discussed for finite difference parallel simulations of groundwater contaminants transport. The parallel performance of point iterative methods traditionally used in groundwater pollution modelling is studied. The algorithms were implemented with red-black and wavefront reordering of the computational mesh. A standard conservative transport equation defined on a two-dimensional grid with Dirichlet boundary conditions was used for the analysis. Completely portable multiple instructions multiple data (MIMD) implementations of the algorithm were performed using message-passing interface (MPI). The runtimes of the algorithms are presented as a function of grid refinement and number of processors, and the communication overhead of the parallel simulation process is investigated, showing that the red-black reordering technique yields the best performance results. The method also provides higher efficiency and scalability when applied to large-scale problems. Optimal parameters are suggested for parallel simulation of groundwater pollution using finite difference schemes.}
}



@Article{Ian97:mpi-reducescatter,
  author   = {Iannello, G.},
  title    = {Efficient algorithms for the reduce-scatter operation in {LogGP}},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {1997},
  volume   = {8},
  number   = {9},
  month    = sep,
  pages    = {970--982},
  abstract = {We consider the problem of efficiently performing a reduce-scatter operation in a message passing system. Reduce-scatter is the composition of an element-wise reduction on vectors of n elements initially held by n processors, with a scatter of the resulting vector among the processors. In this paper, we present two algorithms for the reduce-scatter operation, designed in LogGP. The first algorithm assumes an associative and commutative reduction operator and it is optimal in LogGP within a small constant factor. The second algorithm allows the reduction operator to be noncommutative, and it is asymptotically optimal when values to be combined are large arrays. To achieve these results, we developed a complete analysis of both algorithms in LogGP, including the derivation of lower bounds for the reduce-scatter operation, and the study of the m-item version of the problem, i.e., the case when the initial elements are vectors themselves. Reduce-scatter has been included as a collective operation in the MPI standard message passing library, and can be used, for instance, in parallel matrix-vector multiply when the matrix is decomposed by columns. To model a message passing system, we adopted the LogGP model, an extension of LogP that allows the modeling of messages of different length. While this choice makes the analysis somewhat more complex, it leads to more realistic results in the case of gather/scatter algorithms.}
}



@Article{YuaSalBalMel97:mpi-load-balancing,
  author   = {Yuan, X. and Salisbury, G. and Balsara, D. and Melhem, R.},
  title    = {A load balancing package on distributed memory systems and its application to particle-particle particle-mesh ({P3M}) methods},
  journal  = {Parallel Computing},
  year     = {1997},
  volume   = {23},
  number   = {10},
  month    = nov,
  pages    = {1525--1544},
  abstract = {We present a tool, Bisect, for balanced decomposition of spatial domains. In addition to applying a nested bisection algorithm to determine the boundaries of each subdomain, Bisect replicates a user specified zone along the boundaries of the subdomain in order to minimize future interactions between subdomains. Results of running the tool on the Cray T3D system using both shared memory operations and MPI communications are reported and discussed. In addition, Bisect is used in a parallel implementation of a particle-particle/particle-mesh (P3M) simulation program on the Cray T3D system. The performance of the P3M program with different load-balancing criteria is evaluated and compared. The results show that the use of the Bisect package balances the load efficiently and minimizes communication on the T3D massively parallel system.}
}


@Article{FosKohKriCho97:mpi-task-parallel,
  author   = {Foster, I. and Kohr, D. R. and Krishnaiyer, R. and Choudhary, A.},
  title    = {A library-based approach to task parallelism in a data-parallel language},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = {1997},
  volume   = {45},
  number   = {2},
  month    = sep,
  pages    = {148--158},
  abstract = {Pure data-parallel languages such as High Performance Fortran version 1 (HPF) do not allow efficient expression of mixed task/data-parallel computations or the coupling of separately compiled data-parallel modules. In this paper, we show how these common parallel program structures can be represented, with only minor extensions to the HPF model, by using a coordination library based on the Message Passing Interface (MPI). This library allows data-parallel tasks to exchange distributed data structures using calls to simple communication functions. We present microbenchmark results that characterize the performance of this library and that quantify the impact of optimizations that allow reuse of communication schedules in common situations. In addition, results from two-dimensional FFT, convolution, and multiblock programs demonstrate that the HPF/MPI library can provide performance superior to that of pure HPF. We conclude that this synergistic combination of two parallel programming standards represents a useful approach to task parallelism in a data-parallel framework, increasing the range of problems addressable in HPF without requiring complex compiler technology.}
}


@Article{BruGehRei97:mpi-resource-mgmt,
  author   = {Brune, M. and Gehring, J. and Reinefeld, A.},
  title    = {Heterogeneous message passing and a link to resource management},
  journal  = {Journal of Supercomputing},
  year     = {1997},
  volume   = {11},
  number   = {4},
  pages    = {355--369},
  abstract = {PLUS is a light-weight, extensible and efficient communication interface. With only four commands, PLUS is almost transparent to the application code. Our current implementation supports inter-process communication between PVM, MPI and PARIX, but it can be easily extended to other vendor-specific message passing libraries. As PLUS has been designed for wide area networks, much effort has been spent on portability and on optimizing the communication speed across internet and also intranet links.}
}


@Article{Hor97,
  author   = {Hori, K.},
  title    = {Supercomputer {SX-4} multinode system},
  journal  = {NEC Research \& Development},
  year     = {1997},
  volume   = {38},
  number   = {4},
  pages    = {461--473},
  abstract = {The NEC supercomputer SX-4 multinode system series consists of two models, one being HIPPI (High Performance Parallel Interface)-connected model and the other IXS (Internode Crossbar Switch)-connected model. With the IXS, a proprietary high-speed crossbar switch, the HPC (High Performance Computing) up to 1 TFLOPS (Tera Flops) has been enabled by providing the most comprehensive environment for distributed parallel processing. This also means the world's first implementation of a clustered parallel processing. In this paper, we describe the functions of IXS hardware, the new operating system functions, MPI/SX the MPI (Message Passing Interface) processor and NQS/MPI which supports the close cooperation between NQS (Network Queuing System) batch processing system and MPI.}
}


@Article{Fac97:mpi-load-balance,
  author   = {Fachat, A. and Hoffmann, K. H.},
  title    = {Implementation of ensemble-based simulated annealing with dynamic load balancing under {MPI}},
  journal  = {Computer Physics Communications},
  year     = {1997},
  volume   = {107},
  number   = {1--3},
  month    = dec,
  pages    = {49--53},
  abstract = {This paper describes an implementation of Ensemble Based Simulated Annealing (EBSA) with dynamic load balancing. It is running under the MPI Message Passing Library allowing parallel execution on various types of computers. The load balancing is used to get maximum use of the available processing power, even on heterogeneous workstation clusters where the machines differ a lot in computing power.}
}


@Article{BarHau98:mpi-app,
  author   = {Baron, E. and Hauschildt, P. H.},
  title    = {Parallel implementation of the {PHOENIX} generalized stellar atmosphere program. {II}. {Wavelength} parallelization},
  journal  = {Astrophysical Journal},
  year     = {1998},
  volume   = {495},
  number   = {1 part 1},
  month    = mar,
  pages    = {370--376},
  abstract = {We describe an important addition to the parallel implementation of our generalized nonlocal thermodynamic equilibrium (NLTE) stellar atmosphere and radiative transfer computer program PHOENIX. In a previous paper in this series we described data and task parallel algorithms we have developed for radiative transfer, spectral line opacity, and NLTE opacity and rate calculations. These algorithms divided the work spatially or by spectral lines, that is, distributing the radial zones, individual spectral lines, or characteristic rays among different processors and employ, in addition, task parallelism for logically independent functions (such as atomic and molecular line opacities). For finite, monotonic velocity fields, the radiative transfer equation is an initial value problem in wavelength, and hence each wavelength point depends upon the previous one. However, for sophisticated NLTE models of both static and moving atmospheres needed to accurately describe, e.g., novae and supernovae, the number of wavelength points is very large (200,000-300,000) and hence parallelization over wavelength can lead both to considerable speedup in calculation time and the ability to make use of the aggregate memory available on massively parallel supercomputers. Here, we describe an implementation of a pipelined design for the wavelength parallelization of PHOENIX, where the necessary data from the processor working on a previous wavelength point is sent to the processor working on the succeeding wavelength point as soon as it is known. Our implementation uses a MIMD design based on a relatively small number of standard message passing interface (MPI) library calls and is fully portable between serial and parallel computers.}
}

@Article{Yas98:complex-flows,
  author   = {Yasar, O.},
  title    = {A scalable model for complex flows},
  journal  = {Computers and Mathematics with Applications},
  year     = {1998},
  volume   = {35},
  number   = {7},
  month    = apr,
  pages    = {117--128},
  abstract = {We describe a scalable parallel algorithm for numerical simulations of turbulent, radiative, magnetized, and reactive fluid + particle systems on message-passing distributed-memory computers. Accurate simulation of such complex flows has applications in engine combustion, industrial pulverized coal burners, astrophysics, inertial confinement fusion, nuclear systems, and many other strategically and economically important areas. Our algorithm has been developed based on a widely-used combustion code KIVA-3, a plasma and radiation hydrodynamics code R-MHD, a classical particle dynamics code CMDT, and a discrete ordinates particle transport code TORT. The development is being done on the Intel Paragon with PVM and MPI extensions. We report high levels of parallel efficiency and scalability (up to 1024 nodes) for a baseline engine test case, using our current message-passing reactive and turbulent flow code. The three-dimensional extension of radiation magnetohydrodynamics component is still being worked at and we hope to report further progress in the future.}
}


@Article{LepSchHei98:reactive-flow,
  author   = {Lepper, J. and Schnell, U. and Hein, K. R. G.},
  title    = {Parallelization of a simulation code for reactive flows on the {Intel Paragon}},
  journal  = {Computers and Mathematics with Applications},
  year     = {1998},
  volume   = {35},
  number   = {7},
  month    = apr,
  pages    = {101--109},
  abstract = {The paper shows the implementation of a 3D simulation code for turbulent flow and combustion processes in full-scale utility boilers on an Intel Paragon XP/S computer. For the portable parallelization, an explicit approach is chosen using a domain decomposition method for the static subdivision of the numerical grid together with the SPMD programming model. The measured speedup for the presented case using a coarse grid is good, although some numerical requirements restrict the implemented message passing to strongly synchronized communication. On the Paragon, the NX message passing library is used for the computations. Furthermore, MPI and PVM are applied and their pros and cons on this computer are described. In addition to the basic message passing techniques for local and global communication, other possibilities are investigated. Besides the applicability of the vectorizing capability of the compiler, the influence of the I/O performance during computations is demonstrated. The scalability of the parallel application is presented for a refined discretization.}
}


@Article{Gor98:fft,
  author   = {Gorlatch, S.},
  title    = {Programming with divide-and-conquer skeletons: A case study of {FFT}},
  journal  = {Journal of Supercomputing},
  year     = {1998},
  volume   = {12},
  number   = {1--2},
  pages    = {85--97},
}

@Article{Hio98:qcd,
  author   = {Hioki, S.},
  title    = {{QCDMPI}---pure {QCD} {Monte Carlo} Simulation code with {MPI}},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  year     = {1998},
  volume   = {63},
  month    = apr,
  pages    = {1000--1002},
  abstract = {In this paper, outline of QCDMPI is reported. Comparison of the performances on several parallel machines; AP1000, AP1000+, AP3000, Cenju-3, Paragon, SR2201 and Workstation Cluster, is also reported.}
}


% NOTE(review): the author is taken to be Per Brinch Hansen, whose surname is the
% compound "Brinch Hansen"; the comma form keeps it from being split. Confirm.
@Article{Han98:mpi-eval,
  author   = {Brinch Hansen, P.},
  title    = {An evaluation of the message-passing interface},
  journal  = {ACM SIGPLAN Notices},
  year     = {1998},
  volume   = {33},
  number   = {3},
  month    = mar,
  pages    = {65--72},
  abstract = {The Message-Passing Interface (MPI) is evaluated by rewriting message parallel programs for Householder reduction, matrix multiplication, and successive overrelaxation. The author concludes that MPI is a practical programming tool. It does, however, lack the elegance and security that can only be achieved by a parallel programming language.}
}


@Article{Iss98:cfd-precond,
  author   = {Issman, E.},
  title    = {Non-overlapping preconditioners for a parallel implicit {Navier-Stokes} solver},
  journal  = {Future Generation Computer Systems},
  year     = {1998},
  volume   = {13},
  number   = {4--5},
  month    = mar,
  pages    = {303--313},
  abstract = {Parallel implicit iterative solution techniques are considered for application to a compressible hypersonic Navier-Stokes solver on unstructured meshes. The construction of parallel preconditioners with quasi-optimal convergence properties with respect to their serial counterpart is a key issue in the design of modern parallel implicit schemes. Two types of non-overlapping preconditioners are presented and compared. The first one is an additive Schwarz preconditioner requiring overlapping of the mesh and the second one is based on a Schur complement formulation. Both are using incomplete LU factorisation at the subdomain level but scale differently. Results are presented for computations on the Cray T3D under the message passing interface MPI.}
}


@Article{Bar98:migration,
  author   = {Barak, A.},
  title    = {The {MOSIX} multicomputer operating system for high performance cluster computing},
  journal  = {Future Generation Computer Systems},
  year     = {1998},
  volume   = {13},
  number   = {4--5},
  month    = mar,
  pages    = {361--372},
  abstract = {The scalable computing cluster at Hebrew University consists of 88 Pentium II and Pentium-Pro servers that are connected by fast Ethernet and the Myrinet LANs. It is running the MOSIX operating system, an enhancement of BSD/OS with algorithms for adaptive resource sharing, that are geared for performance scalability in a scalable computing cluster. These algorithms use a preemptive process migration for load-balancing and memory ushering, in order to create a convenient multiuser time-sharing execution environment for HPC, particularly for applications that are written in PVM or MPI. This paper begins with a brief overview of MOSIX and its resource sharing algorithms. Then the paper presents the performance of these algorithms as well as the performance of several large-scale, parallel applications.}
}


@Article{Rei97:interop,
  author   = {Reinefeld, A.},
  title    = {Communicating across parallel message-passing environments},
  journal  = {Journal of Systems Architecture},
  year     = {1997},
  volume   = {44},
  number   = {3--4},
  month    = dec,
  pages    = {261--272},
  abstract = {We present a small, extensible interface for the transparent communication between vendor-specific and standard message-passing environments. With only four new commands, existing parallel applications can make use of our PLUS communication interface, thereby allowing inter-process communication with other programming environments. Much effort has been spent in optimizing the communication speed across Internet and Intranet links. Our current implementation supports process communication between PVM, MPI, and PARIX. With only marginal additional effort, the interface can be adapted to support other message-passing environments as well.}
}

@Article{hom97:mpi-maxcup,
  author   = {Homer, S.},
  title    = {Design and performance of parallel and distributed approximation algorithms for maxcut},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = {1997},
  volume   = {41},
  number   = {1},
  month    = oct,
  pages    = {48--61},
  abstract = {We develop and experiment with a new parallel algorithm to approximate the maximum weight cut in a weighted undirected graph. Our implementation starts with the recent (serial) algorithm of Goemans and Williamson for this problem. We consider several different versions of this algorithm, varying the interior-point part of the algorithm in order to optimize the parallel efficiency of our method. Our work aims for an efficient, practical formulation of the algorithm with close-to-optimal parallelization. We analyze our parallel algorithm in the LogP model and predict linear speedup for a wide range of the parameters. We have implemented the algorithm using the message passing interface (MPI) and run it on several parallel machines. In particular, we present performance measurements on the IBM SP2, the Connection Machine CM5, and a cluster of workstations. We observe that the measured speedups are predicted well by our analysis in the LogP model. Finally, we test our implementation on several large graphs (up to 13,000 vertices), particularly on large instances of the Ising model.}
}

@Article{War:mpi-cluster,
  author   = {Warschko, T. M.},
  title    = {{ParaStation}: Efficient parallel computing by clustering workstations: Design and evaluation},
  journal  = {Journal of Systems Architecture},
  year     = {1997},
  volume   = {44},
  number   = {3--4},
  month    = dec,
  pages    = {241--260},
  abstract = {ParaStation is a communications fabric for connecting off-the-shelf workstations into a supercomputer. The fabric employs technology used in massively parallel machines and scales up to 4096 nodes. ParaStation's user-level message passing software preserves the low latency of the fabric by taking the operating system out of the communication path, while still providing full protection in a multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2~$\mu$s and a sustained bandwidth of up to 15 Mbyte/s per channel, even with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.}
}

@Article{War98:mpi-cluster,
  author   = {Warschko, T. M.},
  title    = {The {ParaStation} project: Using workstations as building blocks for parallel computing},
  journal  = {Information Sciences},
  year     = {1998},
  volume   = {106},
  number   = {3--4},
  month    = may,
  pages    = {277--292},
  abstract = {The ParaStation communication fabric provides a high-speed communication network with user-level access to enable efficient parallel computing on workstation clusters. The architecture, implemented on off-the-shelf workstations coupled by the ParaStation communication hardware, removes the kernel and common network protocols from the communication path while still providing full protection in a multiuser, multiprogramming environment. The programming interface presented by ParaStation consists of a UNIX socket emulation and widely used parallel programming environments such as PVM, P4, and MPI. This allows porting a wide range of client/server and parallel applications to the ParaStation architecture. Implementations of ParaStation using various platforms, such as Digital's AlphaGeneration workstations and Linux PCs, achieve end-to-end (process-to-process) latencies as low as 2~$\mu$s and a sustained bandwidth of up to 15 Mbyte/s per channel with small packets. Benchmarks using PVM on ParaStation demonstrate real application performance of 1 GFLOP on an 8-node cluster.}
}

@Article{Dan98:mpi-scheduling,
  author   = {Dantas, M. A. R.},
  title    = {Efficient scheduling of {MPI} applications on networks of workstations},
  journal  = {Future Generation Computer Systems},
  year     = {1998},
  volume   = {13},
  number   = {6},
  month    = may,
  pages    = {489--499},
  abstract = {The availability of a large number of workstations connected through a network can represent an attractive option for high-performance computing for many applications. The message-passing interface (MPI) software environment is an effort from many organisations to define a de facto message-passing standard. In other words, the original specification was not designed as a comprehensive parallel programming environment and some researchers agree that the standard should be preserved as simple and clean as possible. Nevertheless, a software environment such as MPI should have somehow a scheduling mechanism for the effective submission of parallel applications on network of workstations. This paper presents an alternative lightweight approach called Selective-MPI (S-MPI), which was designed to enhance the efficiency of the scheduling of applications on an MPI implementation environment.}
}

@Article{Cou98:mpi-c++,
  author   = {Coulaud, O.},
  title    = {Para++: A high level {C++} interface for message passing},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = {1998},
  volume   = {51},
  number   = {1},
  month    = may,
  pages    = {46--62},
  abstract = {This paper describes a high level C++ interface for message passing applications. Our interface is built on top of PVM and MPI. The two main contributions are to allow a quicker design of parallel applications without any important drop of performances. We introduce two levels of tasks and use C++ streams for communications. We also present a performance study over both PVM and MPI to show the overhead of our implementation. Finally, we detail two applications based on the heat equation to explain how Para++ can be used for SPMD and MPMD applications.}
}


@Article{Sal98:mpi-genetic,
  author   = {Salhi, A.},
  title    = {Parallel implementation of a genetic-programming based tool for symbolic regression},
  journal  = {Information Processing Letters},
  year     = {1998},
  volume   = {66},
  number   = {6},
  month    = jun,
  pages    = {299--307},
  abstract = {We report on a parallel implementation of a tool for symbolic regression, the algorithmic mechanism of which is based on genetic programming, and communication is handled using MPI. The implementation relies on a random islands model (RIM), which combines both the conventional islands model where migration of individuals between islands occurs periodically and niching where no migration takes place. The system was designed so that the algorithm is synergistic with parallel/distributed architectures, and works to make use of processor time and minimum use of network bandwidth without complicating the sequential algorithm significantly. Results on an IBM SP2 are included.}
}

% NOTE(review): pages were recorded as 57--51, a reversed range; 47--51 is the
% presumed correct range -- confirm against the publisher's record.
@Article{Har98:mpi-application,
  author   = {Harbury, H. K.},
  title    = {Parallel computation for electronic waves in quantum corrals},
  journal  = {VLSI Design},
  year     = {1998},
  volume   = {6},
  number   = {1--4},
  pages    = {47--51},
  abstract = {Recent scanning tunneling microscopy (STM) studies on the (111) faces of noble metals have directly imaged electronic surface-confined states and dramatic standing-wave patterns have been observed [1,2]. We solve for the local density of electronic states in these ``leaky'' quantum corral confinement structures using a coherent elastic scattering theory. We seek solutions of the two-dimensional Schr{\"o}dinger equation compatible with non-reflecting boundary conditions which asymptotically satisfy the Sommerfeld radiation condition [11,14]. The large matrices generated by the discretization of realistic quantum corral structures require the use of sparse matrix methods. In addition, a parallel finite element solution was undertaken using the message passing interface standard (MPI) and the Portable, Extensible, Toolkit for Scientific Computation (PETSc) [5] for an efficient computational solution on both distributed and shared memory architectures. Our calculations reveal excellent agreement with the reported experimental dI/dV STM data.}
}

@Article{Jak98:mpi-application,
  author   = {Jakobus, U.},
  title    = {Analysis of electromagnetic scattering problems by an iterative combination of {MoM} with {GMT} using {MPI} for the communication},
  journal  = {Microwave and Optical Technology Letters},
  year     = {1998},
  volume   = {19},
  number   = {1},
  month    = sep,
  pages    = {1--4},
  abstract = {A hybrid method is proposed combining the method of moments (MoM) with the generalized multipole technique (GMT) for the efficient analysis of electromagnetic radiation and scattering problems involving metallic as well as dielectric bodies. An iterative coupling scheme is applied so that only some small changes to the MoM and GMT formulations are required, making it very attractive for the combination of already existing MoM and GMT codes. During the iteration, the MoM and GMT processes are executed in parallel, and communication is done using the message-passing interface (MPI).}
}

% The bare token APR-JUL is read as a single (undefined) macro name, so the month
% range is built by concatenating the standard month macros instead.
@Article{Ril98:mpi-application,
  author   = {Riley, C. J.},
  title    = {Distributed-memory computing with the {Langley Aerothermodynamic Upwind Relaxation Algorithm} ({LAURA})},
  journal  = {Advances in Engineering Software},
  year     = {1998},
  volume   = {29},
  number   = {3--6},
  month    = apr # "--" # jul,
  pages    = {317--324},
  abstract = {The Langley Aerothermodynamic Upwind Relaxation Algorithm (LAURA), a Navier-Stokes solver, has been modified for use in a parallel, distributed-memory environment using the Message-Passing Interface (MPI) standard. A standard domain decomposition strategy is used in which the computational domain is divided into subdomains with each subdomain assigned to a processor. Performance is examined on dedicated parallel machines and a network of desktop workstations. The effect of domain decomposition and frequency of boundary updates on performance and convergence is also examined for several realistic configurations and conditions typical of large-scale computational fluid dynamic analysis.}
}


@Article{Wan98:mpi-application,
author = {P. Wang},
title = {Massively parallel finite volume computation of three-dimensional thermal convective flows},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {307--315},
month = apr # "--" # jul,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization are solved by a parallel multigrid method. A flexible parallel code has been implemented on distributed-memory systems, by using domain decomposition techniques and the MPI communication software. The code uses one-, two- or three-dimensional partition according to different geometries. It currently runs on the Intel Paragon, the Cray T3D, T3E, the IBM SP2 and the Beowulf systems, which can be ported easily to other parallel systems. A comparison of the wallclock time of the code between these systems is made, and code performances with respect to different numbers of processors are presented.}
}

@Article{Dan98:mpi-application,
author = {K. T. Danielson},
title = {Nonlinear dynamic finite element analysis on parallel computers using {FORTRAN} 90 and {MPI}},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
pages = {179--186},
month = apr # "--" # jul,
abstract = {A nonlinear explicit dynamic finite element code for use on scalable computers is presented. The code was written entirely in FORTRAN 90, but uses MPI for all interprocessor communication. Although MPI is not formally a standard for FORTRAN 90, the code runs properly in parallel on CRAY T3E, IBM SP, and SGI ORIGIN 2000 computing systems. Issues regarding the installation, portability, and effectiveness of the FORTRAN 90-MPI combination on these machines are discussed. An algorithm that overlaps message passing and computations of the explicit finite element equations is also presented and evaluated. Several large-scale ground-shock analyses demonstrate the varying combined importance of load balance and interprocessor communication among the different computing platforms. The analyses were performed on only a few to hundreds of processors with excellent speedup and scalability.}
}


% NOTE(review): no pages field is recorded for this article; add it once the page range is confirmed.
@Article{Vat98:mpi-application,
author = {V. N. Vatsa},
title = {Viscous flow computations for complex geometries on parallel computers},
journal = {Advances in Engineering Software},
year = 1998,
volume = 29,
number = {3--6},
month = apr # "--" # jul,
abstract = {A widely used computational fluid dynamics (CFD) code known as TLNS3D, which was developed for large, shared-memory computers, is ported to a distributed computing environment. An engineering approach is used here to parallelize this code so that minimal deviation from the original (non-parallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available processors. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified processors. The message passing interface (MPI) protocols are incorporated for data communication. Both synchronous and asynchronous communication modes have been incorporated. As the number of processors is increased, the asynchronous communication mode shows much better scalability and clearly outperforms the synchronous mode of communication.}
}

@Article{Riv98:mpi-application,
author = {W. Rivera-Gallego},
title = {A genetic algorithm for circulant {Euclidean} distance matrices},
journal = {Applied Mathematics and Computation},
year = 1998,
volume = 97,
number = {2--3},
pages = {197--208},
month = DEC,
abstract = {This paper presents a fast genetic algorithm to determine three-dimensional configurations of points that generate circulant Euclidean Distance Matrices (EDMs). A parallel implementation is possible by using the message passing interface (MPI) standard. In addition, theoretical results about the polyhedral structure of both the cone of circulant symmetric positive semidefinite matrices and the cone of circulant EDMs are introduced.}
}

@Article{Ada98:mpi-application,
author = {P. Adamidis},
title = {Steel strip production --- a pilot application for coupled simulation with several calculation systems},
journal = {Journal of Materials Processing Technology},
year = 1998,
volume = {80--81},
pages = {330--336},
month = aug # "--" # sep,
abstract = {For the simulation of technological and natural processes in specific application domains, efficient calculation software solving differential equation systems on grid-based computational models is available, especially in the area of computer-aided engineering (CAE). To handle a so-called 'multiphysics' problem, for example the fluid flow and metal forming process in a twin-roll casting arrangement for steel strip production, several calculation systems usually have to be employed in a high-performance computing environment, e.g. on parallel computers. The GRISSLi Coupling Interface is a software tool facilitating the coupled computation based on the message passing standard MPI.}
}


@article{Dow98:mpi-implementation,
  author   = {P. W. Dowd},
  title    = {{BLAST}: broadband lightweight {ATM} secure transport for high-performance distributed computing},
  journal  = {Computer Communications},
  volume   = {21},
  number   = {12},
  pages    = {1040--1057},
  month    = aug,
  year     = {1998},
  abstract = {This paper investigates the use of ATM for cluster-based computing. The need for a native ATM API is discussed as well as the performance of message passing libraries (MPL) that are written to use such an API to exploit the advantages of a high-speed network for cluster-based computing. The MPLs offer a standard interface, such as PVM or MPI, and interoperate with existing TCP/IP- and UDP/IP-based versions in addition to the ATM API environment. The interoperability extensions made to two MPLs, MPI and Prowess, which allow a hybrid environment of both ATM and TCP-based legacy network technology will be described. Shared object space (SOS), an extension to the MPLs, is described that helps support the geographically distributed computing (GDC) environment through latency hiding. It allows a user to develop applications in a shared memory type of environment. The native ATM API which supports cluster-based computing is described in this paper. This API provides a reliable transport interface to the MPL which has been optimized for an ATM environment. The transport protocol is a low-state design that optimizes the performance based on the available bandwidth, buffer constraints, propagation delay characteristics and security requirements of a particular connection.}
}

@article{Kac98:mpi-tool,
  author   = {P. Kacsuk},
  title    = {{GRADE}: A graphical programming environment for multicomputers},
  journal  = {Computers and Artificial Intelligence},
  volume   = {17},
  number   = {5},
  pages    = {417--427},
  year     = {1998},
  abstract = {To provide high-level graphical support for developing message passing programs, an integrated programming environment (GRADE) is being developed. GRADE currently provides tools to construct, execute, debug, monitor and visualise message-passing based parallel programs. GRADE offers the programmer an integrated graphical user interface during the whole life-cycle of program development and provides high-level graphical programming abstraction mechanisms to construct parallel applications. The current version of GRADE can generate C+PVM code but there is no theoretical obstacle to extend it for supporting MPI [9] and FORTRAN. Those new features of the GRADE graphical environment are described in the paper that enhanced GRADE towards a professional parallel programming environment.}
}

@Article{Ras98:mpi-application,
author = {J. Rasch},
title = {6-dimensional integrals and supercomputers},
journal = {Computer Physics Communications},
year = 1998,
volume = 114,
number = {1--3},
pages = {378--384},
month = NOV,
abstract = {Recently, a numerical method has been developed for the evaluation of general 6-dimensional integrals (6DIME), which has been successfully applied to the study of (e,2e) and (gamma,2e) processes. Details of the parallelization of that code are given using MPI and the scaling behaviour with respect to the number of nodes is presented. Almost full load balancing is obtained. The method is extended to include two centre scattering problems.}
}

@Article{Chu98:mpi-balancing,
author = {Y. Chung},
title = {An asynchronous algorithm for balancing unpredictable workload on distributed-memory machines},
journal = {ETRI Journal},
year = 1998,
volume = 20,
number = 4,
pages = {346--360},
month = DEC,
abstract = {It is challenging to parallelize problems with irregular computation and communication. In this paper, we propose an asynchronous algorithm for balancing unpredictable workload on distributed-memory machines. By using an initial workload estimate, we first partition the computations such that the workload is distributed evenly across the processors. In addition, we perform task migrations dynamically for adapting to the evolving workload. To demonstrate the usefulness of our load balancing strategy, we conducted experiments on an IBM SP2 and a Cray T3D. Experimental results show that our task migration strategy can balance unpredictable workload with little overhead. Our code using C and MPI is portable onto other distributed-memory machines.}
}

@Article{Ber99:mpi-tools,
author = {M. Bertozzi},
title = {Tools for code optimization and system evaluation of the image processing system {PAPRICA-3}},
journal = {Journal of Systems Architecture},
year = 1999,
volume = 45,
number = {6--7},
pages = {519--542},
month = JAN,
abstract = {This paper presents the complex environment that was built to ease the prototyping of real-time applications on the PAPRICA-3 massively parallel system. Applications are developed in C++ using high level data types and the corresponding Assembly code is automatically created by a code generator. A stochastic code optimizer takes the assembly code and improves it according to a genetic approach; due to the high computational power required by this approach, the stochastic code optimizer was implemented with MPI and runs in parallel on a cluster of workstations. The availability of this complex environment allowed to test the performance of the system and to tune it according to some target applications before the actual development of the hardware. For this purpose a system-level simulator was also built to determine the number of clock cycles required to run a specific segment of code. The whole environment has been used to validate possible solutions for the hardware system and to develop, test, and tune several real-time image processing applications. The hardware system is now completely defined.}
}

% The citation key keeps its historical spelling ("applicatin") so that existing citations still resolve.
@Article{Lee99:mpi-applicatin,
author = {P. C. S. Lee},
title = {On the parallelization of a global climate-chemistry modeling system},
journal = {Atmospheric Environment},
year = 1999,
volume = 33,
number = 4,
pages = {675--681},
month = FEB,
abstract = {Coupled climate-chemistry simulations are computationally intensive owing to the spatial and temporal scope of the problem. In global chemistry models, the time integrations encountered in the chemistry and aerosol modules usually comprise the major CPU consumption. Parallelization of these segments of the code can contribute to multifold CPU speed-ups with minimal modification of the original serial code. This technical note presents a single program-multiple data (SPMD) strategy applied to the time-split chemistry modules of a coupled climate - global tropospheric chemistry model. Latitudinal domain decomposition is adopted along with a dynamic load-balancing technique that uses the previous time-step's load/latitude estimates for distributing the latitude bands amongst the processors. The coupled model is manually parallelized using the Message Passing Interface standard (MPI) on a distributed memory platform (IBM-SP2). Load-balancing efficiencies and the associated MPI overheads are discussed. Overall speed-ups and efficiencies are also calculated for a series of runs employing up to eight processors.}
}

@Article{May99:mpi-application,
author = {F. May},
title = {Mathematical modelling of glass melting furnace design with regard to {NOx} formation},
journal = {Glastechnische Berichte-Glass Science and Technology},
year = 1999,
volume = 72,
number = 1,
pages = {1--6},
month = JAN,
abstract = {A three-dimensional mathematical model for turbulent flow and combustion on the basis of turbulence/chemistry interactions and radiative heat transfer taking into account spectral effects of surrounding walls and combustion gases is described. For this the transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the Message Passing Interface library. Computed distributions of velocity, temperature, and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NOx concentration distributions are presented.}
}

@Article{Reu99:mpi-application,
author = {J. Reuther},
title = {Aerodynamic shape optimization of supersonic aircraft configurations via an adjoint formulation on distributed memory parallel computers},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {675--700},
month = may # "--" # jun,
abstract = {This work describes the application of a control theory-based aerodynamic shape optimization method to the problem of supersonic aircraft design. A high fidelity computational fluid dynamics (CFD) algorithm modelling the Euler equations is used to calculate the aerodynamic properties of complex three-dimensional aircraft configurations. The design process is greatly accelerated through the use of both control theory and parallel computing. Control theory is employed to derive the adjoint differential equations whose solution allows for the evaluation of design gradient information at a fraction of the computational cost required by previous design methods. The resulting problem is then implemented in parallel using a domain decomposition approach, an optimized communication schedule, and the Message Passing Interface (MPI) Standard for portability and efficiency. In our earlier studies, the serial implementation of this design method, was shown to be effective for the optimization of airfoils, wings, wing-bodies, and complex aircraft configurations using both the potential equation and the Euler equations. In this work, our concern will be to extend the methodologies such that the combined capabilities of these new technologies can be used routinely and efficiently in an industrial design environment. The aerodynamic optimization of a supersonic transport configuration is presented as a demonstration test case of the capability. A particular difficulty of this test case is posed by the close coupling of the propulsion/airframe integration.}
}

@Article{Vat99:mpi-application,
author = {V. N. Vatsa},
title = {Parallelization of a multiblock flow code: an engineering implementation},
journal = {Computers and Fluids},
year = 1999,
volume = 28,
number = {4--5},
pages = {603--614},
month = may # "--" # jun,
abstract = {Current trends in computer hardware are dictating a gradual shift toward the use of clusters of relatively inexpensive but powerful workstations, or massively parallel processing (MPP) machines, for scientific computing. However, most computational fluid dynamics (CFD) codes in use today were developed for large, shared-memory machines and are not readily portable to the distributed computing environment. One major hurdle in porting CFD codes to distributed computing platforms is the difficulty encountered in partitioning the problem so that the computation-to-communication ratio for each compute node (process) is maximized and the idle time during which one node waits for other nodes to transfer data is minimized. In the present work, pertinent issues involved in the parallelization of a widely used multiblock Navier-Stokes code TLNS3D are discussed. An engineering approach is used here to parallelize this code so that minimal deviation from the original (nonparallel) code is incurred. A natural partitioning along grid blocks is adopted in which one or more blocks are distributed to each of the available nodes. An automatic, static load-balancing strategy is employed for equitable distribution of computational work to specified nodes. Both Parallel Virtual Machine (PVM) and message passing interface (MPI) protocols are incorporated for data communication to allow maximum portability to a wide range of computer configurations. Results are presented that are comparable with a priori estimates of performance for distributed computing and that are competitive in terms of central processing unit (CPU) time and wall time usage with large, shared-memory supercomputers.}
}

@Article{Dzw99:mpi-application,
author = {W. Dzwinel},
title = {Method of particles in visual clustering of multi-dimensional and large data sets},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = 3,
pages = {365--379},
month = APR,
abstract = {A method dedicated for visual clustering of N-dimensional data sets is presented. It is based on the classical feature extraction technique - the Sammon's mapping. This technique empowered by a particle approach used in the Sammon's criterion minimization makes the method more reliable, general and efficient. To show its reliability, the results of tests are presented, which were made to exemplify the algorithm 'immunity' from data errors. The general character of the method is emphasized and its role in multicriterial analysis discussed. Due to inherent parallelism of the methods, which are based on the particle approach, the visual clustering technique can be implemented easily in parallel environment. It is shown that parallel realization of the mapping algorithm enables the visualization of data sets consisting of more than $10^4$ multi-dimensional data points. The method was tested in the PVM, MPI and data parallel environments on an HP/Convex SPP/1600. In this paper, the authors compare the parallel algorithm performance for these three interfaces. The approach to visual clustering, presented in the paper, can be used in visualization and analysis of large multi-dimensional data sets.}
}

@Article{Wan99:mpi-application,
author = {P. Wang},
title = {Parallel multigrid finite volume computation of three-dimensional thermal convection},
journal = {Computers and Mathematics with Applications},
year = 1999,
volume = 37,
number = 9,
pages = {49--60},
month = MAY,
abstract = {A parallel implementation of the finite volume method for three-dimensional, time-dependent, thermal convective flows is presented. The algebraic equations resulting from the finite volume discretization, including a pressure equation which consumes most of the computation time, are solved by a parallel multigrid method. A flexible parallel code has been implemented on the Intel Paragon, the Cray T3D, and the IBM SP2 by using domain decomposition techniques and the MPI communication software. The code can use 1D, 2D, or 3D partitions as required by different geometries, and is easily ported to other parallel systems. Numerical solutions for air (Prandtl number Pr = 0.733) with various Rayleigh numbers up to $10^7$ are discussed.}
}


@Article{Bar99:mpi-application,
author = {S. T. Barnard},
title = {An {MPI} implementation of the {SPAI} preconditioner on the {T3E}},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 2,
pages = {107--123},
month = {Summer},
abstract = {The authors describe and test spai-1.1, a parallel MPI implementation of the sparse approximate inverse (SPAI) preconditioner. They show that SPAI can be very effective for solving a set of very large and difficult problems on a Cray T3E. The results clearly show the value of SPAI (and approximate inverse methods in general) as the viable alternative to ILU-type methods when facing very large and difficult problems. The authors strengthen this conclusion by showing that spai-1.1 also has very good scaling behavior.}
}

@Article{Ree99:mpi-application,
author = {J. S. Reeve},
title = {An efficient parallel version of the {Householder-QL} matrix diagonalisation algorithm},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 3,
pages = {311--319},
month = MAR,
abstract = {In this paper we report an effective parallelisation of the Householder routine for the reduction of a real symmetric matrix to tri-diagonal form and the QL algorithm for the diagonalisation of the resulting matrix. The Householder algorithm scales like $\alpha N^3/P + \beta N^2 \log_2(P)$ and the QL algorithm like $\gamma N^2 + \delta N^3/P$ as the number of processors $P$ is increased for fixed problem size. The constant parameters $\alpha$, $\beta$, $\gamma$ and $\delta$ are obtained empirically. When the eigenvalues only are required the Householder method scales as above while the QL algorithm remains sequential. The code is implemented in C in conjunction with the message passing interface (MPI) libraries and verified on a sixteen node IBM SP2 and for real matrices that occur in the simulation of properties of crystalline materials.}
}

@Article{Gen99:mpi-application,
author = {C. Gennaro},
title = {Parallelising the {Mean Value Analysis} algorithm},
journal = {Transactions of the Society for Computer Simulation International},
year = 1999,
volume = 16,
number = 1,
pages = {16--22},
month = MAR,
abstract = {The Mean Value Analysis (MVA) algorithm is one of the most popular for evaluating the performance of separable (or product-form) queueing networks. Although its complexity is modest when jobs are indistinguishable, the introduction of different customer classes rapidly increases its computational cost. The problems of parallelising the algorithm while retaining its conceptual simplicity are examined. In particular, a parallel implementation of MVA on a distributed memory machine is developed using the MPI library for communication.}
}

@Article{Ble99:mpi-application,
author = {G. E. Blelloch},
title = {Design and implementation of a practical parallel {D}elaunay algorithm},
journal = {Algorithmica},
year = 1999,
volume = 24,
number = {3--4},
pages = {243--269},
month = jul # "--" # aug,
abstract = {Initial experiments using a variety of distributions showed that our parallel algorithm was within a factor of 2 in work from the best sequential algorithm. Based on these promising results, the algorithm was implemented using C and an MPI-based toolkit. Compared with previous work, the resulting implementation achieves significantly better speedups over good sequential code, does not assume a uniform distribution of points, and is widely portable due to its use of MPI as a communication mechanism. Results are presented for the IBM SP2, Cray T3D, SGI Power Challenge, and DEC AlphaCluster.}
}

@Article{Coe99:mpi-application,
author = {P. J. Coelho},
title = {Modelling of a utility boiler using parallel computing},
journal = {Journal of Supercomputing},
year = 1999,
volume = 13,
number = 2,
pages = {211--232},
month = MAR,
abstract = {A mathematical model for the simulation of the turbulent reactive flow and heat transfer in a power station boiler has been parallelized. The mathematical model is based on the numerical solution of the governing equations for mass, momentum, energy and transport equations for the scalar quantities. The k-epsilon model and the conserved scalar/prescribed probability density function formalism are employed. Radiative heat transfer is calculated using the discrete ordinates method. The code has been fully parallelized using the spatial domain decomposition approach and MPI. Calculations were performed using an IBM-SP2. It is shown that the computational requirements are reduced and the parallel efficiency increases if the mean temperature and density are calculated a priori, and stored. The role of the different parts of the code on the parallel performance is discussed. A speedup of 5.9 is achieved using 8 processors.}
}

@Article{Rus99:mpi-cluster,
author = {S. H. Russ},
title = {Using {Hector} to run {MPI} programs over networked workstations},
journal = {Concurrency: Practice and Experience},
year = 1999,
volume = 11,
number = 4,
pages = {189--204},
month = APR,
abstract = {Networked workstations represent an increasingly popular distributed platform for running large parallel programs. They can present a low-cost alternative to purchasing supercomputer time or additional usable computational capability. Several capabilities are desirable in order to harness workstations, including support for a widely accepted parallel programming environment, task migration, intelligent resource allocation, fault tolerance, and totally transparent support of these features. The Hector system is designed to provide these capabilities to MPI programs. The structure of the system and experiences using the system on loaded workstations to run scientific codes are described.}
}

@Article{Ros99:mpi-tool,
author = {T. Rossi},
title = {A parallel fast direct solver for block tridiagonal systems with separable matrices of arbitrary dimension},
journal = {SIAM Journal on Scientific Computing},
year = 1999,
volume = 20,
number = 5,
pages = {1778--1796},
month = MAY,
abstract = {A parallel fast direct solution method for linear systems with separable block tridiagonal matrices is considered. Such systems appear, for example, when discretizing the Poisson equation in a rectangular domain using the five-point finite difference scheme or the piecewise linear finite elements on a triangulated, possibly nonuniform rectangular mesh. The method under consideration has the arithmetical complexity O(N log N), and it is closely related to the cyclic reduction method, but instead of using the matrix polynomial factorization, the so-called partial solution technique is employed. Hence, in this paper, the method is called the partial solution variant of the cyclic reduction method (PSCR method). The method is presented and analyzed in a general radix-q framework and, based on this analysis, the radix-4 variant is chosen for parallel implementation using the MPI standard. The generalization of the method to the case of arbitrary block dimension is described. The numerical experiments show the sequential efficiency and numerical stability of the PSCR method compared to the well-known BLKTRI implementation of the generalized cyclic reduction method. The good scalability properties of the parallel PSCR method are demonstrated in a distributed-memory Cray T3E-750 computer.}
}

@Article{Bou99:mpi-algorithm,
author = {P. Boulet},
title = {Static tiling for heterogeneous computing platforms},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 5,
pages = {547--568},
month = MAY,
abstract = {In the framework of fully permutable loops, tiling has been extensively studied as a source-to-source program transformation. However, little work has been devoted to the mapping and scheduling of the tiles on physical processors. Moreover, targeting heterogeneous computing platforms has, to the best of our knowledge, never been considered. In this paper we extend static tiling techniques to the context of limited computational resources with different-speed processors. In particular, we present efficient scheduling and mapping strategies that are asymptotically optimal. The practical usefulness of these strategies is fully demonstrated by MPI experiments on a heterogeneous network of workstations.}
}

@Article{Ros99:mpi-application,
author = {I. Rosenblum},
title = {Multi-processor molecular dynamics using the {Brenner} potential: Parallelization of an implicit multi-body potential},
journal = {International Journal of Modern Physics C},
year = 1999,
volume = 10,
number = 1,
pages = {189--203},
month = FEB,
abstract = {We present computational aspects of Molecular Dynamics calculations of thermal properties of diamond using the Brenner potential. Parallelization was essential in order to carry out these calculations on samples of suitable sizes. Our implementation uses MPI on a multi-processor machine such as the IBM SP2. Three aspects of parallelization of the Brenner potential are discussed in depth. These are its long-range nature, the need for different parallelization algorithms for forces and neighbors, and the relative expense of force calculations compared to that of data communication. The efficiency of parallelization is presented as a function of different approaches to these issues as well as of cell size and number of processors employed in the calculation. In the calculations presented here, information from almost half of the atoms were needed by each processor even when 16 processors were used. This made it worthwhile to avoid unnecessary complications by making data from all atoms available to all processors. Superlinear speedup was achieved for four processors (by avoiding paging) with 512 atom samples, and 5ps long trajectories were calculated (for 5120 atom samples) in 53 hours using 16 processors; 514 hours would have been needed to complete this calculation using a serial program. Finally, we discuss and make available a set of routines that enable MPI-based codes such as ours to be debugged on scalar machines.}
}

% The citation key keeps its historical spelling ("comparision") so that existing citations still resolve.
@article{Luo99:mpi-comparision,
  author   = {Y. Luo},
  title    = {Shared memory vs. message passing: the {COMOPS} benchmark experiment},
  journal  = {Journal of Supercomputing},
  volume   = {13},
  number   = {3},
  pages    = {283--301},
  month    = may,
  year     = {1999},
  abstract = {This paper presents the comparison of the COMOPS benchmark performance in MPI and shared memory on four different shared memory platforms: the DEC AlphaServer 8400/300, the SGI Power Challenge, the SGI Origin2000, and the HP-Convex Exemplar SPP1600. The paper also qualitatively analyzes the obtained performance data based on an understanding of the corresponding architecture and the MPI implementations. Some conclusions are made for the inter-processor communication performance on these four shared memory platforms.}
}


@Article{Hio99:mpi-application,
author = {S. Hioki},
title = {{QCDimMPI: MPI} code for {QCD} with an improved action},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 1999,
volume = 73,
pages = {895--897},
month = MAR,
abstract = {QCDimMPI[1] is a simulation code for pure SU(3) gauge theory with an improved action consisting of 1 x 1 and 2 x 1 plaquettes. It uses Fortran77 and the Message Passing Interface Standard, MPI[2]. QCDimMPI is an extended version of QCDMPI. It is portable, allows simulations in any number of dimensions, on any number of processors, and with arbitrary dimensional partitioning. It requires a rather small working area, and yields excellent performance on single processor computers and a wide variety of parallel computers which support MPI. The program provides information on link update time and communications time. In this paper, an outline of QCDimMPI is given, and benchmark results on several parallel computers are reported.}
}


@Article{Gol99:mpi-application,
author = {A. Goller},
title = {Parallel processing strategies for large {SAR} image data sets in a distributed environment},
journal = {Computing},
year = 1999,
volume = 62,
number = 4,
pages = {277--291},
abstract = {Key algorithms like image matching and Shape-from-Shading were parallelized mainly using MPI, and ported onto suitable computer architectures. Our experiments showed that all algorithms perform well, and they further proved the concept of CDIP to be beneficial: Usability of all integrated algorithms was significantly improved, mainly due to less user-centered network traffic, simple access to supercomputers, the creation of method sequences, and easy-to-use and well maintained algorithms.}
}

@Article{Chi99:mpi-implementation,
author = {A. Chien},
title = {Design and evaluation of an {HPVM}-based {Windows NT} supercomputer},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 3,
pages = {201--219},
month = {Fall},
abstract = {We describe the design and evaluation of a 192-processor Windows NT cluster for high performance computing based on the High Performance Virtual Machine (HPVM) communication suite. While other clusters have been described in the literature, building a 58 GFlop/s NT cluster to be used as a general-purpose production machine for NCSA required solving new problems. The HPVM software meets the challenges represented by the large number of processors, the peculiarities of the NT operating system, the need for a production-strength job submission facility and the requirement for mainstream programming interfaces. First, HPVM provides users with a collection of standard APIs like MPI, Shmem, Global Arrays with supercomputer class performance (13 $\mu$s minimum latency, 84 MB/s peak bandwidth for MPI), efficiently delivering Myrinet's hardware performance to application programs. Second, HPVM provides cluster management and scheduling (through integration with Platform Computing's LSF). Finally, HPVM addresses Windows NT's remote access problem, providing convenient remote access and job control (through a graphical Java-applet front-end). Given the production nature of the cluster, the performance characterization is largely based on a sample of the NCSA scientific applications the machine will be running. The side-by-side comparison with other present-generation NCSA supercomputers shows the cluster to be within a factor of 2 to 4 of the SGI Origin 2000 and Cray T3E performance at a fraction of the cost. The inherent scalability of the cluster design produces a comparable or better speedup than the Origin 2000 despite a limitation in the HPVM flow control mechanism.}
}


@Article{Ros99:mpi-tools,
author = {T. Rossi},
title = {Parallel fictitious domain method for a non-linear elliptic {Neumann} boundary value problem},
journal = {Numerical Linear Algebra with Applications},
year = 1999,
volume = 6,
number = 1,
pages = {51--60},
month = JAN # "--" # FEB,
abstract = {Parallelization of the algebraic fictitious domain method is considered for solving Neumann boundary value problems with variable coefficients. The resulting method is applied to the parallel solution of the subsonic full potential flow problem which is linearized by the Newton method. Good scalability of the method is demonstrated on a Cray T3E distributed memory parallel computer using MPI in communication.}
}


@Article{Zak99:mpi-tools,
author = {O. Zaki},
title = {Toward scalable performance visualization with {Jumpshot}},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 3,
pages = {277--288},
month = {Fall},
abstract = {Jumpshot is a graphical tool for understanding the performance of parallel programs. It is in the tradition of the upshot tool but contains a number of extensions and enhancements that make it suitable for large-scale parallel computations. Jumpshot takes as input a new, more flexible logfile format and comes with a library for generating such logfiles. An MPI profiling library is also included, enabling the automatic generation of such logfiles from MPI programs. Jumpshot is written in Java and can easily be integrated as an applet into browser-based computing environments. The most novel feature of Jumpshot is its automatic detection of anomalous durations, drawing the user's attention to problem areas in a parallel execution. This capability is particularly useful in large-scale parallel computations containing many events.}
}

@Article{BegVin99:transport,
author = {S. Bergeron and A. Vincent},
title = {Implementation strategies for real-time particle transport solver},
journal = {Computer Physics Communications},
year = 1999,
volume = 120,
number = {2--3},
month = AUG,
pages = {177--184},
abstract = {Many problems in physics and engineering involve the transport of solid particles in a turbulent field. In some cases, it is desirable to study the transport of those particles in ``real time''. The prediction of erosion in the rotating part of hydraulic turbines is such a problem. This paper presents a semi-analytic predictor-corrector scheme adapted to the case of a rotating frame of reference. Simplification, related to the interpolation scheme required, is discussed as well as a parallel implementation using MPI on 10Base-T Ethernet interconnected workstations. The 3D solver is coupled with a high performance visualization software. Performance then shows a quasi-linear speedup.}
}



@Article{BruFagRes99:meta,
author = {M. A. Brune and G. E. Fagg and M. M. Resch},
title = {Message-passing environments for metacomputing},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = {5--6},
month = OCT,
pages = {699--712},
abstract = {The PACX-MPI approach offers a transparent interface for the communication between two or more MPI environments. PVMPI allows the user spawning parallel processes under the MPI environment. The PLUS protocol bridges the gap between vendor-specific (e.g., MPL, NX, and PARIX) and vendor-independent message-passing environments (e.g., PVM and MPI). Moreover, it offers the ability to create and control processes at application runtime.}
}

@Article{ResRanSto99:meta,
author = {M. M. Resch and D. Rantzau and R. Stoy},
title = {Metacomputing experience in a transatlantic wide area application test-bed},
journal = {Future Generation Computer Systems},
year = 1999,
volume = 15,
number = {5--6},
month = OCT,
pages = {807--816},
abstract = {In the frame of a G7 initiative the High Performance Computing Center Stuttgart (HLRS) together with the Pittsburgh Supercomputing Center (PSC) and Sandia National Laboratories (SNL) has set up a transatlantic wide area application test-bed in 1997. A dedicated ATM-Link was installed that connected German research networks to vBNS and ESnet. During 1 year this test-bed was extensively used for metacomputing and collaborative working. Two applications - one from computational fluid dynamics and one from molecular dynamics - were adapted and run on the test-bed. For message-passing an MPI library was implemented that supports metacomputing. An already existing software for collaborative visualization was adapted for that scenario. This article describes the technical background of the cooperation, results that have been achieved for the two applications so far and lessons that have been learned. Special emphasis will be given to future work planned.}
}


@Article{Tho99:mpi-application,
author = {S. J. Thomas and M. Desgagne and R. Benoit},
title = {A real-time {North American} forecast at 10-km resolution with the {Canadian} {MC2 Meso-LAM}},
journal = {Journal of Atmospheric and Oceanic Technology},
year = 1999,
volume = 16,
number = 8,
pages = {1092--1101},
month = AUG,
abstract = {The next generation of high-performance computers will be based on clusters of shared-memory symmetric multiprocessor (SMP) nodes interconnected by a low-latency, high-bandwidth network. In this paper, the parallel performance of the nonhydrostatic Mesoscale Compressible Community (MC2) limited-area atmospheric model on clusters of NEC SX-4 symmetric multiprocessor (SMP) nodes is presented. Several hybrid parallel-programming approaches are now possible with the SMP cluster SC-MC2 implementation based on internode MPI message-passing and intranode shared-memory tasking or threads. At total sustained execution rates of between 25 and 30 Gflop s$^{-1}$ on single-node or multinode clusters, it is now possible for the first time ever to generate a 24-48-h real-time weather forecast over North America at 10-km resolution.}
}


@Article{Rod99:mpi-evals,
author = {J. L. Roda and C. Rodriguez and D. G. Morales and E. Almeida},
title = {Predicting the execution time of message passing models},
journal = {Concurrency-Practice and Experience},
year = 1999,
volume = 11,
number = 9,
month = AUG,
pages = {461--477},
abstract = {Recent publications prove that runtime systems oriented to the Bulk Synchronous Parallel Model usually achieve remarkable accuracy in their predictions. That accuracy can be seen in the capacity of the software for packing the messages generated during the superstep and their capability to find a rearrangement of the messages sent at the end of the superstep. Unfortunately, barrier synchronisation imposes some limits both in the range of available algorithms and in their performance. The asynchronous nature of many MPI/PVM programs makes their expression difficult or infeasible using a BSP oriented library. Through the generalisation of the concept of superstep we propose two extensions of the BSP model: the BSP Without Barriers (BSPWB) and the Message Passing Machine (MPM) models. These new models are oriented to MPI/PVM parallel programming. The parameters of the models and their quality are evaluated on four standard parallel platforms. The use of these BSP extensions is illustrated using the Fast Fourier Transform and the Parallel Sorting by Regular Sampling algorithms.}
}

@Article{Lir99:mpi-apps,
author = {I. Lirkov and S. Margenov},
title = {{MPI} parallel implementation of {CBF} preconditioning for {3D} elasticity problems},
journal = {Mathematics and Computers in Simulation},
year = 1999,
volume = 50,
number = {1--4},
month = NOV,
pages = {247--254},
abstract = {New construction of a parallel algorithm for the discussed preconditioning method is proposed. The theoretical part of this study includes analysis of the execution time on various parallel architectures and asymptotic estimates of the parallel speedup and the parallel efficiency. The parallel performance estimates indicate that the proposed algorithm will be especially efficient on coarse-grain parallel systems, which is also confirmed by the numerical experiments. A portable MPI parallel code is developed. Numerical tests on three symmetric multiprocessor systems: SUN Enterprise 3000, SUN SPARCstation 10 and Origin 2000 are presented. The reported speedup and parallel efficiency illustrate well the features of the proposed method and its implementation.}
}

@Article{den99:mpi-app,
author = {L. Deng and Z. S. Xie},
title = {Parallelization of {MCNP} {Monte Carlo} neutron and photon transport code in parallel virtual machine and message passing interface},
journal = {Journal of Nuclear Science and Technology},
year = 1999,
volume = 36,
number = 7,
month = JUL,
abstract = {The coupled neutron and photon transport Monte Carlo code MCNP (version 3B) has been parallelized in parallel virtual machine (PVM) and message passing interface (MPI) by modifying a previous serial code. The new code has been verified by serving sample problems. The speedup increases linearly with the number of processors and the average efficiency is up to 99\% for 12-processor.}
}

% NOTE(review): in the abstract below, "MPI" appears to name the institute
% behind the climate model (presumably the Max Planck Institute) rather than
% the Message Passing Interface; confirm that this entry belongs in the list.
@Article{Arp99:mpi-app,
author = {K. Arpe and E. Roeckner},
title = {Simulation of the hydrological cycle over {Europe}: Model validation and impacts of increasing greenhouse gases},
journal = {Advances in Water Resources},
year = 1999,
volume = 23,
number = 2,
month = OCT,
pages = {105--119},
abstract = {Different methods of estimating precipitation area means, based on observations, are compared with each other to investigate their usefulness for model validation. For the applications relevant to this study the ECMWF reanalyses provide a good and comprehensive data set for validation. The uncertainties of precipitation analyses, based on observed precipitation or from numerical weather forecasting schemes, are generally in the range of 20\% but regionally much larger. The MPI atmospheric general circulation model is able to reproduce long term means of the main features of the hydrological cycle within the range of uncertainty of observational data, even for relatively small areas such as the Rhine river basin. Simulations with the MPI coupled general circulation model, assuming a further increase of anthropogenic greenhouse gases, show clear trends in temperature and precipitation for the next century which would have significant implications for human activity, e.g. a further increase of the sea level of the Caspian Sea and less water in the Rhine and the Danube. We have gained confidence in these results because trends in the temperature and precipitation in the coupled model simulations up to the present are partly confirmed by an atmospheric model simulation forced with observed SSTs and by observational data. We gained further confidence because the simulations with the same coupled model but using constant greenhouse gases do not show such trends. However, doubts arise from the fact that these trends are strong where the systematic errors of the model are large.}
}

@Article{Yah99:mpi-app,
author = {Y. Yahagi and M. Mori and Y. Yoshii},
title = {The forest method as a new parallel tree method with the sectional {Voronoi} tessellation},
journal = {Astrophysical Journal Supplement Series},
year = 1999,
volume = 124,
number = 1,
month = SEP,
pages = {1--9},
abstract = {We have developed a new parallel tree method which will be called the forest method hereafter. This new method uses the sectional Voronoi tessellation (SVT) for the domain decomposition. The SVT decomposes a whole space into polyhedra and allows their flat borders to move by assigning different weights. The forest method determines these weights based on the load balancing among processors by means of the overload diffusion (OLD). Moreover, since all the borders are flat, before receiving the data from other processors, each processor can collect enough data to calculate the gravity force with precision. Both the SVT and the OLD are coded in a highly vectorizable manner to accommodate on vector parallel processors. The parallel code based on the forest method with the Message Passing Interface is run on various platforms so that a wide portability is guaranteed. Extensive calculations with 15 processors of Fujitsu VPP300/16R indicate that the code can calculate the gravity force exerted on $10^{5}$ particles in each second for some ideal dark halo. This code is found to enable an N-body simulation with $10^{7}$ or more particles for a wide dynamic range and is therefore a very powerful tool for the study of galaxy formation and large-scale structure in the universe.}
}

@Article{tan99:mpi-impl,
author = {H. Tang and K. Shen and T. Yang},
title = {Compile/run-time support for threaded {MPI} execution on multiprogrammed shared memory machines},
journal = {ACM SIGPLAN Notices},
year = 1999,
volume = 34,
number = 8,
month = AUG,
pages = {107--118},
abstract = {MPI is a message-passing standard widely used for developing high-performance parallel applications. Because of the restriction in the MPI computation model, conventional implementations on shared memory machines map each MPI node to an OS process, which suffers serious performance degradation in the presence of multiprogramming, especially when a space/time sharing policy is employed in OS job scheduling. In this paper, we study compile-time and run-time support for MPI by using threads and demonstrate our optimization techniques for executing a large class of MPI programs written in C. The compile-time transformation adopts thread-specific data structures to eliminate the use of global and static variables in C code. The run-time support includes an efficient point-to-point communication protocol based on a novel lock-free queue management scheme. Our experiments on an SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and it has significant performance advantages with up to a 23-fold improvement in a multiprogrammed environment.}
}

@Article{kie99:mpi-collective,
author = {T. Kielmann and R. F. H. Hofman and H. E. Bal and A. Plaat and R. A. F. Bhoedjang},
title = {{MAGPIE: MPI}'s collective communication operations for clustered wide area systems},
journal = {ACM SIGPLAN Notices},
year = 1999,
volume = 34,
number = 8,
month = AUG,
pages = {131--140},
abstract = {Writing parallel applications for computational grids is a challenging task. To achieve good performance, algorithms designed for local area networks must be adapted to the differences in link speeds. An important class of algorithms are collective operations, such as broadcast and reduce. We have developed MAGPIE, a library of collective communication operations optimized for wide area systems. MAGPIE's algorithms send the minimal amount of data over the slow wide area links, and only incur a single wide area latency. Using our system, existing MPI applications can be run unmodified on geographically distributed systems. On moderate cluster sizes, using a wide area latency of 10 milliseconds and a bandwidth of 1 MByte/s, MAGPIE executes operations up to 10 times faster than MPICH, a widely used MPI implementation; application kernels improve by up to a factor of 4. Due to the structure of our algorithms, MAGPIE's advantage increases for higher wide area latencies.}
}


@Article{zhu99:mpi-app,
author = {W. J. Zhu and L. Petzold},
title = {Parallel sensitivity analysis for {DAE}s with many parameters},
journal = {Concurrency-Practice and Experience},
year = 1999,
volume = 11,
number = 10,
month = AUG,
pages = {571--585},
abstract = {In this paper, we discuss the parallel computation of the sensitivity analysis of systems of differential-algebraic equations (DAEs) with a moderate number of state variables and a large number of sensitivity parameters. Several parallel implementations based on DASSLSO are explored and their performance when using the Message Passing Interface (MPI) on an SGI Origin 2000 is compared.}
}

@Article{Sun99:mpi-perf,
author = {D. Sundaram-Stukel and M. K. Vernon},
title = {Predictive analysis of a wavefront application using {LogGP}},
journal = {ACM SIGPLAN Notices},
year = 1999,
volume = 34,
number = 8,
month = AUG,
pages = {141--150},
abstract = {This paper develops a highly accurate LogGP model of a complex wavefront application that uses MPI communication on the IBM SP/2. Key features of the model include: (1) elucidation of the principal wavefront synchronization structure, and (2) explicit high-fidelity models of the MPI-send and MPI-receive primitives. The MPI-send/receive models are used to derive L, o, and G from simple two-node micro-benchmarks. Other model parameters are obtained by measuring small application problem sizes on four SP nodes. Results show that the LogGP model predicts, in seconds and with a high degree of accuracy, measured application execution time for large problems running on 128 nodes. Detailed performance projections are provided for very large future processor configurations that are expected to be available to the application developers. These results indicate that scaling beyond one or two thousand nodes yields greatly diminished improvements in execution time, and that synchronization delays are a principal factor limiting the scalability of the application.}
}

@Article{kimura99:mpi-app,
author = {T. Kimura and H. Takemiya},
title = {Distributed parallel computing for fluid structure coupled simulations on a heterogeneous parallel computer cluster},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 4,
pages = {320--333},
abstract = {Distributed parallel computing for a fluid-structure coupled simulation has been performed on a heterogeneous parallel computer cluster. The fluid and the structure dynamics are simulated on different parallel computers connected by a high-speed local network. These dynamics are coupled by a loose coupling method exchanging the boundary data between the fluid and the structure domains through the network. The data communication among parallel computers is realized by using the new communication library, Stampi, which has been developed to enable communication in a heterogeneous environment. The performance evaluation on a heterogeneous parallel computer cluster has shown that the distributed parallel computing for fluid-structure coupled simulations has the advantage of increasing the performance compared with the parallel computing on a single parallel computer.}
}


@Article{morrow99:mpi-app,
author = {P. J. Morrow and D. Crookes and J. Brown and G. McAleese and D. Roantree and I. Spence},
title = {Efficient implementation of a portable parallel programming model for image processing},
journal = {Concurrency-Practice and Experience},
year = 1999,
volume = 11,
number = 11,
month = SEP,
pages = {671--685},
abstract = {This paper describes a domain specific programming model for execution on parallel and distributed architectures. The model has initially been targeted at the application area of image processing, though the techniques developed may be more generally applicable to other domains where an algebraic or library-based approach is common. Efficiency is achieved by the concept of a self-optimising class library of primitive image processing operations, which allows programs to be written in a high level, algebraic notation and which is automatically parallelised (using an application-specific data parallel approach). The class library is extended automatically with optimised operations, generated by a transformation system, giving improved execution performance. The parallel implementation of the model described here is based on MPI and has been tested on a C40 processor network, a quad-processor Unix workstation, and a network of PCs running Linux. Timings are included to indicate the impact of the automatic optimisation facility (rather than the effect of parallelisation).}
}


@Article{byrne:mpi-app,
author = {G. D. Byrne and A. C. Hindmarsh},
title = {{PVODE}, an {ODE} solver for parallel computers},
journal = {International Journal of High Performance Computing Applications},
year = 1999,
volume = 13,
number = 4,
pages = {354--365},
abstract = {PVODE is a general-purpose solver for ordinary differential equation (ODE) systems that implements methods for both stiff and nonstiff systems. The code is designed for single-program multiple-data environments. It is written in ANSI standard C, with a highly modular structure. The version being distributed uses the message-passing interface (MPI) system for communication. In the stiff case, PVODE uses a backward differentiation formula method combined with preconditioned GMRES iteration. Parallelism is achieved by distributing the ODE solution vector into user-specified segments and parallelizing a set of vector kernels accordingly. For PDE-based ODE systems, we provide a module that generates a band block-diagonal preconditioner for use with the GMRES iteration. We also provide a set of interfaces to accommodate Fortran applications. The paper includes a stiff example problem and test results on a Cray-T3D with three different message-passing systems. PVODE is publicly available.}
}


@Article{Coelho:mpi-app,
author = {P. J. Coelho},
title = {Parallel simulation of a utility boiler. Part {I}: Mathematical model and numerical solution method},
journal = {Communications in Numerical Methods in Engineering},
year = 1999,
volume = 15,
number = 10,
month = OCT,
pages = {717--726},
abstract = {A computer code for the modelling of turbulent reactive flows with heat transfer has been parallelized and applied to the simulation of a utility boiler. The code is based on the numerical solution of the density-weighted averaged form of the governing equations for mass, momentum and energy conservation, and transport equations for scalars associated with the turbulence and combustion models. The k-epsilon model and the chemical equilibrium approach are used. The turbulent fluctuations are accounted for in the calculation of the mean properties by means of a presumed joint probability density function for the mixture fraction and the fraction of radiative heat loss. The discrete ordinates method is used for radiation modelling. The governing equations are solved using the finite volume method. The parallelization is carried out using the domain decomposition approach and the message-passing MPI library. The paper is divided into two parts. This part is concerned with the description of the model and the parallel implementation, while the model evaluation and the analysis of the parallel performance are presented in Part II (pp. 727-736).}
}


@Article{Torres:mpi-app,
author = {D. J. Torres and E. A. Coutsias},
title = {Pseudospectral solution of the two-dimensional {N}avier-{S}tokes equations in a disk},
journal = {SIAM Journal on Scientific Computing},
year = 1999,
volume = 21,
number = 1,
month = SEP,
pages = {378--403},
abstract = {An efficient and accurate algorithm for solving the two-dimensional (2D) incompressible Navier-Stokes equations on a disk with no-slip boundary conditions is described. The vorticity-stream function formulation of these equations is used, and spatially the vorticity and stream functions are expressed as Fourier-Chebyshev expansions. The Poisson and Helmholtz equations which arise from the implicit-explicit time marching scheme are solved as banded systems using a post-conditioned spectral tau-method. The polar coordinate singularity is handled by expanding fields radially over the entire diameter using a parity modified Chebyshev series and building partial regularity into the vorticity. The no-slip boundary condition is enforced by transferring one of the two boundary conditions imposed on the stream function onto the vorticity via a solvability constraint. Significant gains in run times were realized by parallelizing the code in message passage interface (MPI).}
}


@Article{Ann99:mpi-app,
author = {V. Annamalai and C. S. Krishnamoorthy and V. Kamakoti},
title = {Adaptive finite element analysis on a parallel and distributed environment},
journal = {Parallel Computing},
year = 1999,
volume = 25,
number = 12,
month = NOV,
pages = {1413--1434},
abstract = {Industries in general and automotive industries in particular, use Finite Element Analysis (FEA) for better solutions to the engineering problems they encounter. The reliability of the Finite Element method can be improved to a larger extent by Adaptive Finite Element Analysis (AFEA). As we look towards increasingly accurate solutions, the process becomes computationally intensive and requires parallel and economic high-performance scientific computing environments to solve them. In this paper we present a parallel implementation of AFEA on a cluster of workstations and illustrate its efficiency and scalability with examples. In this process, we have developed a user-friendly environment for Parallel Distributed computing which is portable on top of both Parallel Virtual Machine (PVM) and Message Passing Interface (MPI) message passing layers. We have addressed the issues of the several stages in AFEA from a parallel computing perspective that includes Domain decomposition, Parallel Mesh generation, Parallel Finite Element Analysis using a Substructuring technique and Load balancing.}
}


@Article{Nagar99:mpi-impl,
author = {S. Nagar and A. Banerjee and A. Sivasubramaniam and C. R. Das},
title = {Alternatives to coscheduling a network of workstations},
journal = {Journal of Parallel and Distributed Computing},
year = 1999,
volume = 59,
number = 2,
month = NOV,
pages = {302--327},
abstract = {Efficient scheduling of processes on processors of a Network of Workstations (NOW) is essential for good system performance. However, the design of such schedulers is challenging because of the complex interaction between several system and workload parameters. Coscheduling, though desirable, is impractical for such a loosely coupled environment. Two operations, waiting for a message and arrival of a message, can be used to take remedial actions that can guide the behavior of the system toward coscheduling using local information. We present a taxonomy of three possibilities for each of these two operations, leading to a design space of 3x3 scheduling mechanisms. This paper presents an extensive implementation and evaluation exercise in studying these mechanisms. Adhering to the philosophy that scheduling and communication are intertwined and should be studied in conjunction, a complete communication substrate for UltraSPARC workstations, connected by Myrinet and running Solaris 2.5.1, has been developed. This platform provides the entire Message Passing Interface (MPI) to readily run off-the-shelf MPI applications by employing protected low-latency user-level messaging. Several applications can concurrently use this interface. This platform has been used to design, implement, and uniformly evaluate nine scheduling strategies with a mixture of concurrent real applications with varying communication intensities. This includes five new schemes (Periodic Boost, Periodic Boost with Spin Block, Spin Yield, Periodic Boost with Spin Yield, Dynamic Coscheduling with Spin Yield) that are presented in this paper. In addition to our evaluations of the pros and cons of each mechanism in terms of throughput, response time, CPU utilization, and Fairness, it is shown that Periodic Boost is a promising approach for scheduling processes on a NOW.}
}


@Article{Lappa99:mpi-app,
author = {M. Lappa and R. Savino},
title = {Parallel solution of three-dimensional {M}arangoni flow in liquid bridges},
journal = {International Journal for Numerical Methods in Fluids},
year = 1999,
volume = 31,
number = 6,
month = NOV,
pages = {911--935},
abstract = {This paper describes the implementation and performances of a parallel solver for the direct numerical simulation of the three-dimensional and time-dependent Navier-Stokes equations on distributed-memory, massively parallel computers. The feasibility of this approach to study Marangoni flow instability in half zone liquid bridges is examined. The results indicate that the incompressible, non-linear Navier-Stokes problem, governing the Marangoni flows behavior, can effectively be parallelized on a distributed memory parallel machine by remapping the distributed data structure. The numerical code is based on a three-dimensional Simplified Marker and Cell (SMAC) primitive variable method applied to a staggered finite difference grid. Using this method, the problem is split into two problems, one parabolic and the other elliptic. A parallel algorithm, explicit in time, is utilized to solve the parabolic equations. A parallel multisplitting kernel is introduced for the solution of the pseudo pressure elliptic equation, representing the most time-consuming part of the algorithm. A grid-partition strategy is used in the parallel implementations of both the parabolic equations and the multisplitting elliptic kernel. A Message Passing Interface (MPI) is coded for the boundary conditions; this protocol is portable to different systems supporting this interface for interprocessor communications. Numerical experiments illustrate good numerical properties and parallel efficiency. In particular, good scalability on a large number of processors can be achieved as long as the granularity of the parallel application is not too small. However, increasing the number of processors, the Speed-Up is ever smaller than the ideal linear Speed-Up. The communication timings indicate that complex practical calculations, such as the solutions of the Navier-Stokes equations for the numerical simulation of the instability of Marangoni flows, can be expected to run on a massively parallel machine with good efficiency.}
}

@Article{hill99:mpi-app,
author = {R. W. Hill and K. S. Ball},
title = {Parallel implementation of a {F}ourier-{C}hebyshev collocation method for incompressible fluid flow and heat transfer},
journal = {Numerical Heat Transfer Part B},
year = 1999,
volume = 36,
number = 3,
month = OCT # "--" # NOV,
pages = {309--329},
abstract = {A Fourier-Chebyshev collocation spectral method is parallelized to simulate the three-dimensional unsteady flow and heat transfer inside a cylindrical enclosure. Two solution approaches using different techniques for determining the pressure field and enforcing mass conservation are presented for shared memory applications using Cray directives and for distributed memory applications using MPI and SHMEM message passing libraries. Matrix diagonalization is employed for solving the pressure Poisson equation and Helmholtz equations for the velocity components and temperature. The parallelization approach is described and scaling results are presented for both platform types.}
}


@Article{poggi:mpi-extension,
author = {A. Poggi and G. Destri},
title = {{MPOOL}: an object-oriented library for task composition and co-ordination},
journal = {Concurrency: Practice and Experience},
year = 1999,
volume = 11,
number = 14,
month = DEC,
pages = {835--848},
abstract = {MPOOL is an object-oriented extension to the MPI library, based on three categories of objects, called units, groups and schemes. Units are active objects composed of data (state) and procedures (like traditional passive objects), but with the additional ability to store incoming messages in a queue while they are active and to send messages in parallel to other units; moreover, different units may be active simultaneously. Groups and schemes are passive objects used for the composition of units and the co-ordination of their actions. Groups manage collective communications and synchronization operations such as barriers. Schemes compose units' actions through the use of a set of constructs derived by path expressions.}
}

@Article{sel99:mpi-app,
author = {P. M. Selwood and M. Berzins},
title = {Parallel unstructured tetrahedral mesh adaptation: algorithms, implementation and scalability},
journal = {Concurrency: Practice and Experience},
year = 1999,
volume = 11,
number = 14,
month = DEC,
pages = {863--884},
abstract = {The use of unstructured adaptive tetrahedral meshes in the solution of transient flows poses a challenge for parallel computing due to the irregular and frequently changing nature of the data and its distribution. A parallel mesh adaptation algorithm, PTETRAD, for unstructured tetrahedral meshes (based on the serial code TETRAD) is described and analysed. The portable implementation of the parallel code in C with MPI is described and discussed. The scalability of the code is considered, analysed and illustrated by numerical experiments using a shock wave diffraction problem.}
}

@Article{meme:mpi-graphics-app,
author = {D. Meneveaux and K. Bouatouch},
title = {Synchronisation and load balancing for parallel hierarchical radiosity of complex scenes on a heterogeneous computer network},
journal = {Computer Graphics Forum},
year = 1999,
volume = 18,
number = 4,
month = DEC,
pages = {201--212},
abstract = {In this paper we propose a SPMD parallel hierarchical radiosity algorithm relying on a novel partitioning method which may apply to any kind of architectural scene. This algorithm is based on MPI (Message Passing Interface), a communication library which allows the use of either a heterogeneous set of concurrent computers or a parallel computer or both. The database is stored on a common directory and accessed by all the processors (through NFS in case of a network of computers). As the objective is to handle complex scenes such as building interiors, to cope with the problem of memory size, only a subset of the database resides in memory of each processor. This subset is determined with the help of a partitioning into 3D cells, clustering and visibility calculations. A graph expressing visibility between the resulting clusters is determined, partitioned (with a new method based on classification of K-means type) and distributed amongst all the processors. Each processor is responsible for gathering energy (using the Gauss-Seidel method) only for its subset of clusters. In order to reduce the disk transfers due to downloading these subsets of clusters, we use an ordering strategy based on the traveling salesman algorithm. Dynamic load balancing relies on a task stealing approach while termination is detected by configuring the processors into a ring and moving a token around this ring. The parallel iterative resolution is of group iterative type. Its mathematical convergence is proven in the appendix.}
}

% NOTE(review): year set to 2000 to agree with the citation key and the
% cover date of volume 181; confirm against the publisher record.
@Article{bova2000:mpi-app,
author = {S. W. Bova and G. F. Carey},
title = {A distributed memory parallel element-by-element scheme for semiconductor device simulation},
journal = {Computer Methods in Applied Mechanics and Engineering},
year = 2000,
volume = 181,
number = 4,
pages = {403--423},
abstract = {A domain decomposition and parallel element-by-element (EBE) scheme is developed for semiconductor device simulation modeled by the drift-diffusion (DD) equations. A classical Gummel iterative decoupling of the potential and carrier transport equations is applied on an unstructured triangulation. The distributed memory EBE scheme is formulated for a Galerkin finite element approximation of the nonlinear Poisson problem, and a modified Scharfetter-Gummel method is used for the carrier transport problem. The resulting sequences of symmetric and nonsymmetric linear systems are solved via preconditioned Krylov methods. Unstructured triangular grids are used to permit grading of the mesh, which is then partitioned to processor subdomains with appropriate data structures for message passing. Details of the parallel algorithm and data structure are provided. The scheme is implemented in Fortran90 with MPI and performance results are presented for a representative MOSFET on an IBM SP, a CRAY T3E, and an SGI/CRAY Origin2000.}
}

@Article{bova2000:mpi-openmp-app,
author = {S. W. Bova and C. P. Breshears and C. E. Cuicchi and Z. Demirbilek and H. A. Gabb},
title = {Dual-level parallel analysis of harbor wave response using {MPI} and {OpenMP}},
journal = {International Journal of High Performance Computing Applications},
year = 2000,
volume = 14,
number = 1,
pages = {49--64},
abstract = {The authors describe their experiences converting an existing serial production code to a parallel code combining both MPI and OpenMP. Such dual-level parallel codes will be able to take full advantage of the emerging class of high performance computer architectures using small clusters of shared-memory processors connected via a message-passing network. While the focus is restricted to a harbor response simulation code, the techniques presented herein are appropriate for a broad class of applications that explore a parameter space. The code modifications reduced the execution time of one test case from 3100 minutes on a single CPU to just over 12 minutes on 256 CPUs. Results demonstrate that dual-level parallelism allows substantial increases in model resolution combined with improvements in simulation turnaround time but, contrary to conventional wisdom, requires very little source code alteration.}
}

@Article{park99:mpi-app,
author = {N. Park and V. K. Prasanna and C. S. Raghavendra},
title = {Efficient algorithms for block-cyclic array redistribution between processor sets},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 1999,
volume = 10,
number = 12,
month = DEC,
pages = {1217--1240},
abstract = {Run-time array redistribution is necessary to enhance the performance of parallel programs on distributed memory supercomputers. In this paper, we present an efficient algorithm for array redistribution from cyclic(x) on P processors to cyclic(Kx) on Q processors. The algorithm reduces the overall time for communication by considering the data transfer, communication schedule, and index computation costs. The proposed algorithm is based on a generalized circulant matrix formalism. Our algorithm generates a schedule that minimizes the number of communication steps and eliminates node contention in each communication step. The network bandwidth is fully utilized by ensuring that equal-sized messages are transferred in each communication step. Furthermore, the time to compute the schedule and the index sets is significantly smaller. It takes O(max(P, Q)) time and is less than 1 percent of the data transfer time. In comparison, the schedule computation time using the state-of-the-art scheme (which is based on the bipartite matching scheme) is 10 to 50 percent of the data transfer time for similar problem sizes. Therefore, our proposed algorithm is suitable for run-time array redistribution. To evaluate the performance of our scheme, we have implemented the algorithm using C and MPI on an IBM SP2. Results show that our algorithm performs better than the previous algorithms with respect to the total redistribution time, which includes the time for data transfer, schedule, and index computation.}
}

@Article{dan00:mpi-app,
author = {K. T. Danielson and S. Hao and W. K. Liu and R. A. Uras and S. F. Li},
title = {Parallel computation of meshless methods for explicit dynamic analysis},
journal = {International Journal for Numerical Methods in Engineering},
year = 2000,
volume = 47,
number = 7,
month = MAR,
pages = {1323--1341},
abstract = {A parallel computational implementation of modern meshless methods is presented for explicit dynamic analysis. The procedures are demonstrated by application of the Reproducing Kernel Particle Method (RKPM). Aspects of a coarse grain parallel paradigm are detailed for a Lagrangian formulation using model partitioning. Integration points are uniquely defined on separate processors and particle definitions are duplicated, as necessary, so that all support particles for each point are defined locally on the corresponding processor. Several partitioning schemes are considered and a reduced graph-based procedure is presented. Partitioning issues are discussed and procedures to accommodate essential boundary conditions in parallel are presented. Explicit MPI message passing statements are used for all communications among partitions on different processors. The effectiveness of the procedure is demonstrated by highly deformable inelastic example problems.}
}

@Article{mar00:mpi-app,
author = {N. Marco and S. Lanteri},
title = {A two-level parallelization strategy for Genetic Algorithms applied to optimum shape design},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = 4,
month = MAR,
pages = {377--397},
abstract = {This paper presents a two-level strategy for the parallelization of a Genetic Algorithm (GA) coupled to a compressible flow solver designed on unstructured triangular meshes. The parallel implementation is based on MPI and makes use of the process group features of this environment. The resulting algorithm is used for the optimum shape design of aerodynamic configurations. Numerical and performance results are presented for the optimization of two-dimensional airfoils for calculations performed on the following systems: an SGI Origin 2000 and an IBM SP-2 MIMD systems; a Pentium Pro (P6/200 MHz) cluster where the interconnection is realized through a FastEthernet (100 Mbits/s) switch.}
}


@Article{An00:mpi-app,
author = {R. E. Ansorge and T. A. Carpenter and L. D. Hall and N. R. Shaw and G. B. Williams},
title = {Use of parallel supercomputing to design magnetic resonance systems},
journal = {IEEE Transactions on Applied Superconductivity},
year = 2000,
volume = 10,
number = 1,
month = MAR,
pages = {1368--1371},
abstract = {Historically analytical methods have been the preferred approach to designing magnets and gradient sets for magnetic resonance systems. Such methods are computationally efficient but are approximate, particularly away from the axis of symmetry. Alternative methods, which are much more computationally intensive, for example Genetic Algorithms, are now becoming practical. Such methods have the advantage that they can be used for unconventional designs and for the inclusion of nonanalytical design constraints such as real-world engineering and cost limitations. Gradient coil designs have been published previously [1]-[3]. Now with the availability of more powerful computers, more ambitious designs can be undertaken using parallel computing methods. The use of a Hitachi SR2201 supercomputer and clusters of Linux PCs (Beowulf) to develop a short whole body MRI magnet for clinical applications are reported on. An important feature of these computer codes is that they have been developed to run on parallel computing systems using the MPI message passing standard. MPI is an accepted industry standard, which means that these codes can readily be ported to different parallel computers. Previous success has been achieved in using MPI for a variety of other Medical Imaging problems [4].}
}

% meetingloc is not a standard BibTeX field; standard styles ignore it.
@InProceedings{cle95:mpi-debugging,
author = {C. Cl{\'e}men{\c{c}}on and J. Fritscher and M. J. Meehan and R. R{\"u}hl},
title = {An Implementation of Race Detection and Deterministic Replay with {MPI}},
booktitle = {Proceedings of Euro-Par'95},
number = 966,
series = {LNCS},
year = 1995,
publisher = {Springer-Verlag},
month = AUG,
pages = {155--166},
meetingloc = {Stockholm, Sweden}
}


@Article{danad00:mpi-app,
author = {K. T. Danielson and M. D. Adley},
title = {A meshless treatment of three-dimensional penetrator targets for parallel computation},
journal = {Computational Mechanics},
year = 2000,
volume = 25,
number = 3,
month = MAR,
pages = {267--273},
abstract = {A meshless modeling procedure of three-dimensional targets for penetration analysis on parallel computing systems is described. Buried structures are modeled by arbitrary layers of concrete and geologic materials, and the projectile is modeled by standard finite elements. Penetration resistance of the buried structure is provided by functions derived from principles of dynamic cavity expansion. The resistance functions are influenced by the target material properties and projectile kinematics. Additional capabilities accommodate the varying structural and geometrical characteristics of the target. Coupling between the finite elements and the meshless target model is made by applying resistance loads to elements on the outer surface of the projectile mesh. Penetration experiments verify the approach. In this manner, the target is effectively modeled and the strategy is well suited for parallel processing. The procedure is incorporated into an explicit transient dynamics code, using mesh partitioning for a coarse grain parallel processing paradigm. Message Passing Interface (MPI) is used for all interprocessor communication. Large detailed finite element analyses of projectiles are performed on up to several hundred processors with excellent scalability. The efficiency of the strategy is demonstrated by analyses executed on several types of scalable computing platforms.}
}

@Article{kim00:mpi-app,
author = {S. Kim},
title = {Lattice {QCD} on a {B}eowulf cluster},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 2000,
volume = 83,
number = 4,
month = APR,
pages = {807--809},
abstract = {Using commodity component personal computers based on Alpha processor and commodity network devices and a switch, we built an 8-node parallel computer. GNU/Linux is chosen as an operating system and message passing libraries such as PVM, LAM, and MPICH have been tested as a parallel programming environment. We discuss our lattice QCD project for a heavy quark system on this computer.}
}

@Article{wat00:mpi-app,
author = {N. Watari and S. Ohnishi and H. Onishi and Y. Iwasawa},
title = {Total energy estimation for {Pd/Al} bimetallic surfaces by a parallel computation scheme},
journal = {Japanese Journal of Applied Physics Part 1---Regular Papers Short Notes \& Review Papers},
year = 2000,
volume = 39,
number = {3A},
month = MAR,
pages = {1457--1461},
abstract = {A numerical calculation scheme for the multicenter problem in large molecules and clusters is presented by applying the message-passing interface (MPI) in a massively parallel computer that uses the density functional method. The multicenter problem associated with the Coulomb singularity of an atom is efficiently treated by the parallel processors by allocating several atoms into each processor element (PE). The order $N^2/P$ tuning is obtained for the Coulomb energy calculation by using the MPI which transfers Coulomb potential field between PE's. This method is applied to estimate the total energy of the reconstructed Al/Pd bimetallic surface. The energy estimation by the charge density of a superposition of isolated atomic charge fragments predicts a stabilization caused by the reconstruction, being consistent with a self-consistent-field (SCF) cluster calculation of the bimetallic surface.}
}

@Article{rod00:mpi-model,
author = {C. Rodriguez and J. L. Roda and F. Sande and D. G. Morales and F. Almeida},
title = {A new parallel model for the analysis of asynchronous algorithms},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = 6,
month = MAY,
pages = {753--767},
abstract = {The BSP model barrier synchronization imposes some limits both in the range of available algorithms and also in their performance. Although BSP programs can be translated to MPI/PVM programs, the counterpart is not true. The asynchronous nature of some MPI/PVM programs does not easily fit inside the BSP model. Through the suppression of barriers and the generalization of the concept of superstep we propose two new models, the BSP-like and the BSP without barriers (BSPWB) models. While the BSP-like extends the BSP* model to programs written using collective operations, the more general BSPWB model admits the MPI/PVM parallel asynchronous programming style. The parameters of the models and their quality are evaluated on four standard parallel platforms: the Cray T3E, the IBM SP2, the Origin 2000 and the Digital Alpha Server 8400. The study shows that the time spent in an h-relation is more independent on the number of processors than on the communication pattern. We illustrate the use of these BSP extensions through two problem-solving paradigms: the Nested Parallel Recursive Divide and Conquer Paradigm and the Virtual Pipeline Dynamic Programming Paradigm. The proposed paradigms explain how nested parallelism and processor virtualization can be introduced in MPI and PVM without having any negative impact in the performance and model accuracy. The prediction of the communication times is robust even for problems, where communication is dominated by small messages.}
}

@Article{Lie00:mpi-app,
author = {C. C. Liew and T. Ikeshoji and N. Saito and H. Inomata},
title = {Domain-shifting algorithm: A new domain-decomposition scheme for molecular dynamics simulations on parallel computers},
journal = {Progress of Theoretical Physics Supplement},
year = 2000,
number = 138,
pages = {205--210},
abstract = {A domain is conventionally defined as a stationary sub-region of the simulated system in a domain-decomposition scheme for molecular dynamics (MD) simulations on parallel computers. We proposed an algorithm where all domains pre-assigned to processors are shifted to a particular direction, beyond the displacement of particles in the system during a time-step or a period of small time-steps; as a result, it allows us to reduce the data transfer partners in the particle re-allocation procedure. We also proposed a systematic link-cell method that allows us to make use of small domain and reduces the amount of data to be transferred for updating the positions and forces of particles, in comparison to the conventional schemes. Benchmark studies of a three-dimensional Lennard-Jones system have been carried out using a parallel MD simulation program implemented via an MPI-based message-passing interface on several parallel computers. A result on a 16-CPU parallel computer system shows that the new scheme allows us to achieve a high parallel efficiency (over 75\%) for MD simulations of a system with relatively small number of particles per processor (N/P $<$ 500).}
}


@Article{decyk00:mpi-app,
author = {V. K. Decyk and D. E. Dauger and P. R. Kokelaar},
title = {Plasma physics calculations on a parallel {M}acintosh cluster},
journal = {Physica Scripta},
year = 2000,
volume = {T84},
pages = {85--88},
abstract = {We have constructed a parallel cluster consisting of 16 Apple Macintosh G3 computers running the MacOS, and achieved very good performance on numerically intensive, parallel plasma particle-in-cell simulations. A subset of the MPI message-passing library was implemented in Fortran77 and C. This library enabled us to port code, without modification, from other parallel processors to the Macintosh cluster. For large problems where message packets are large and relatively few in number, performance of 50-150 MFlops/node is possible, depending on the problem. This is fast enough that 3D calculations can be routinely done. Unlike Unix-based clusters, no special expertise in operating systems is required to build and run the cluster. Full details are available on our web site: http://exodus.physics.ucla.edu/appleseed/.}
}

@Article{ma99:mpi-app,
author = {S. B. Ma},
title = {Comparisons of the parallel preconditioners on the {CRAY-T3E} for large nonsymmetric linear systems},
journal = {International Journal of High Speed Computing},
year = 1999,
volume = 10,
number = 3,
month = SEP,
pages = {285--300},
abstract = {In this paper we consider five types of parallel preconditioners for solving large sparse nonsymmetric linear systems on the CRAY-T3E. They are ILU(0) in the wavefront ordering, ILU(0) in the multi-coloring ordering, SSOR in the wavefront ordering, the SPAI (SParse Approximate Inverse) preconditioner, and finally Multi-color Block SOR preconditioner. The ILU(0) is known to be robust and the wavefront ordering naturally exploits the parallelism but has a limited speedup due to the nonuniform lengths of the wavefronts. Multi-coloring is an efficient way of introducing the parallelism of order(N), where N is the order of the matrix but the convergence rate often deteriorates. The SPAI type preconditioner is inherently parallel and is gaining popularity. Finally, for the 5-point Laplacian matrix SOR method is known to have a nondeteriorating rate of convergence when the multi-coloring order is adopted. Also, Block SOR is expected to incur less communication overheads in a message-passing machine. Hence, Multi-Color Block SOR method is expected to have a good performance. Experiments were conducted for the Finite Difference discretizations of two problems with various mesh sizes varying up to 1024 x 1024. MPI library was used for interprocess communications. The results show that ILU(0) in the multi-coloring ordering gives the best performance.}
}

@Article{pra00:mpi-sim,
author = {S. Prakash and E. Deelman and R. Bagrodia},
title = {Asynchronous parallel simulation of parallel programs},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 5,
month = MAY,
pages = {385--400},
abstract = {Parallel simulation of parallel programs for large datasets has been shown to offer significant reduction in the execution time of many discrete event models. This paper describes the design and implementation of MPI-SIM, a library for the execution driven parallel simulation of task and data parallel programs. MPI-SIM can be used to predict the performance of existing programs written using MPI for message-passing, or written in UC, a data parallel language, compiled to use message-passing. The simulation models can be executed sequentially or in parallel. Parallel execution of the models are synchronized using a set of asynchronous conservative protocols. This paper demonstrates how protocol performance is improved by the use of application-level, runtime analysis. The analysis targets the communication patterns of the application. We show the application-level analysis for message passing and data parallel languages. We present the validation and performance results for the simulator for a set of applications that include the NAS Parallel Benchmark suite. The application-level optimization described in this paper yielded significant performance improvements in the simulation of parallel programs, and in some cases completely eliminated the synchronizations in the parallel execution of the simulation model.}
}

@Article{gram00:mpi-alg,
author = {M. D. Grammatikakis and S. Liesche},
title = {Priority queues and sorting methods for parallel simulation},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 5,
month = MAY,
pages = {401--422},
abstract = {We examine the design, implementation, and experimental analysis of parallel priority queues for device and network simulation. We consider: (1) distributed splay trees using MPI, (2) concurrent heaps using shared memory atomic locks, and (3) a new, more general concurrent data structure based on distributed sorted lists, which is designed to provide dynamically balanced work allocation (with automatic or manual control) and efficient use of shared memory resources. We evaluate performance for all three data structures on a Cray-T3E900 system at KFA-Julich. Our comparisons are based on simulations of single buffers and a 64 x 64 packet switch which supports multicasting. In all implementations, PEs monitor traffic at their preassigned input/output ports, while priority queue elements are distributed across the Cray-T3E virtual shared memory. Our experiments with up to 60,000 packets and two to 64 PEs indicate that concurrent priority queues perform much better than distributed ones. Both concurrent implementations have comparable performance, while our new data structure uses less memory and has been further optimized. We also consider parallel simulation for symmetric networks by sorting integer conflict functions and implementing an interesting packet indexing scheme. The optimized message passing network simulator can process $\sim$500K packet moves in one second, with an efficiency that exceeds $\sim$50 percent for a few thousands packets on the Cray-T3E with 32 PEs. All developed data structures now form a parallel library. Although our concurrent implementations use the Cray-T3E ShMem library, portability can be derived from Open-MP or MPI-2 standard libraries, which will provide support for one-way communication and shared memory lock mechanisms.}
}

@Article{bad00:mpi-app,
author = {S. B. Baden and S. J. Fink},
title = {A programming methodology for dual-tier multicomputers},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 3,
month = MAR,
pages = {212--226},
abstract = {Hierarchically organized ensembles of shared memory multiprocessors possess a richer and more complex model of locality than previous generation multicomputers with single processor nodes. These dual-tier computers introduce many new factors into the programmer's performance model. We present a methodology for implementing block-structured numerical applications on dual-tier computers and a run-time infrastructure, called KeLP2, that implements the methodology. KeLP2 supports two levels of locality and parallelism via hierarchical SPMD control flow, run-time geometric meta-data, and asynchronous collective communication. KeLP applications can effectively overlap communication with computation under conditions where nonblocking point-to-point message passing fails to do so. KeLP's abstractions hide considerable detail without sacrificing performance and dual-tier applications written in KeLP consistently outperform equivalent single-tier implementations written in MPI. We describe the KeLP2 model and show how it facilitates the implementation of five block-structured applications specially formulated to hide communication latency on dual-tiered architectures. We support our arguments with empirical data from applications running on various single- and dual-tier multicomputers. KeLP2 supports a migration path from single-tier to dual-tier platforms and we illustrate this capability with a detailed programming example.}
}

@Article{gor00:mpi-theory,
author = {S. Gorlatch},
title = {Toward formally-based design of message passing programs},
journal = {IEEE Transactions on Software Engineering},
year = 2000,
volume = 26,
number = 3,
month = MAR,
pages = {276--288},
abstract = {We present a systematic approach to the development of message passing programs. Our programming model is SPMD, with communications restricted to collective operations: scan, reduction, gather, etc. The design process in such an architecture-independent language is based on correctness-preserving transformation rules, provable in a formal functional framework. We develop a set of design rules for composition and decomposition. For example, scan followed by reduction is replaced by a single reduction, and global reduction is decomposed into two faster operations. The impact of the design rules on the target performance is estimated analytically and tested in machine experiments. As a case study, we design two provably correct, efficient programs using the Message Passing Interface (MPI) for the famous maximum segment sum problem, starting from an intuitive, but inefficient, algorithm specification.}
}

@Article{hos00:mpi-app,
author = {A. Hossinger and E. Langer and S. Selberherr},
title = {Parallelization of a {M}onte {C}arlo ion implantation simulator},
journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
year = 2000,
volume = 19,
number = 5,
month = MAY,
pages = {560--567},
abstract = {We present a parallelization method based on message passing interface (MPI) for a Monte Carlo program for two-dimensional and three-dimensional (3-D) simulation of ion implantations. We use a master-slave strategy where the master process synchronizes the slaves and performs the input-output operations, while the slaves perform the physical simulation. For this method the simulation domain is geometrically distributed among several CPU's which have to exchange only very little information during the simulation. Thereby, the communication overhead between the CPU's is kept so low that it has almost no influence on the performance gain even if a standard network of workstations is used instead of a massively parallel computer to perform the simulation. We have optimized the performance gain by identifying bottlenecks of this strategy when it is applied to arbitrary geometries consisting of various materials. This requires the application of different physical models within the simulation domain and makes it impossible to determine a reasonable domain distribution before starting the simulation. Due to a feedback between master and slaves by on-line performance measurements, we obtain an almost linear performance gain on a cluster of workstations with just slightly varying processor loads. Besides the increase in performance, the parallelization method also achieves a distribution of the required memory. This allows 3-D simulations on a cluster of workstations, where each single machine would not have enough memory to perform the simulation on its own.}
}


@Article{lee00:mpi-app,
author = {J. Y. Lee and J. Pillardy and C. Czaplewski and Y. Arnautova and D. R. Ripoll and A. Liwo and K. D. Gibson and R. J. Wawak and H. A. Scheraga},
title = {Efficient parallel algorithms in global optimization of potential energy functions for peptides, proteins, and crystals},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {399--411},
abstract = {Global optimization is playing an increasing role in physics, chemistry, and biophysical chemistry. One of the most important applications of global optimization is to find the global minima of the potential energy of molecules or molecular assemblies, such as crystals. The solution of this problem typically requires huge computational effort. Even the fastest processor available is not fast enough to carry out this kind of computation in real time for the problems of real interest, e.g., protein and crystal structure prediction. One way to circumvent this problem is to take advantage of massively parallel computing. In this paper, we provide several examples of parallel implementations of global optimization algorithms developed in our laboratory. All of these examples follow the master/worker approach. Most of the methods are parallelized on the algorithmic (coarse-grain) level and one example of fine-grain parallelism is given, in which the function evaluation itself is computationally expensive. All parallel algorithms were initially implemented on an IBM/SP2 (distributed-memory) machine. In all cases, however, message passing is handled through the standard Message Passing Interface (MPI); consequently the algorithms can also be implemented on any distributed- or shared-memory system that runs MPI. The efficiency of these implementations is discussed.}
}


@Article{Sri00:mpi-app,
author = {J. Srinivasan and Y. L. Volobuev and S. L. Mielke and D. G. Truhlar},
title = {Parallel {F}ourier Path-integral {M}onte {C}arlo calculations of absolute free energies and chemical equilibria},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {446--464},
abstract = {We present a parallel implementation of the Fourier Path Integral Monte Carlo method for calculating the absolute free energies of many-body systems. The implementation adopts the message-passing paradigm for parallelization with the use of the Message Passing Interface (MPI) libraries. A portable computer program, written using Fortran 90, has been developed and tested on a variety of platforms such as the SGI Origin, the IBM SP, and the Cray T3D and T3E. We have used the program to demonstrate the efficacy of importance sampling in configuration space. We have also used the program to calculate the partition function, and hence the absolute free energies, of triatomic molecules and four-body systems.}
}

@article{pra99:mpi-app,
  author   = {B. Prameela and L. M. Patnaik},
  title    = {Parallel implementation of alternate quadrant interlocking factorisation method on star topology},
  journal  = {International Journal of High Speed Computing},
  volume   = {10},
  number   = {4},
  pages    = {361--378},
  month    = dec,
  year     = {1999},
  abstract = {This paper discusses the parallel implementation of the solution of a set of linear equations using the Alternative Quadrant Interlocking Factorisation Methods (AQIF), on a star topology. Both the AQIF and LU decomposition methods are mapped onto star topology on an IBM SP2 system, with MPI as the internode communicator. Performance parameters such as speedup, efficiency have been obtained through experimental and theoretical means. The studies demonstrate (i) a mismatch of 15\% between the theoretical and experimental results, (ii) scalability of the AQIF algorithm, and (iii) faster executing AQIF algorithm.},
}


@Article{Roy00:mpi-app,
author = {S. Roy and R. Y. Jin and V. Chaudhary and W. L. Hase},
title = {Parallel molecular dynamics simulations of alkane/hydroxylated alpha-aluminum oxide interfaces},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {210--218},
abstract = {In this paper we describe a practical implementation of parallel computation for the molecular dynamics (MD) simulation of an alkane/aluminum oxide interface. A serial MD program was converted into a parallel code utilizing the message passing interface (MPI). This code was evaluated on a twelve processor symmetrical multiprocessor as well as on a cluster of four processor SMPs. A maximum speedup of 5.25 was achieved with twelve processors on the large shared memory machine. The cluster performance saturated at a speedup of 4.5 with two nodes. High communication costs and considerable load imbalance in the system were identified as areas that need further investigation for obtaining better performance.}
}


@Article{fur00:mpi-app,
author = {T. R. Furlani and J. Kong and P. M. W. Gill},
title = {Parallelization of {SCF} calculations within {Q-Chem}},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {170--177},
abstract = {We have incorporated MPI based parallelism with dynamic load balance into the Hartree-Fock and DFT modules of Q-Chem. A series of benchmark calculations consisting of both single point energy and gradient calculations were carried out to gauge the performance of the parallel modules. Calculations were carried out on two different parallel computers, namely a shared memory Silicon Graphics Origin2000 and a distributed memory Cray T3E, to show the flexibility of the code and demonstrate the great utility of MPI. Scalability for the DFT and Hartree-Fock modules is demonstrated for up to 64 processors.}
}
@Article{Fle00:mpi-app,
author = {G. D. Fletcher and M. W. Schmidt and M. S. Gordon},
title = {The Distributed Data Interface in {GAMESS}},
journal = {Computer Physics Communications},
year = 2000,
volume = 128,
number = {1--2},
month = JUN,
pages = {190--200},
abstract = {The Distributed Data Interface to permit storage of large data arrays in the aggregate memory of distributed memory, message passing computer systems is described. The design of this relatively small library is discussed, in regard to its implementation over SHMEM, MPI-1, or socket based message libraries. The good performance of a MP2 program using DDI is demonstrated on both PC and workstation cluster computers, and some details of the resulting message traffic are presented.}
}


@Article{She00:mpi-app,
author = {A. I. Shestakov and M. K. Prasad and J. L. Milovich and N. A. Gentile and J. F. Painter and G. Furnish},
title = {The radiation-hydrodynamic {ICF3D} code},
journal = {Computer Methods in Applied Mechanics and Engineering},
year = 2000,
volume = 187,
number = {1--2},
pages = {181--200},
abstract = {We describe the 3D high temperature plasma simulation computer code ICF3D which is being developed at the Lawrence Livermore National Laboratory. The code is portable; it runs on a variety of platforms: uniprocessors, SMPs, and MPPs. It parallelizes by decomposing physical space into disjoint subdomains and relies on message passing libraries such as MPI. ICF3D is written in the object oriented programming language C++. The mesh is unstructured and consists of a collection of hexahedra, prisms, pyramids, and/or tetrahedra. The hydrodynamics is modeled by the discontinuous finite element method which allows a natural representation of inherently discontinuous phenomena such as shocks. Continuous processes such as diffusion are modeled by conventional finite element methods. ICF3D is modular and consists of separate equation-of-state, hydrodynamic, heat conduction, and multi-group radiation transport (diffusion approximation) packages. We present results on problems relevant to Inertial Confinement Fusion which are obtained on a variety of computers, uniprocessors and MPPs.}
}

% Thanks to Jesper Larsson Traeff of CCRL NEC for the following
%
% Design
%
@inproceedings{Hempel94,
author = {Hempel, Rolf},
title = "The {MPI} Standard for Message Passing",
booktitle = "High-Performance Computing and Networking, International Conference and Exhibition, Proceedings, Volume II: Networking and Tools",
editor = {Gentzsch, Wolfgang and Harms, Uwe},
publisher = sv,
series = lncs,
volume = 797,
pages = {247--252},
year = 1994
}

@inproceedings{Hempel94:uberblick,
author = "Hempel, Rolf",
title = "Der {M}essage {P}assing {I}nterface~--~{S}tandard: ein {{\"U}}berblick",
booktitle = "Praxisorientierte {P}arallelverarbeitung, {B}eitr{{\"a}}ge zum 3. {W}orkshop {{\"u}}ber {W}issenschaftliches Rechnen, {S}chwerpunkt {P}raxisorientierte {P}arallelverarbeitung",
editor = "Horst Langend{{\"o}}rfer",
publisher = "Carl {H}anser {V}erlag",
address = "Braunschweig, Germany",
year = 1994
}

@inproceedings{Hempel96,
author = {Rolf Hempel},
title = "The Status of the {MPI} Message-Passing Standard and Its Relation to {PVM}",
booktitle = "{P}arallel {V}irtual {M}achine -- {E}uro{PVM}'96",
editor = "Bode, Arndt and Dongarra, Jack and Ludwig, Thomas and Sunderam, Vaidy",
publisher = sv,
series = lncs,
volume = 1156,
pages = {14--21},
year = 1996
}

@Article{HempelWalker99,
Author = {Rolf Hempel and David W. Walker},
Title = "The Emergence of the {MPI} Message Passing Standard for Parallel Computing",
Journal = "{C}omputer {S}tandards \& {I}nterfaces",
Publisher = {Elsevier Science},
volume = 21,
year = 1999,
Pages = {51--62}
}

%Implementation
%==============
%
%SX
%--

@inproceedings{Hempel96:mpisx,
author = "Hempel, Rolf",
title = "The {MPI} Message-Passing Standard and its Implementation on the {NEC SX-4}",
booktitle = "Proceedings of the {NEC HPC} Workshop",
editor = "Doi, Shun",
address = "Tokyo, Japan",
year = "1996"
}

@inproceedings{HempelRitzdorfZimmermann97,
Author = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Implementation of {MPI} on {NEC}'s {SX-4} Multi-Node Architecture",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 4th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = 1332,
Year = 1997,
Pages = {185--193},
}

@Article{HempelRitzdorfZimmermann98,
Author = {Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Efficient Message Passing Interface Implementations for {NEC} Parallel Computers",
Journal = {{NEC} Research \& Development},
Volume = 39,
Number = 4,
Year = 1998,
Pages = {408--413}
}

@inproceedings{TraffHempelRitzdorfZimmermann99,
Author = {Jesper Larsson Tr{\"{a}}ff and Rolf Hempel and Hubert Ritzdorf and Falk Zimmermann},
Title = "Flattening on the fly: efficient handling of {MPI} derived datatypes",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 6th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = 1697,
Year = 1999
}

%Cluster etc.
%------------

@inproceedings{GolebiewskiBaumHempel99,
Author = {\fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Markus Baum and Rolf Hempel},
Title = "High Performance Implementation of {MPI} for {Myrinet}",
Booktitle = {Parallel Computation. 4th International Conference of the {ACPC}},
publisher = sv,
Series = lncs,
Volume = 1557,
Year = 1999,
Pages = {510--521}
}

@inproceedings{GolebiewskiHempelTraff99,
Author = {\fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
Title = "Algorithms for collective communication operations on {SMP} clusters",
Booktitle = {The 1999 Workshop on Cluster-Based Computing held in conjunction with 13th {ACM-SIGARCH} International Conference on Supercomputing {(ICS'99)}},
Pages = {11--15},
Year = 1999
}

@inproceedings{BaumGolebiewskiHempelTraff99,
Author = {Markus Baum and \fontencoding{T1}\selectfont Maciej {Go\symbol{"AA}\symbol{"A6}biewski} and Rolf Hempel and Jesper Larsson Tr{\"{a}}ff},
Title = "Dual-device {MPI} Implementation for {PC} Clusters with {SMP} Nodes",
Booktitle = {{MPIDC'99} Message Passing Interface Developer's and User's Conference Journal of Papers and Presentations},
Pages = {53--60},
Year = 1999
}

@inproceedings{GolebiewskiBasermannBaumHempelRitzdorfTraff99,
Author = {\fontencoding{T1}\selectfont M. {Go\symbol{"AA}\symbol{"A6}biewski} and A. Basermann and M. Baum and R. Hempel and H. Ritzdorf and J. L. Tr{\"{a}}ff},
Title = "A {PC} Cluster with Application-Quality {MPI}",
Booktitle = {Euro-Par'99 Parallel Processing},
publisher = sv,
Series = lncs,
Volume = 1685,
Year = 1999,
Pages = {613--623},
}

%Tools
%=====

@inproceedings{HempelZimmermann96,
author = {R. Hempel and F. Zimmermann},
title = "On the automatic {PARMACS-to-MPI} transformation in application programs",
booktitle = "High-performance computing and networking: international conference and exhibition, {HPCN EUROPE} 1996, Brussels, Belgium, April 15--19, 1996: proceedings",
publisher = sv,
series = lncs,
volume = 1067,
year = 1996,
pages = {1033--1034}
}

@article{HempelZimmermann99,
  author  = {Hempel, Rolf and Zimmermann, Falk},
  title   = {Automatic Migration from {PARMACS} to {MPI} in Parallel {F}ortran Applications},
  journal = {{S}cientific {P}rogramming},
  year    = {1999},
  volume  = {20},
  number  = {7},
  pages   = {39--46},
}

@inproceedings{ReussnerTraffHunzelmann00,
Author = {Ralf Reussner and Jesper Larsson Tr{\"{a}}ff and Gunnar Hunzelmann},
Title = "A Benchmark for {MPI} Derived Datatypes",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 7th European {PVM/MPI} Users' Group Meeting},
Series = lncs,
Year = 2000,
Note = {To appear}
}

@inproceedings{FahringerGerndtRileyTraff00,
Author = {Thomas Fahringer and Michael Gerndt and Graham Riley and Jesper Larsson Tr{\"{a}}ff},
Title = "Specification of Performance Problems in {MPI} Programs with {ASL}",
Booktitle = {International Conference in Parallel Processing {(ICPP'00)}},
Year = 2000,
Note = {To appear}
}

%Applications
%============

@inproceedings{Traff98,
Author = {Jesper Larsson Tr{\"{a}}ff},
Title = "Portable Randomized List Ranking on Multiprocessors using {{\sf MPI}}",
Booktitle = {Recent Advances in Parallel Virtual Machine and Message Passing Interface. 5th European {PVM/MPI} Users' Group Meeting},
publisher = sv,
Series = lncs,
Volume = {1497},
Year = 1998,
Pages = {395--402}
}

%
% End of articles from NEC

@Article{bak00:mpi-app,
author = {J. Baker and M. Shirel},
title = {Ab initio quantum chemistry on {PC}-based parallel supercomputers},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {7--8},
month = JUL,
pages = {1011--1024},
abstract = {The advent of mass-market personal computers (PC) and the associated price reduction in virtually all computer components has brought the cost of parallel, multi-processor computers down to highly affordable levels. Four-, eight-, and even 12-processor machines, constructed from basic, readily available PC components, can be obtained today for the same price as a good-quality single-processor workstation of a few years ago. Together with now well-established parallel tools (such as the message-passing interface (MPI) or parallel virtual machine (PVM) software), state-of-the-art, fully functioning, parallel machines using the Linux operating system and the latest PC microprocessors can deliver unprecedented price/performance ratios. This article reports on the capabilities and performance of a new, fully parallel ab initio program running on commercially available four- and eight-processor PC-based supercomputers.}
}

@Article{nob00:mpi-app,
author = {R. H. Nobes and A. P. Rendell and J. Nieplocha},
title = {Computational chemistry on {F}ujitsu vector-parallel processors: Hardware and programming environment},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {7--8},
month = JUL,
pages = {869--886},
abstract = {In this and the following paper, we provide an introduction to the Fujitsu VPP range of vector-parallel supercomputers and to some of the computational chemistry software available for the VPP. Here, we consider the hardware and the design of software to exploit its capabilities. The VPP employs proprietary vector processors connected via a crossbar switch in a distributed-memory architecture. High single-node performance requires consideration of vector operand lengths, arithmetic pipe utilisation and memory-to-CPU bandwidth. Most parallel chemistry applications use either explicit 'message-passing' or a 'global-memory' paradigm, and benchmark results are presented for the communications performance of MPI, Linda and the Global Arrays.}
}


@Article{fru00:mpi-app,
author = {H. A. Fruchtl and R. H. Nobes and A. Bliznyuk},
title = {Performance of {MOPAC} on parallel computers},
journal = {Journal of Molecular Structure-Theochem},
year = 2000,
volume = 506,
number = {spec. SI},
month = JUL,
pages = {87--97},
abstract = {Key parts of the semiempirical MOPAC program package have been ported to parallel computers using the MPI message passing-library. Parallel routines are available for the calculation of vibrational frequencies and electrostatic potentials, as well as for energies of large biomolecules via the linear-scaling MOZYME self-consistent-field method. The parallelisation strategies used are discussed, and performance measurements for benchmark calculations on three different parallel computers are presented. Frequency and ESP calculations show good scaling for up to eight nodes, independent of hardware and communications software. MOZYME calculations scale reasonably well if a fast implementation of MPI is available.}
}

@Article{geo00:mpi-impl,
author = {W. L. George and J. G. Hagedorn and J. E. Devaney},
title = {{IMPI}: Making {MPI} interoperable},
journal = {Journal of Research of the National Institute of Standards and Technology},
year = 2000,
volume = 105,
number = 3,
pages = {343+},
month = may # "--" # jun,
abstract = {The Message Passing Interface (MPI) is the de facto standard for writing parallel scientific applications in the message passing programming paradigm. Implementations of MPI were not designed to interoperate, thereby limiting the environments in which parallel jobs could be run. We briefly describe a set of protocols, designed by a steering committee of current implementors of MPI, that enable two or more implementations of MPI to interoperate within a single application. Specifically, we introduce the set of protocols collectively called Interoperable MPI (IMPI). These protocols make use of novel techniques to handle difficult requirements such as maintaining interoperability among all IMPI implementations while also allowing for the independent evolution of the collective communication algorithms used in IMPI. Our contribution to this effort has been as a facilitator for meetings, editor of the IMPI Specification document, and as an early testbed for implementations of IMPI. This testbed is in the form of an IMPI conformance tester, a system that can verify the correct operation of an IMPI-enabled version of MPI.}
}

% NOTE(review): the institution field below is empty although the tech-report
% entry type requires it; supply the issuing institution once it is known.
@techreport{kon00:mpi-measurement,
  author      = {Alice E. Koniges and Rolf Rabenseifner and Karl Solchenbach},
  title       = {Benchmark Design for Characterization of Balanced High-Performance Architectures},
  institution = {},
  year        = {2000},
}


@Article{kanTam:mpi-app,
author = {R. Kanapady and K. K. Tamma},
title = {A unified family of generalized integration operators {[GInO]} for non-linear structural dynamics: implementation aspects},
journal = {Advances in Engineering Software},
year = 2000,
volume = 31,
number = {8--9},
pages = {639--647},
month = aug # "--" # sep,
abstract = {The present paper proposes recent developments in theoretical and implementation aspects including parallel computations via a single analysis code of a unified family of generalized integration operators [GInO] in time with particular emphasis on non-linear structural dynamics. The focus of this research is on the implementation aspects including the development of coarse-grained parallel computational models for such generalized time integration operators that can be readily ported to a wide range of parallel architectures via a message-passing paradigm (using MPI) and domain decomposition techniques. The implementation aspects are first described followed by an evaluation for a range of problems which exhibit large deformation, elastic, elastic-plastic dynamic behavior. For geometric non-linearity a total Lagrangian formulation and for material non linearity elasto-plastic formulations are employed. Serial and parallel performance issues on the SGI Origin 2000 system are discussed and analyzed for illustration for selected schemes. For illustration, particular forms of [GInO] are investigated and a complete development via a single analysis code is currently underway. Nevertheless, this is the first time that such a capability is plausible and the developments further enhance computational structural dynamics areas.}
}


@Article{Gur00:mpi-app,
author = {G. P. Guruswamy},
title = {{HiMAP}: a portable super modular multilevel parallel multidisciplinary process for large scale analysis},
journal = {Advances in Engineering Software},
year = 2000,
volume = 31,
number = {8--9},
pages = {617--620},
month = aug # "--" # sep,
abstract = {An efficient super modular process to simulate aeroelasticity of aerospace vehicles using high fidelity flow equations such as the Euler/Navier-Stokes equations is presented. The process is suitable for both tightly coupled and uncoupled analysis. The process is designed to execute on massively parallel processors (MPP) and work-station clusters based on a multiple-instruction, multiple-data (MIMD) architecture. The fluids discipline is parallelized using a zonal approach whereas the structures discipline is parallelized using the substructures concept. Provision is also made to include controls domain. Computations of each discipline are spread across processors using IEEE standard message passing interface (MPI) for inter processor communications. Disciplines can run in parallel using a macro utility MPIRUN developed based on MPI. In addition to discipline parallelization and coarse-grain parallelization of the disciplines, embarrassingly parallel capability to run multiple parameter cases is implemented using a script system. The combined effect of three levels of parallelization is an almost linear scalability for multiple concurrent analyses that perform efficiently on MPP.}
}

@Article{cfkl00:mpi-java,
author = {B. Carpenter and G. Fox and S. H. Ko and S. Lim},
title = {Object serialization for marshaling data in a {J}ava interface to {MPI}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 7,
pages = {539--553},
month = JUN,
abstract = {Several Java bindings to Message Passing Interface (MPI) software have been developed recently. Message buffers have usually been restricted to arrays with elements of primitive type. We discuss adoption of the Java object serialization model for marshaling general communication data in MPI-like APIs. This approach is compared with a Java transcription of the standard MPI derived datatype mechanism. We describe an implementation of the mpiJava interface to MPI that incorporates automatic object serialization. Benchmark results confirm that current JDK implementations of serialization are not fast enough for high performance messaging applications. Means of solving this problem are discussed, and benchmarks for greatly improved schemes are presented.}
}

@Article{g-l00:mpi-app,
author = {A. J. Garcia-Loureiro and T. F. Pena and J. M. Lopez-Gonzalez and L. Prat},
title = {Parallel finite element method to solve the {3D} {P}oisson equation and its application to abrupt heterojunction bipolar transistors},
journal = {International Journal for Numerical Methods in Engineering},
year = 2000,
volume = 49,
number = 5,
pages = {639--652},
month = OCT,
abstract = {In this work we present a parallel solver for the Poisson equation for 3D abrupt heterojunction bipolar transistors (HBT). Three-dimensional simulation is essential for studying devices of small geometry as in the case we have studied. We have used an unstructured tetrahedral mesh and we have applied the finite element method (FEM), making a specific formulation for the nodes located on the interface of the regions with different characteristics. For HBT devices, it is necessary to take into account that on both sides of the interface between the different regions exist materials with different properties. Our formulation implies situating pairs of nodes in the same physical positions of the interface, associating each node to a region of the HBT. This way, the effects due to thermionic emission and the tunnel effect may be simulated when the Poisson and the electron and hole equations are solved in an abrupt HBT. We have applied domain decomposition methods to solve the associate linear systems. This code has been implemented for distributed memory multicomputers, making use of a message passing standard library, MPI.}
}


@Article{sch00:mpi-app,
author = {W. Schneider and P. J. McCarthy and K. Lackner and O. Gruber and K. Behler and P. Martin and R. Merkel},
title = {{ASDEX} Upgrade {MHD} equilibria reconstruction on distributed workstations},
journal = {Fusion Engineering and Design},
year = 2000,
volume = 48,
number = {1--2},
pages = {127--134},
month = AUG,
abstract = {The identification of MHD equilibrium states on the ASDEX Upgrade tokamak is a prerequisite for interpreting measurements from a wide range of diagnostics which are correlated with the shape of the plasma. The availability in real time of plasma parameters related to the MHD state is crucial for controlling the experiment. Function Parameterization is used as a standard tool to determine the position, shape, and other global parameters of the plasma as well as the MHD equilibrium flux surfaces. The recently developed interpretive equilibrium code CLISTE now enables the calculation of MHD equilibria on an intershot timescale. These calculations are parallelized by the use of a Message Passing Interface (MPI).}
}

@article{ave00:mpi-app,
  author   = {A. Averbuch and B. Epstein and L. Ioffe and I. Yavneh},
  title    = {Efficient parallelization of a three-dimensional {N}avier-{S}tokes solver on {MIMD} multiprocessors},
  journal  = {Journal of Supercomputing},
  volume   = {17},
  number   = {2},
  pages    = {123--142},
  month    = sep,
  year     = {2000},
  abstract = {The 3-D Navier-Stokes solver was implemented on three MIMD message-passing multiprocessors (a 64-processors IBM SP2, a 20-processors MOSIX, and a 64-processors Origin 2000). The same code written with PVM and MPI software packages was executed on all the above distinct computational platforms. The examples in the paper demonstrate that we can achieve efficiency of about 60\% for as many as 64 processors on Origin 2000 on a full-size 3-D aerodynamic problem which is solved on realistic computational grids.},
}


@Article{vNie00:rmi-grid,
author = {R. van Nieuwpoort and J. Maassen and H. E. Bal and T. Kielmann and R. Veldema},
title = {Wide-area parallel programming using the remote method invocation model},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 8,
pages = {643--666},
month = JUL,
annote = {Special Issue?},
abstract = {Java's support for parallel and distributed processing makes the language attractive for metacomputing applications, such as parallel applications that run on geographically distributed (wide-area) systems. To obtain actual experience with a Java-centric approach to metacomputing, we have built and used a high-performance wide-area Java system, called Manta. Manta implements the Java Remote Method Invocation (RMI) model using different communication protocols (active messages and TCP/IP) for different networks. The paper shows how wide-area parallel applications can be expressed and optimized using Java RMI. Also, it presents performance results of several applications on a wide-area system consisting of four Myrinet-based clusters connected by ATM WANs. We finally discuss alternative programming models, namely object replication, JavaSpaces, and MPI for Java.}
}


@Article{pha00:mpi-app,
author = {S. Phadke and D. Bhardwaj and S. K. Dey},
title = {An explicit predictor-corrector solver with application to seismic wave modelling},
journal = {Computers \& Geosciences},
year = 2000,
volume = 26,
number = {9--10},
pages = {1053--1058},
month = nov # "--" # dec,
abstract = {Wave-equation-based forward modelling using explicit finite-difference methods is a standard technique for calculating synthetic seismograms. The stability criterion restricts the size of the time step. In this paper a predictor-corrector method for solving the wave equation is described which allows the use of a larger time step. A stability analysis of the method is also carried out. Parallel implementation of the algorithm is described for a distributed computing environment which makes use of MPI and PVM message passing calls for communication between processors.}
}

@Article{oli00:mpi-app-compare,
author = {L. Oliker and R. Biswas},
title = {Parallelization of a dynamic unstructured algorithm using three leading programming paradigms},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2000,
volume = 11,
number = 9,
pages = {931--940},
month = SEP,
abstract = {The success of parallel computing in solving real-life computationally intensive problems relies on their efficient mapping and execution on large-scale multiprocessor architectures. Many important applications are both unstructured and dynamic in nature, making their efficient parallel implementation a daunting task. This paper presents the parallelization of a dynamic unstructured mesh adaptation algorithm using three popular programming paradigms on three leading supercomputers. We examine an MPI message-passing implementation on the Cray T3E and the SGI Origin2000, a shared-memory implementation using the cache coherent nonuniform memory access (CC-NUMA) feature of the Origin2000, and a multithreaded version on the newly released Tera Multithreaded Architecture (MTA). We compare several critical factors of this parallel code development, including runtime, scalability, programmability, portability, and memory overhead. Our overall results demonstrate that multithreaded systems offer tremendous potential for quickly and efficiently solving some of the most challenging real-life problems on parallel computers.}
}


@Article{pro00:mpi-impl,
author = {B. V. Protopopov and A. Skjellum},
title = {Shared-memory communication approaches for an {MPI} message-passing library},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 9,
pages = {799--820},
month = AUG,
abstract = {The contributions of this paper are three-fold. First, the authors present the taxonomy for shared-memory communication devices. Second, they show advantages and potential problems of the devices that belong to different classes of their taxonomy using the formulated design criteria. Third, they analyze communication performance of existing MPICH shared-memory devices, discuss optimizations of their performance, and show the performance gains that these optimizations yield. MPICH is used for comparison, since it is a widely used MPI implementation.}
}


@Article{dec00:mpi-app,
author = {T. Decker},
title = {Virtual data space - load balancing for irregular applications},
journal = {Parallel Computing},
year = 2000,
volume = 26,
number = {13--14},
pages = {1825--1860},
month = DEC,
abstract = {Load balancing is a key issue in the development of parallel algorithms with irregular structures. Existing load balancing systems each support only one specific programming paradigm and thus are of limited use. The system VDS presented here allows concurrent use of various paradigms such as fork-join, weighted tasks, and static dags (directed acyclic graphs that are known in advance). The system provides visual performance evaluation tools to facilitate the efficient application of the system. VDS supports various communication interfaces including PVM and MPI. Thus, VDS-applications can be run on architectures ranging from workstation clusters to massively parallel systems.}
}

@Article{duan00:mpi-app,
author = {S. Duan and K. S. Anderson},
title = {Parallel implementation of a low order algorithm for dynamics of multibody systems on a distributed memory computing system},
journal = {Engineering with Computers},
year = 2000,
volume = 16,
number = 2,
pages = {96--108},
abstract = {In this paper, a new hybrid parallelisable low order algorithm, developed by the authors for multibody dynamics analysis, is implemented numerically on a distributed memory parallel computing system. The presented implementation can currently accommodate the general spatial motion of chain systems, but key issues for its extension to general tree and closed loop systems are discussed. Explicit algebraic constraints are used to increase coarse grain parallelism, and to study the influence of the dimension of system constraint load equations on the computational efficiency of the algorithm for real parallel implementation using the Message Passing Interface (MPI). The equation formulation parallelism and linear system solution strategies which are used to reduce communication overhead are addressed. Numerical results indicate that the algorithm is scalable, that significant speed-up can be obtained, and that a quasi-logarithmic relation exists between time needed for a function call and numbers of processors used. This result agrees well with theoretical performance predictions. Numerical comparisons with results obtained from independently developed analysis codes have validated the correctness of the new hybrid parallelisable low order algorithm, and demonstrated certain computational advantages.}
}

@Article{nam00:mpi-app,
  author   = {A. Namazifard and I. D. Parsons},
  title    = {An {MPI} parallel implementation of {N}ewmark's method},
  journal  = {Computer-Aided Civil and Infrastructure Engineering},
  year     = {2000},
  month    = MAY,
  volume   = {15},
  number   = {3},
  pages    = {189--195},
  abstract = {The standard message-passing interface (MPI) is used to parallelize Newmark's method. The linear matrix equation encountered at each time step is solved using a preconditioned conjugate gradient algorithm. Data are distributed over the processors of a given parallel computer on a degree-of-freedom basis; this produces effective load balance between the processors and leads to a highly parallelized code. The portability of the implementation of this scheme is tested by solving some simple problems on two different machines: an SGI Origin2000 and an IBM SP2. The measured times demonstrate the efficiency of the approach and highlight the maintenance advantages that arise from using a standard parallel library such as MPI.}
}

@Article{chp00:prgm-devlp,
  author   = {B. Chapman and J. Merlin and D. Pritchard and F. Bodin and Y. Mevel and T. Sorevik and L. Hill},
  title    = {Program development tools for clusters of shared memory multiprocessors},
  journal  = {Journal of Supercomputing},
  year     = {2000},
  month    = NOV,
  volume   = {17},
  number   = {3},
  pages    = {311--322},
  abstract = {Applications are increasingly being executed on computational systems that have hierarchical parallelism. There are several programming paradigms which may be used to adapt a program for execution in such an environment. In this paper, we outline some of the challenges in porting codes to such systems, and describe a programming environment that we are creating to support the migration of sequential and MPI code to a cluster of shared memory parallel systems, where the target program may include MPI, OpenMP or both. As part of this effort, we are evaluating several experimental approaches to aiding in this complex application development task.}
}

@Article{getov00:mpi-java,
author = {V. S. Getov and P. A. Gray and V. S. Sunderam},
title = {Aspects of portability and distributed execution for {JNI}-wrapped message passing libraries},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1039--1050},
month = SEP,
abstract = {This paper discusses an approach which aims to provide legacy message passing libraries with Java-like portability in a heterogeneous, metacomputing environment. The results of such portability permit distributed computing components to be 'soft loaded' or 'soft-installed' in a dynamic fashion, onto cooperating resources for concurrent, synchronized parallel execution. This capability provides researchers with the ability to tap into a much larger resource pool and to utilize highly tuned codes for achieving performance. Necessarily, the Java programming language is a significant component. The Java Native Interface (JNI) is used to wrap message passing libraries written in other languages, and the bytecode which is generated for the front-end may be analyzed in order to completely determine the needs of the code which it wraps. This characterization allows the pre-configuration of a remote environment so as to be able to support execution. The usefulness of the portability gained by our approach is illustrated through examples showing the soft-installation of a process using an MPI computational substrate and the soft-installation of a process which requires a C-based communication library based upon the efficient multi-cast communication package, CCTL. The examples show that significant gains in performance can be achieved while allowing message passing execution to still exhibit high levels of portability.}
}

@Article{smith00:mpi-openmp,
author = {L. Smith and P. Kent},
title = {Development and performance of a mixed {OpenMP/MPI} quantum {M}onte {C}arlo code},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 12,
pages = {1121--1129},
month = OCT,
abstract = {The code has been rewritten to allow for an arbitrary mix of OpenMP and MPI parallelism. The various issues which arose during the parallelization are discussed. The performance of the mixed OpenMP/MPI code has been assessed on an SGI Origin 2000 system and the results compared and contrasted to the original MPI version.}
}

@Article{hotta00:mpi-app,
author = {A. Hotta and H. Ninokata and A. J. Baratta},
title = {Development of parallel coupling system between three-dimensional nodal kinetic code {ENTREE} and two-fluid plant simulator {TRAC/BF1}},
journal = {Journal of Nuclear Science and Technology},
year = 2000,
volume = 37,
number = 10,
pages = {840--854},
month = OCT,
abstract = {The high-speed three-dimensional neutron kinetic code ENTREE was developed based on the polynomial and semi-analytical nonlinear iterative nodal methods (PNLM and SANLM) with also introducing the discontinuity factor. In order to enhance the efficiency of transient calculation, the nonlinear correction-coupling coefficients are intermittently updated based on the changing rate of core state variables. By giving the analytical form for two-node problem matrix elements, the additional computing time in SANLM was minimized. A fast algorithm was developed for the multi table macro-cross section rebuilding process. The reactivity component model was implemented based on the variation of the neutron production and destruction terms. The code was coupled with the two-fluid thermal hydraulic plant simulator TRAC/BF1 through PVM or MPI protocols. Two codes are executed in parallel with exchanging the feedback parameters explicitly. Based on the LMW PWR transient benchmark, it was shown that both PNLM and SANLM spend less than 20\% excess computing time in comparison with the coarse mesh finite difference method (CFDM). The implementation of the discontinuity factor was verified based on the DVP problem. Adequacy and parallel efficiency of the coupling system TRAC/BF1-ENTREE was demonstrated based on the BWR cold water injection transient proposed by NEA/CRP.}
}

@Article{silva00:mpi-java,
author = {L. M. Silva and P. Martins and J. G. Silva},
title = {Heterogeneous parallel computing using {Java} and {WMPI}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1077--1091},
month = SEP,
abstract = {In this paper, we present briefly the implementation of a Java interface for WMPI, a Windows-based implementation of MPI. Then, we describe a system that is oriented for Web-based computing and present a solution to integrate WMPI with this tool by making use of a Java bridge component and the Java bindings for WMPI. This solution allows the execution of meta-applications over a mixed configuration of platforms, execution models and programming languages. The resulting system provides a way to solve the problem of heterogeneity and to unleash the potential of diverse computational resources and programming tools.}
}


% NOTE: same paper as thir00:mpi-java (identical bibliographic fields);
% both keys are kept so that existing citations of either key still resolve.
@Article{thir00:mpi-impl,
  author  = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
  title   = {Java on networks of workstations ({JavaNOW}): a parallel computing framework inspired by {Linda} and the {M}essage {P}assing {I}nterface ({MPI})},
  journal = {Concurrency-Practice and Experience},
  year    = {2000},
  month   = SEP,
  volume  = {12},
  number  = {11},
  pages   = {1093--1116}
}

% NOTE: same paper as thir00:mpi-impl (identical bibliographic fields);
% both keys are kept so that existing citations of either key still resolve.
@Article{thir00:mpi-java,
author = {G. K. Thiruvathukal and P. M. Dickens and S. Bhatti},
title = {Java on networks of workstations ({JavaNOW}): a parallel computing framework inspired by {Linda} and the {M}essage {P}assing {I}nterface ({MPI})},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1093--1116},
month = SEP,
abstract = {JavaNOW provides a simple yet powerful framework for performing computation on networks of workstations. In addition to the Linda memory model, it provides for shared objects, implicit multithreading, implicit synchronization, object dataflow, and collective communications similar to those defined in MPI. JavaNOW is also a component of the Computational Neighborhood, a Java enabled suite of services for desktop computational sharing. The intent of JavaNOW is to present an environment for parallel computing that is both expressive and reliable and ultimately can deliver good to excellent performance. As JavaNOW is a work in progress, this article emphasizes the expressive potential of the JavaNOW environment and presents preliminary performance results only.}
}

@Article{carp00:mpi-java,
author = {B. Carpenter and V. Getov and G. Judd and A. Skjellum and G. Fox},
title = {{MPJ: MPI}-like message passing for {Java}},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 11,
pages = {1019--1038},
month = SEP,
abstract = {Recently, there has been a lot of interest in using Java for parallel programming. Efforts have been hindered by lack of standard Java parallel programming APIs. To alleviate this problem, various groups started projects to develop Java message passing systems modelled on the successful Message Passing Interface (MPI). Official MPI bindings are currently defined only for C, Fortran, and C++, so early MPI-like environments for Java have been divergent. This paper relates an effort undertaken by a working group of the Java Grande Forum, seeking a consensus on an MPI-like API, to enhance the viability of parallel programming using Java.}
}


@Article{wall00:mpi-openmp,
author = {A. J. Wallcraft},
title = {{SPMD OpenMP} versus {MPI} for ocean models},
journal = {Concurrency-Practice and Experience},
year = 2000,
volume = 12,
number = 12,
pages = {1155--1164},
month = OCT,
abstract = {OpenMP can be used in Single Program Multiple Data (SPMD) mode by spawning N threads in the main program and having each thread act from then on similarly to a process in MPI. The initial port of one ocean model to SPMD OpenMP revealed several incompatibilities between thread-based and process-based SPMD coding styles. Adding support for threaded I/O was particularly painful, requiring modification to hundreds of lines of code. Several relatively minor additions to the OpenMP API were identified that would greatly simplify SPMD programming. Meanwhile, an alternative Fortran compiler-based SPMD API, Co-Array Fortran, became available on the Cray T3E. There is a simple mapping from SHMEM put/get library calls onto co-array assignment statements, so adding Co-Array Fortran support to the ocean models was straightforward. To extend Co-Array Fortran to machines other than the Cray T3E, a subset of the language is automatically translated into SPMD OpenMP via a nawk script. The performance of the 'native' OpenMP and translated Co-Array Fortran versions of the ocean model was virtually identical, so the former has been replaced by the latter (which is much easier to maintain).}
}

@Article{qia00:mpi-app,
author = {J. Qiang and R. D. Ryne and S. Habib},
title = {Fortran implementation of object-oriented design in parallel beam dynamics simulations},
journal = {Computer Physics Communications},
year = 2000,
volume = 133,
number = 1,
pages = {18--33},
month = DEC,
abstract = {In this paper, an object-oriented design for parallel beam transport simulations in accelerators is implemented using Fortran 90 (F90) with Message Passing Interface (MPI) and High Performance Fortran (HPF). This improves the maintainability, reusability, and extensibility of software, combined with the high performance of using MPI and the ease of parallel programming provided by HPF. The overhead associated with the object-oriented implementation has only a minor effect on performance.}
}


@Article{hu00:openmp,
author = {Y. C. Hu and H. H. Lu and A. L. Cox and W. Zwaenepoel},
title = {{OpenMP} for networks of {SMP}s},
journal = {Journal of Parallel and Distributed Computing},
year = 2000,
volume = 60,
number = 12,
pages = {1512--1530},
month = DEC,
abstract = {We present performance results for seven applications (Barnes-Hut, CLU, and Water from SPLASH-2, 3D-FFT from NAS, Red-Black SOR, TSP, and MGS) running on an SP2 with four four-processor SMP nodes. A comparison between the thread implementation and the original implementation of TreadMarks shows that using the hardware shared memory within an SMP node significantly reduces the amount of data and the number of messages transmitted between nodes and consequently achieves speedups that are up to 30\% better than the original versions. We also compare SDSM against message passing. Overall, the speedups of multithreaded TreadMarks programs are within 7-30\% of the MPI versions.}
}

@Article{kry01:mpi-app,
  author   = {P. Krysl and Z. Bittnar},
  title    = {Parallel explicit finite element solid dynamics with domain decomposition and message passing: dual partitioning scalability},
  journal  = {Computers and Structures},
  year     = {2001},
  month    = JAN,
  volume   = {79},
  number   = {3},
  pages    = {345--360},
  abstract = {We document not only the high-level algorithms but also the relevant communication code fragments of the message passing implementation using the MPI library, so as to empower the reader to fully verify our numerical experiments.}
}

@Article{leg00:mpi-applibs,
  author  = {P. F. Leggett and S. P. Johnson and M. Cross},
  title   = {{CAPLib} - a `thin layer' message passing library to support computational mechanics codes on distributed memory parallel systems},
  journal = {Advances in Engineering Software},
  year    = {2000},
  month   = DEC,
  volume  = {32},
  number  = {1},
  pages   = {61--83}
}

@Article{sad01:mpi-app,
author = {M. Sadeghi and F. Liu},
title = {Computation of mistuning effects on cascade flutter},
journal = {AIAA Journal},
year = 2001,
volume = 39,
number = 1,
pages = {22--28},
month = JAN,
abstract = {A computational method is described for predicting flutter of turbomachinery cascades with mistuned blades. The method solves the unsteady Euler/Navier-Stokes equations for multiple-blade passages on a parallel computer using the message passing interface. A second-order implicit scheme with dual time-stepping and multigrid is used. Each individual blade is capable of moving with its own independent frequency and phase angle, thus modeling a cascade with mistuned blades. Flutter predictions are performed through the energy method. Both phase-angle and frequency mistuning are studied. It is found that phase-angle mistuning has little effect on stability, whereas frequency mistuning significantly changes the aerodynamic damping. The important effect of frequency mistuning is to average out the aerodynamic damping of the tuned blade row over the whole range of interblade phase angles (IBPA). If a tuned blade row is stable over most of the IBPA range, the blades can be stabilized for the complete IBPA range through appropriate frequency mistuning.}
}


@Article{gull01:mpi-app,
author = {A. S. Gullerud and R. H. Dodds},
title = {{MPI}-based implementation of a {PCG} solver using an {EBE} architecture and preconditioner for implicit, 3-{D} finite element analysis},
journal = {Computers and Structures},
year = 2001,
volume = 79,
number = 5,
pages = {553--575},
month = FEB,
abstract = {This work describes a coarse-grain parallel implementation of a linear preconditioned conjugate gradient solver using an element-by-element architecture and preconditioner for computation. The solver, implemented within a nonlinear, implicit finite element code, uses an MPI-based message-passing approach to provide portable parallel execution on shared, distributed, and distributed-shared memory computers. The flexibility of the element-by-element approach permits a dual-level mesh decomposition; a coarse, domain-level decomposition creates a load-balanced domain for each processor for parallel computation, while a second level decomposition breaks each domain into blocks of similar elements (same constitutive model, order of integration, element type) for fine-grained parallel computation on each processor. The key contribution here is a new parallel implementation of the Hughes-Winget (HW) element-by-element preconditioner suitable for arbitrary, unstructured meshes. The implementation couples an unstructured dependency graph with a new balanced graph-coloring algorithm to schedule parallel computations within and across domains. The code also includes the diagonal preconditioner and a modern parallel (threaded) sparse direct solver for comparison. Three example problems with up to 158,000 elements and 180,000 nodes analyzed on an SGI/Cray Origin 2000 illustrate the parallel performance of the algorithms and preconditioners. Analyses with varying block sizes illustrate that the two-level decomposition improves overall execution speed with the block size tuned for the cache memory architecture of the executing platform. This implementation of the HW preconditioner shows reasonable parallel efficiency - typically 80\%, on 48 processors. Efficiency for the diagonal preconditioner is also high, with total speedups reaching 86\% on 48 CPUs. Calculation of the tangent element stiffnesses shows superlinear speedups for each of the test problems, while the computation of strains/stresses/residual forces shows 80\% parallel efficiency on 48 processors.}
}



@Article{scot01:mpi-app,
author = {J. A. Scott},
title = {A parallel frontal solver for finite element applications},
journal = {International Journal for Numerical Methods in Engineering},
year = 2001,
volume = 50,
number = 5,
pages = {1131--1144},
month = FEB,
abstract = {In finite element simulations, the overall computing time is dominated by the time needed to solve large sparse linear systems of equations. We report on the design and development of a parallel frontal code that can significantly reduce the wallclock time needed for the solution of these systems. The algorithm used is based on dividing the finite element domain into subdomains and applying the frontal method to each subdomain in parallel. The so-called multiple front approach is shown to reduce the amount of work and memory required compared with the frontal method and, when run on a small number of processes, achieves good speedups. The code, HSL\_MP42, has been developed for the Harwell Subroutine Library (http://www.numerical.rl.ac.uk/hsl). It is written in Fortran 90 and, by using MPI for message passing, achieves portability across a wide range of modern computer architectures.}
}

@Article{alta01:mpi-eval,
author = {K. Al-Tawil and C. A. Moritz},
title = {Performance modeling and evaluation of {MPI}},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 2,
pages = {202--223},
abstract = {Users of parallel machines need to have a good grasp for how different communication patterns and styles affect the performance of message-passing applications. LogGP is a simple performance model that reflects the most important parameters required to estimate the communication performance of parallel computers. The message passing interface (MPI) standard provides new opportunities for developing high performance parallel and distributed applications. In this paper, we use LogGP as a conceptual framework for evaluating the performance of MPI communications on three platforms: Cray Research T3D, Convex Exemplar 1600SP, and a network of workstations (NOW). We develop a simple set of communication benchmarks to extract the LogGP parameters. Our objective in this is to compare the performance of MPI communication on several platforms and to identify a performance model suitable for MPI performance characterization. In particular, two problems are addressed: how LogGP quantifies MPI performance and what extra features are required for modeling MPI, and how MPI performance compare on the three computing platforms: Cray Research T3D, Convex Exemplar 1600SP, and workstations clusters.}
}

@Article{grif00:mpi-app,
author = {L. W. Griffin and D. J. Dorney},
title = {Simulations of the unsteady flow through the {Fastrac} supersonic turbine},
journal = {Journal of Turbomachinery-Transactions of the ASME},
year = 2000,
volume = 122,
number = 2,
pages = {225--233},
month = APR,
abstract = {Analysis of the unsteady aerodynamic environment in the Fastrac supersonic turbine is presented. Modal analysis of the turbine blades indicated possible resonance in crucial operating ranges of the turbopump. Unsteady computational fluid dynamics (CFD) analysis was conducted to support the aerodynamic and structural dynamic assessments of the turbine. Before beginning the analysis, two major problems with current unsteady analytical capabilities had to be addressed: modeling a straight centerline nozzle with the turbine blades and exit guide vanes (EGVs), and reducing run times significantly while maintaining physical accuracy. Modifications were made to the CFD code used in this study to allow the coupled nozzle/blade/EGV analysis and to incorporate Message Passing Interface (MPI) software. Because unsteadiness is a key issue for the Fastrac turbine [and future rocket engine turbines such as the Reusable Launch Vehicle (RLV)], calculations were performed for two nozzle-to-blade axial gaps. Calculations were also performed for the nozzle alone, and the results were imposed as an inlet boundary condition for a blade/EGV calculation for the large gap case. These results are compared to the nozzle/blade/EGV results.}
}

@Article{des01:mpi-app,
author = {J. C. Desplat and I. Pagonabarraga and P. Bladon},
title = {{LUDWIG: A} parallel {L}attice-{B}oltzmann code for complex fluids},
journal = {Computer Physics Communications},
year = 2001,
volume = 134,
number = 3,
pages = {273--290},
month = MAR,
abstract = {This paper describes Ludwig, a versatile code for the simulation of Lattice-Boltzmann (LB) models in 3D on cubic lattices. In fact, Ludwig is not a single code, but a set of codes that share certain common routines, such as I/O and communications. If Ludwig is used as intended, a variety of complex fluid models with different equilibrium free energies are simple to code, so that the user may concentrate on the physics of the problem, rather than on parallel computing issues. Thus far, Ludwig's main application has been to symmetric binary fluid mixtures. We first explain the philosophy and structure of Ludwig which is argued to be a very effective way of developing large codes for academic consortia. Next we elaborate on some parallel implementation issues such as parallel I/O, and the use of MPI to achieve full portability and good efficiency on both MPP and SMP systems. Finally, we describe how to implement generic solid boundaries, and look in detail at the particular case of a symmetric binary fluid mixture near a solid wall. We present a novel scheme for the thermodynamically consistent simulation of wetting phenomena, in the presence of static and moving solid boundaries, and check its performance.}
}

@Article{tan00:mpi-impl,
author = {H. Tang and K. Shen and T. Yang},
title = {Program transformation and runtime support for threaded {MPI} execution on shared-memory machines},
journal = {ACM Transactions on Programming Languages and Systems},
year = 2000,
volume = 22,
number = 4,
pages = {673--700},
month = JUL,
abstract = {Parallel programs written in MPI have been widely used for developing high-performance applications on various platforms. Because of a restriction of the MPI computation model, conventional MPI implementations on shared-memory machines map each MPI node to an OS process, which can suffer serious performance degradation in the presence of multiprogramming. This paper studies compile-time and runtime techniques for enhancing performance portability of MPI code running on multiprogrammed shared-memory machines. The proposed techniques allow MPI nodes to be executed safely and efficiently as threads. Compile-time transformation eliminates global and static variables in C code using node-specific data. The runtime support includes an efficient and provably correct communication protocol that uses lock-free data structure and takes advantage of address space sharing among threads. The experiments on SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and that it has significant performance advantages in a multiprogrammed environment.}
}

@Article{dim01:mpi-app,
author = {I. Dimov and V. Alexandrov and A. Karaivanova},
title = {Parallel resolvent {M}onte {C}arlo algorithms for linear algebra problems},
journal = {Mathematics and Computers in Simulation},
year = 2001,
volume = 55,
number = {1--3},
pages = {25--35},
month = FEB,
abstract = {In this paper, we consider Monte Carlo (MC) algorithms based on the use of the resolvent matrix for solving linear algebraic problems. Estimates for the speedup and efficiency of the algorithms are presented. Some numerical examples performed on cluster of workstations using MPI are given.}
}

@Article{luCai01:mpi-app,
  author   = {Q. M. Lu and D. S. Cai},
  title    = {Implementation of parallel plasma particle-in-cell codes on {PC} cluster},
  journal  = {Computer Physics Communications},
  year     = {2001},
  month    = MAR,
  volume   = {135},
  number   = {1},
  pages    = {93--104},
  abstract = {Plasma particle-in-cell (PIC) codes model the interaction of charged particles with the surrounding fields, and they have been implemented on many advanced parallel computers. Recently, many PC clusters which consist of inexpensive PCs have been developed to do parallel computing, and we also build such a PC cluster. In this paper, we present the implementation of a parallel plasma PIC code on our PC cluster using MPI, PGHPF and JavaMPI.}
}

@Article{yas01:mpi-app,
author = {O. Yasar},
title = {A new ignition model for spark-ignited engine simulations},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = {1--2},
pages = {179--200},
month = JAN,
abstract = {The amount of spark energy deposited into the combustion chamber is key to an optimum ignition as one can end up with misfires when this energy is low or with other undesired effects on engine performance and byproducts when it is high. Experimentally, up to now, no one has been able to correlate the combustion outcome accurately to the spark parameters in a controllable way. Theoretical investigation and computer modeling is leading to a better understanding of how spark flames propagate. A new computational approach to ignition dynamics is presented here for spark-ignited (SI) engine combustion simulations. Our computational model, using the MPI communication library, attempts to solve temporal and spatial equations of the electromagnetic (EM) equations in conjunction with the well-known Navier-Stokes equations of the standard KIVA-3 engine code. The interaction between the gas and the flame (plasma) kernel in the spark region is computed through the momentum and energy exchange between these two fields. Preliminary results show a distinct spatial distribution of physical quantities at the flame front and within the inflammation zone. A slight change in the spark discharge current has significant impact on the combustion and emissions. Enhanced accuracy of spark ignition modeling might help us better compute the early flame propagation and its influence on the cyclic variability of engines, potentially leading to design of new spark plugs.}
}


@Article{lin01:mpi-graphics,
author = {W. S. Lin and R. W. H. Lau and K. Hwang and X. L. Lin and P. Y. S. Cheung},
title = {Adaptive parallel rendering on multiprocessors and workstation clusters},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 3,
pages = {241--258},
month = MAR,
abstract = {This paper presents the design and performance of a new parallel graphics renderer for 3D images. This renderer is based on an adaptive supersampling approach that works for time/space-efficient execution on two classes of parallel computers. Our rendering scheme takes subpixel supersamples only along polygon edges. This leads to a significant reduction in rendering time and in buffer memory requirements. Furthermore, we offer a balanced rasterization of all transformed polygons. Experimental results prove these advantages on both a shared-memory SGI multiprocessor server and a Unix cluster of Sun workstations. We reveal performance effects of the new rendering scheme on subpixel resolution, polygon number, scene complexity, and memory requirements. The balanced parallel renderer demonstrates scalable performance with respect to increase in graphic complexity and in machine size. Our parallel renderer outperforms Crow's scheme in benchmark experiments performed. The improvements are made in three fronts: 1) reduction in rendering time, 2) higher efficiency with balanced workload, and 3) adaptive to available buffer memory size. The balanced renderer can be more cost-effectively embedded within many 3D graphics algorithms, such as those for edge smoothing and 3D visualization. Our parallel renderer is MPI-coded, offering high portability and cross-platform performance. These advantages can greatly improve the QoS in 3D imaging and in real-time interactive graphics.}
}

@Article{got01:mpi-openmp-app,
author = {S. Gottlieb and S. Tamhankar},
title = {Benchmarking {MILC} code with {OpenMP} and {MPI}},
journal = {Nuclear Physics B-Proceedings Supplements},
year = 2001,
volume = 94,
pages = {841--845},
month = MAR,
abstract = {A trend in high performance computers that is becoming increasingly popular is the use of symmetric multiprocessing (SMP) rather than the older paradigm of MPP. MPI codes that ran and scaled well on MPP machines can often be run on an SMP machine using the vendor's version of MPI. However, this approach may not make optimal use of the (expensive) SMP hardware. More significantly, there are machines like Blue Horizon, an IBM SP with 8-way SMP nodes at the San Diego Supercomputer Center that can only support 4 MPI processes per node (with the current switch). On such a machine it is imperative to be able to use OpenMP parallelism on the node, and MPI between nodes. We describe the challenges of converting MILC MPI code to using a second level of OpenMP parallelism, and benchmarks on IBM and Sun computers.}
}


@article{cha00:mpi-app,
  author   = {T. Chan and V. Eijkhout},
  title    = {Design of a library of parallel preconditioners},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {14},
  number   = {2},
  pages    = {91--101},
  month    = {Summer},
  year     = {2000},
  abstract = {The authors outline the design principles underlying the ParPre library of parallel preconditioners. ParPre is a message-passing library of distributed preconditioners for linear systems, written using MPI and Petsc. It comprises Schwarz methods, Schur system domain decompositioning, various parallel incomplete factorizations, and multilevel methods.}
}


@Article{gro00:mpi-app,
author = {W. Gropp and D. Keyes and L. C. McInnes and M. D. Tidriri},
title = {Globalized {N}ewton-{K}rylov-{S}chwarz algorithms and software for parallel implicit {CFD}},
journal = {International Journal of High Performance Computing Applications},
year = 2000,
volume = 14,
number = 2,
pages = {102--136},
month = {Summer},
abstract = {Implicit solution methods are important in applications modeled by PDEs with disparate temporal and spatial scales. Because such applications require high resolution with reasonable turnaround, parallelization is essential. The pseudo-transient matrix-free Newton-Krylov-Schwarz (Psi NKS) algorithmic framework is presented as a widely applicable answer. This article shows that for the classical problem of three-dimensional transonic Euler flow about an M6 wing, Psi NKS can simultaneously deliver globalized, asymptotically rapid convergence through adaptive pseudo-transient continuation and Newton's method; reasonable parallelizability for an implicit method through deferred synchronization and favorable communication-to-computation scaling in the Krylov linear solver; and high per processor performance through attention to distributed memory and cache locality, especially through the Schwarz preconditioner. Two discouraging features of Psi NKS methods are their sensitivity to the coding of the underlying PDE discretization and the large number of parameters that must be selected to govern convergence. The authors therefore distill several recommendations from their experience and reading of the literature on various algorithmic components of Psi NKS, and they describe a freely available MPI-based portable parallel software implementation of the solver employed here.}
}


@Article{man01:mpi-app-perf,
author = {J. W. Manke and G. D. Kerlick and D. Levine and S. Banerjee and E. Dillon},
title = {Parallel performance of two applications in the {B}oeing high performance computing benchmark suite},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 4,
pages = {457--475},
month = MAR,
abstract = {We describe our work to evaluate the performance of the parallel versions of two floating-point-intensive engineering applications from Boeing's high performance computing benchmark suite (BHPCBS) on emerging RISC parallel systems and PC clusters. The first application is a computational fluid dynamics (CFD) code, OVERFLOW, developed by NASA and used by Boeing for analysis and design of advanced aircraft. The second application is a prototype of a computational electromagnetics (CEM) code, developed by Boeing and used for radar cross-section studies. The distributed memory parallel versions of both applications use the message passing interface (MPI) standard for message passing. The goal of our work was to determine whether RISC parallel systems and PC clusters, which offer high performance at low cost, may be able to meet Boeing's computing requirements in the future. We describe the test environments for the studies, discuss parallelization issues and strategies and present performance data for the two applications.}
}


@Article{bag01:mpi-perf,
author = {R. Bagrodia and E. Deelman and T. Phan},
title = {Parallel simulation of large-scale parallel applications},
journal = {International Journal of High Performance Computing Applications},
year = 2001,
volume = 15,
number = 1,
pages = {3--12},
month = {Spring},
abstract = {Accurate and efficient simulation of large parallel applications can be facilitated with the use of direct execution and parallel discrete-event simulation. This paper describes MPI-SIM, a direct execution-driven parallel simulator designed to predict the performance of existing MPI and MPI-IO application. MPI-SIM can be used to predict the performance of these programs as a function of architectural characteristics, including number of processors, message communication latencies, caching algorithms, and alternative implementations of collective I/O operations. Results are presented, which show the use of MPI-SIM in performing a scalability study of real-world applications. The benchmarks chosen for the study include Sweep3D, one of the ASCI benchmarks, and BTIO, an I/O-intensive benchmark from the NAS Parallel Benchmark suite. MPI-SIM is shown to accurately and efficiently predict the performance of Sweep3D running on an Origin 2000. It is also used to demonstrate the impact of the number of I/O nodes on BTIO's performance.}
}


@Article{hoe01:mpi-openmp,
author = {J. Hoeflinger and P. Alavilli and T. Jackson and B. Kuhn},
title = {Producing scalable performance with {OpenMP}: {E}xperiments with two {CFD} applications},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 4,
pages = {391--413},
month = MAR,
abstract = {OpenMP is a relatively new programming paradigm, which can easily deliver good parallel performance for small numbers ($<16$) of processors. Success with more processors is more difficult to produce. MPI is a relatively mature programming paradigm, and there have been many reports of highly scalable MPI codes for large numbers (hundreds, even thousands) of processors. In this paper, we explore the causes of poor scalability with OpenMP from two points of view. First, we incrementally transform the loops in a combustion application until we achieve reasonably good parallel scalability, and chronicle the effect of each step. Then, we approach scalability from the other direction by transforming a highly scalable program simulating the core flow of a solid-fuel rocket engine (originally written with MPI calls) directly to OpenMP, and report the barriers to scalability that were detected. The list of incremental transformations includes well-known techniques such as loop interchange and loop fusion, plus new ones which make use of the unique features of OpenMP, such as barrier removal and the use of ordered serial loops. The list of barriers to scalability includes the use of the ALLOCATE statement within a parallel region, as well as the lack of a reduction clause for a PARALLEL region in OpenMP. We conclude with a list of key issues which need to be addressed to make OpenMP a more easily scalable paradigm. Some of these are OpenMP implementation issues; some are language issues.}
}

@Article{wal01:mpi-app,
author = {R. L. Walker},
title = {Search engine case study: searching the web using genetic programming and {MPI}},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 1,
pages = {71--89},
month = JAN,
abstract = {The generation of a Web page follows distinct sources for the incorporation of information. The earliest format of these sources was an organized display of known information determined by the page designers' interest and/or design parameters. The sources may have been published in books or other printed literature, or disseminated as general information about the page designer. Due to a growth in Web pages, several new search engines have been developed in addition to the refinement of the already existing ones. The use of the refined search engines, however, still produces an array of diverse information when the same set of keywords are used in a Web search. Some degree of consistency in the search results can be achieved over a period of time when the same search engine is used, yet, most initial Web searches on a given topic are treated as final after some form of refinement/adjustment of the keywords used in the search process. To determine the applicability of a genetic programming (GP) model for the diverse set of Web documents, search strategies behind the current search engines for the World Wide Web were studied. The development of a GP model resulted in a parallel implementation of a pseudosearch engine indexer simulator. The training sets used in this study provided a small snapshot of the computational effort required to index Web documents accurately and efficiently. Future results will be used to develop and implement Web crawler mechanisms that are capable of assessing the scope of this research effort. The GP model results were generated on a network of SUN workstations and an IBM SP2.}
}


@Article{dij01:mpi-app,
author = {F. Dijkstra and J. H. van Lenthe},
title = {Software news and updates - Parallel valence bond},
journal = {Journal of Computational Chemistry},
year = 2001,
volume = 22,
number = 6,
pages = {665--672},
month = APR,
abstract = {A parallel version of the valence bond program TURTLE has been developed. In this version the calculation of matrix elements is distributed over the processors. The implementation has been done using the message-passing interface (MPI), and is, therefore, portable. The parallel version of the program is shown to be quite efficient with a speed-up of 55 at 64 processors.}
}


@article{den01:mpi-sys,
  author   = {Y. F. Deng and A. Korobka},
  title    = {The performance of a supercomputer built with commodity components},
  journal  = {Parallel Computing},
  volume   = {27},
  number   = {1--2},
  pages    = {91--108},
  month    = JAN,
  year     = {2001},
  abstract = {We built a supercomputer called Galaxy by connecting Intel Pentium-based computer nodes with Fast and Gigabit Ethernet switches. Each node has two processors at clock speeds varying from 300 to 600 MHz, up to 512 MB of memory, and small 2 Gb local disk. All nodes run the standard RedHat Linux and inter-node communication is handled by a message passing interface called MPI. Local tools are written to visualize the system performance and to balance loads. We have benchmarked a sub-Galaxy with 72 processors by NAS and Parallel LINPACK benchmark suites. We achieved 16.9 Gflops in a standard single precision LU decomposition for 46848 x 46838 matrix parallel LINPACK benchmark. A Galaxy with 128 processors costs approximately \$250 000 and it delivers 40 Gflops of performance. This leads to a cost-performance ratio of 160 Kflops-per-dollar, which is to improve further due to increase in processor speeds and network bandwidth at similar cost. Our final system with 512 processors is expected to reach several Tflops. This article first describes the Galaxy architectural details, and then present and analyze its performance in terms of floating point number crunching, network bandwidth, and IO throughput.}
}


@Article{cap01:mpi-smp-perf,
author = {F. Cappello and O. Richard and D. Etiemble},
title = {Understanding performance of {SMP} clusters running {MPI} programs},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 17,
number = 6,
pages = {711--720},
month = APR,
abstract = {Clusters of multiprocessors (CLUMPs) have an hybrid memory model, with message passing between nodes and shared memory inside nodes. We examine the performance of Myrinet clusters of SMP PCs when using a single memory model (SMM) based on the MPICH-PM/CLUMP library of the RWCP, which can directly use the MPI programs written for a cluster of uniprocessors. The specificities of the communication patterns with the SMM approach are detailed. PC clusters with 2-way and 4-way nodes are considered and compared.}
}


@Article{he01:mpi-app,
author = {Y. He and C. H. Q. Ding},
title = {Using accurate arithmetics to improve numerical reproducibility and stability in parallel applications},
journal = {Journal of Supercomputing},
year = 2001,
volume = 18,
number = 3,
pages = {259--277},
month = MAR,
abstract = {Numerical reproducibility and stability of large scale scientific simulations, especially climate modeling, on distributed memory parallel computers are becoming critical issues. In particular, global summation of distributed arrays is most susceptible to rounding errors, and their propagation and accumulation cause uncertainty in final simulation results. We analyzed several accurate summation methods and found that two methods are particularly effective to improve (ensure) reproducibility and stability: Kahan's self-compensated summation and Bailey's double-double precision summation. We provide an MPI operator MPI\_SUMDD to work with MPI collective operations to ensure a scalable implementation on large number of processors. The final methods are particularly simple to adopt in practical codes: not only global summations, but also vector-vector dot products and matrix-vector or matrix-matrix operations.}
}

@Article{pro01:mpi-impl,
author = {B. V. Protopopov and A. Skjellum},
title = {A multithreaded message passing interface ({MPI}) architecture: Performance and program issues},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 4,
pages = {449--466},
month = APR,
abstract = {This paper discusses a multithreaded software architecture for message-passing interface (MPI) software specification. The architecture is thread-safe, allows for concurrent communication over several communications media (multifabric communication), efficiently utilizes available hardware concurrency over a wide range of target platforms, and allows for concurrent communication and computation within the limits imposed by the hardware. The architecture is developed in the framework of the MPICH software architecture, a well-known MPI implementation used worldwide. The proposed architecture adopts wide portability of the MPICH design and remedies some of its deficiencies such as inefficient multifabric communication and non-thread-safety. The paper also considers the issues concerning development of high-performance portable message-passing systems for general-purpose architectures. The contributions of the paper are improving architecture and addressing thread safety of modern reliable messaging software, as well as identifying and taking advantage of inherent concurrency in the message-passing software itself.}
}


@Article{cun01:mpi-app,
author = {R. D. da Cunha and A. L. de Bortoli},
title = {A parallel {N}avier-{S}tokes solver for the rotating flow problem},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 3,
pages = {163--180},
month = MAR,
abstract = {In this paper, we investigate the parallel solution of rotating internal flow problems, using the Navier-Stokes equations as proposed by Speziale and Thangam (in 1983) and Speziale (in 1985). A Runge-Kutta time-stepping scheme was applied to the equations and both sequential and message-passing implementations were developed, the latter using MPI, and were tested on a four-processor SGI Origin200 distributed, global shared memory parallel computer and on a 32-processor IBM 9076 SP/2 distributed memory parallel computer. The results show that our approach to parallelize the sequential implementation requires little effort whilst providing good results even for medium-sized problems.}
}

@Article{swan01:mpi-app,
author = {C. A. Swann},
title = {Software for parallel computing: The {LAM} implementation of {MPI}},
journal = {Journal of Applied Econometrics},
year = 2001,
volume = 16,
number = 2,
pages = {185--194},
month = {Mar-Apr},
abstract = {Many econometric problems can benefit from the application of parallel computing techniques, and recent advances in hardware and software have made such application feasible. There are a number of freely available software libraries that make it possible to write message passing parallel programs using personal computers or Unix workstations. This review discusses one of these---the LAM (Local Area Multiprocessor) implementation of MPI (the Message Passing Interface).}
}


@Article{reis01:mpi-app,
author = {T. G. Reisin and S. C. Wurzler},
title = {Implementation of a numerical solution of the multicomponent kinetic collection equation ({MKCE}) on parallel computers},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 5,
pages = {641--661},
month = MAY,
abstract = {Two different numerical solutions of the two-component kinetic collection equation were implemented on parallel computers. The parallelization approach included domain decomposition and MPI commands for communications. Four different parallel codes were tested. A dynamic decomposition based on an occupancy function provided the optimum balance between time performance and flexibility for any number of processors. The occupancy function was defined according to the number of calculations required at each grid point in the domain. Speed-up performance depended very much on the parallel code used and in some cases very good results were obtained for up to 32 processors.}
}

@article{guif01:mpi-app,
  author   = {C. Guiffaut and K. Mahdjoubi},
  title    = {A parallel {FDTD} algorithm using the {MPI} library},
  journal  = {IEEE Antennas and Propagation Magazine},
  volume   = {43},
  number   = {2},
  pages    = {94--103},
  month    = APR,
  year     = {2001},
  abstract = {In this paper, we describe the essential elements of a parallel algorithm for the FDTD method using the MPI (Message Passing Interface) library. To simplify and accelerate the algorithm, an MPI Cartesian 2D topology is used. The inter-process communications are optimized by the use of derived data types. A general approach is also explained for parallelizing the auxiliary tools, such as far-field computation, thin-wire treatment, etc. For PMLs, we have used a new method that makes it unnecessary to split the field components. This considerably simplifies the computer programming, and is compatible with the parallel algorithm.}
}

@Article{yao01:mpi-app,
author = {J. X. Yao and A. Jameson and J. J. Alonso and F. Liu},
title = {Development and validation of a massively parallel flow solver for turbomachinery flows},
journal = {Journal of Propulsion and Power},
year = 2001,
volume = 17,
number = 3,
pages = {659--668},
month = {May-June},
abstract = {The development and validation of the unsteady, three-dimensional, multiblock, parallel turbomachinery flow solver TFLO is presented. The unsteady Reynolds-averaged Navier-Stokes equations are solved using a cell-centered discretization on arbitrary multiblock meshes. The solution procedure is based on efficient explicit Runge-Kutta methods with several convergence acceleration techniques such as multigrid, implicit residual smoothing, and local time stepping. The solver is parallelized using domain decomposition, a single program multiple data strategy, and the message passing interface standard. Details of the communication scheme and load balancing algorithms are discussed. A general and efficient procedure for parallel interblade row interfacing is developed. The dual-time stepping technique is used to advance unsteady computations in time. The focus is on improving the parallel efficiency and scalability of the flow solver, as well as on its initial validation of steady-state calculations in multiblade row environment. The result of this careful implementation is a solver with demonstrated scalability up to 1024 processors. For validation and verification purposes, results from TFLO are compared with both existing experimental data and computational results from other computational fluid dynamics codes used in aircraft engine industry.}
}


@Article{kom01:mpi-app,
author = {Y. Komeiji and M. Haraguchi and U. Nagashima},
title = {Parallel molecular dynamics simulation of a protein},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 8,
pages = {977--987},
month = JUL,
abstract = {Program for energetic analysis of biochemical molecules (PEACH) is a software package for molecular dynamics (MD) simulation of biological molecules. The subroutines for the nonbonded interactions were modified to allow parallel computation by using the MPI library. The parallel efficiencies of the modified subroutines were close to 90\% or better when using 32 processors of an IBM SP computer. The total performance was comparable to that of the special-purpose computer MD-GRAPE with 8 LSI chips.}
}


@Article{mar01:mpi-style,
author = {B. Di Martino and A. Mazzeo and N. Mazzocca and U. Villano},
title = {Parallel program analysis and restructuring by detection of point-to-point interaction patterns and their transformation into collective communication constructs},
journal = {Science of Computer Programming},
year = 2001,
volume = 40,
number = {2--3},
pages = {235--263},
month = JUL,
abstract = {After the presentation of the basic program analysis technique, several examples involving the detection of common communication patterns are shown. Then the structure of PPAR, a prototype tool that allows the analysis of parallel programs written in Fortran 77 with calls to PVM or MPI unstructured communication primitives is outlined, and conclusions are drawn.}
}


@Article{lee01:mpi-app,
author = {M. Lee and W. Liu and V. K. Prasanna},
title = {Parallel implementation of a class of adaptive signal processing applications},
journal = {Algorithmica},
year = 2001,
volume = 30,
number = 4,
pages = {645--684},
month = AUG,
abstract = {In this paper we present a methodology for mapping a class of adaptive signal processing applications onto HPC platforms such that the throughput performance is optimized. We first define a new task model using the salient computational characteristics of a class of adaptive signal processing applications. Based on this task model, we propose a new execution model. In the earlier linear pipelined execution model, the task mapping choices were restricted. The new model permits flexible task mapping choices, leading to improved throughput performance compared with the previous model. Using the new model, a three-step task mapping methodology is developed. It consists of (1) a data remapping step, (2) a coarse resource allocation step, and (3) a fine performance tuning step. The methodology is demonstrated by designing parallel algorithms for modern radar and sonar signal processing applications. These are implemented on IBM SP2 and Cray T3E, state-of-the-art HPC platforms, to show the effectiveness of our approach. Experimental results show significant performance improvement over those obtained by previous approaches. Our code is written using C and the Message Passing Interface (MPI). Thus, it is portable across various HPC platforms.}
}


@Article{take01:mpi-eval,
author = {K. Takeda and N. K. Allsopp and J. C. Hardwick and P. C. Macey and D. A. Nicole and S. J. Cox and D. J. Lancaster},
title = {An assessment of {MPI} environments for {W}indows {NT}},
journal = {Journal of Supercomputing},
year = 2001,
volume = 19,
number = 3,
pages = {315--323},
abstract = {In this paper we evaluate the MPI environments currently available for Windows NT on the Intel IA32 and Compaq/DEC Alpha architectures. We present benchmark results for low-level communication and for the NAS Parallel Benchmarks to allow comparison with other systems, but our primary interest is determining real application performance and robustness in production cluster environments. For this we use PAFEC-FE, a large FORTRAN code for finite-element analysis. We present results from three MPI implementations, two architectures, and three networking technologies (10 and 100 Mbit/s Ethernet and 1 Gbit/s Myrinet).}
}


@Article{chun01:mpi-app,
author = {S. H. Chung and H. C. Kwon and K. R. Ryu and Y. Chung and H. Jang and C. A. Choi},
title = {Information retrieval on an {SCI}-based {PC} cluster},
journal = {Journal of Supercomputing},
year = 2001,
volume = 19,
number = 3,
pages = {251--265},
abstract = {This article presents an efficient parallel information retrieval (IR) system which provides fast information service for the Internet users on low-cost high-performance PC-NOW environment. The IR system is implemented on a PC cluster based on the scalable coherent interface (SCI), a powerful interconnecting mechanism for both shared memory models and message-passing models. In the IR system, the inverted-index file (IIF) is partitioned into pieces using a greedy declustering algorithm and distributed to the cluster nodes to be stored on each node's hard disk. For each incoming user's query with multiple terms, terms are sent to the corresponding nodes which contain the relevant pieces of the IIF to be evaluated in parallel. The IR system is developed using a distributed-shared memory (DSM) programming technique based on the SCI. According to the experiments, the IR system outperforms an MPI-based IR system using Fast Ethernet as an interconnect. Speed-up of up to 5.0 was obtained with an 8-node cluster in processing each query on a 500,000-document IIF.}
}


@article{pic01:mpi-app,
  author   = {S. M. Pickles and J. M. Brooke and F. C. Costen and E. Gabriel and M. Muller and M. Resch and S. M. Ord},
  title    = {Metacomputing across intercontinental networks},
  journal  = {Future Generation Computer Systems},
  volume   = {17},
  number   = {8},
  pages    = {911--918},
  month    = JUN,
  year     = {2001},
  abstract = {An intercontinental network of supercomputers spanning more than 10 000 miles and running challenging scientific applications was realized at the Supercomputing '99 (SC99) conference in Portland, OR using PACX-MPI and ATM PVCs. In this paper, we describe how we constructed the heterogeneous cluster of supercomputers, the problems we confronted in terms of multi-architecture and the way several applications handled the specific requirements of a metacomputer.}
}


@Article{sha01:mpi-model,
author = {H. Z. Shan and J. P. Singh},
title = {A comparison of {MPI}, {SHMEM} and cache-coherent shared address space programming models on a tightly-coupled multiprocessors},
journal = {International Journal of Parallel Programming},
year = 2001,
volume = 29,
number = 3,
pages = {283--318},
month = JUN,
abstract = {We compare the performance of three major programming models on a modern, 64-processor hardware cache-coherent machine, one of the two major types of platforms upon which high-performance computing is converging. We focus on applications that are either regular, predictable or at least do not require fine-grained dynamic replication of irregularly accessed data. Within this class, we use programs with a range of important communication patterns. We examine whether the basic parallel algorithm and communication structuring approaches needed for best performance are similar or different among the models, whether some models have substantial performance advantages over others as problem size and number of processors change, what the sources of these performance differences are, where the programs spend their time, and whether substantial improvements can be obtained by modifying either the application programming interfaces or the implementations of the programming models on this type of tightly-coupled multiprocessor platform.}
}


@Article{dem01:mpi-extension,
author = {E. D. Demaine and I. Foster and C. Kesselman and M. Snir},
title = {Generalized communicators in the message passing interface},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 6,
pages = {610--616},
month = JUN,
abstract = {We propose extensions to the Message Passing Interface (MPI) that generalize the MPI communicator concept to allow multiple communication endpoints per process, dynamic creation of endpoints, and the transfer of endpoints between processes. The generalized communicator construct can be used to express a wide range of interesting communication structures, including collective communication operations involving multiple threads per process, communications between dynamically created threads or processes, and object-oriented applications in which communications are directed to specific objects. Furthermore, this enriched functionality can be provided in a manner that preserves backward compatibility with MPI. We describe the proposed extensions, illustrate their use with examples, and describe a prototype implementation in the popular MPI implementation MPICH.}
}


@Article{tro01:mpi-app,
author = {R. Trobec and M. Sterk and M. Praprotnik and D. Janezic},
title = {Implementation and evaluation of {MPI}-based parallel {MD} program},
journal = {International Journal of Quantum Chemistry},
year = 2001,
volume = 84,
number = 1,
pages = {23--31},
month = JUL,
abstract = {The message-passing interface (MPI)-based object-oriented particle-particle interactions (PPI) library is implemented and evaluated. The library can be used in the n-particle simulation algorithm designed for a ring of p interconnected processors. The parallel simulation is scalable with the number of processors, and has the time requirement proportional to $n^2/p$ if n/p is large enough, which guarantees optimal speedup. In a certain range of problem sizes, the speedup becomes superlinear because enough cache memory is available in the system. The library is used in a simple way by any potential user, even with no deep programming knowledge. Different simulations using particles can be implemented on a wide spectrum of different computer platforms. The main purpose of this article is to test the PPI library on well-known methods, e.g., the parallel molecular dynamics (MD) simulation of the monoatomic system by the second-order leapfrog Verlet algorithm. The performances of the parallel simulation program implemented with the proposed library are competitive with a custom-designed simulation code. Also, the implementation of the split integration symplectic method, based on the analytical calculation of the harmonic part of the particle interactions, is shown, and its expected performances are predicted.}
}


@Article{ahm01:mpi-alg,
author = {I. Ahmad},
title = {A distributed algorithm for finding prime compatibles on network of workstations},
journal = {Microprocessors and Microsystems},
year = 2001,
volume = 25,
number = 4,
pages = {195--202},
month = JUN,
abstract = {State minimization of incompletely specified finite state machines (FSMs) is an important step of FSM synthesis. Generation of prime compatibles is one of the core steps in state minimization of incompletely specified FSMs. It is guaranteed that a minimal solution exists, consisting of prime compatibles only. But the generation of prime compatibles is both a compute-intensive and a memory-intensive problem. In this paper, we have developed and implemented a distributed algorithm, designated as D\_Prime, to find prime compatibles on network of workstations (NOWs) under message passing interface (MPI) environment to handle the large complexity of VLSI designs in future. With the advent of high-speed networks and availability of powerful high-performance workstations, NOW has emerged as the most cost-effective platform for compute-intensive and memory-intensive applications. Comparison of results on a number of MCNC benchmarks for state minimization of incompletely specified FSMs showed that a considerable speedup can be achieved by the proposed distributed algorithm as compared with the existing sequential counterparts.}
}


@article{ino01:mpi-model,
  author   = {F. Ino and N. Fujimoto and K. Hagihara},
  title    = {{LogGPS}: A parallel computational model for synchronization analysis},
  journal  = {ACM SIGPLAN Notices},
  volume   = {36},
  number   = {7},
  pages    = {133--142},
  month    = JUL,
  year     = {2001},
  abstract = {We also present some experimental results using both models. The results include (1) a verification of the LogGPS model, (2) an example of synchronization analysis using an MPI program and (3) a comparison of the models. The results indicate that the LogGPS model is more accurate than the LogGP model, and analyzing synchronization costs is important when improving parallel program performance.}
}


@Article{zha01:mpi-app,
author = {W. S. Zhang and G. Q. Zhang},
title = {Prestack depth migration by hybrid method with high precision and its parallel implementation},
journal = {Chinese Journal of Geophysics-Chinese Edition},
year = 2001,
volume = 44,
number = 4,
pages = {542--551},
month = JUL,
abstract = {Prestack depth migration is an important imaging method for complex geological structures. In this paper a generalized system of wavefield continuation is presented based on the wavefield splitting theory. The system is coupled by downgoing and upgoing waves, and the commonly used equation of wavefield continuation is only a special case of the coupled system. Based on the approximation of square root operator, a new hybrid migration method with high precision is derived. The method can be implemented numerically through splitting technique. Finally, two numerical migration examples are given, one is the poststack depth migration for a model with large lateral velocity contrasts, another is the prestack depth migration for Marmousi model with complex structures. The numerical results show the effectiveness and high precision of the method. The MPI parallel calculation is adopted in order to raise computational efficiency. The method can be used to obtain precise images for complex structures with large lateral velocity variations.}
}



@Article{ant01:mpi-xxx,
author = {G. Antoniu and L. Boug{\'e} and P. Hatcher and M. MacBeth and K. McGuigan and R. Namyst},
title = {The {H}yperion system: Compiling multithreaded {J}ava bytecode for distributed execution},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 10,
pages = {1279--1297},
month = SEP,
abstract = {Our work combines Java compilation to native code with a run-time library that executes Java threads in a distributed memory environment. This allows a Java programmer to view a cluster of processors as executing a single JAVA virtual machine. The separate processors are simply resources for executing Java threads with true parallelism, and the run-time system provides the illusion of a shared memory on top of the private memories of the processors. The environment we present is available on top of several UNIX systems and can use a large variety of communication interfaces thanks to the high portability of its run-time system. To evaluate our approach, we compare serial C, serial Java, and multithreaded Java implementations of a branch-and-bound solution to the minimal-cost map-coloring problem. All measurements have been carried out on two platforms using two different communication interfaces: SISCI/SCI and MPI-BIP/Myrinet.}
}

@Article{sar01:mpi-app,
author = {K. C. Sarma and H. Adeli},
title = {Bilevel parallel genetic algorithms for optimization of large steel structures},
journal = {Computer-Aided Civil and Infrastructure Engineering},
year = 2001,
volume = 16,
number = 5,
pages = {295--304},
month = SEP,
abstract = {This article is concerned with optimization of very large steel structures subjected to the actual constraints of the American Institute of Steel Construction ASD and LRFD specifications on high-performance multiprocessor machines using biologically inspired genetic algorithms. First, parallel fuzzy genetic algorithms (GAs) are presented for optimization of steel structures using a distributed memory Message Passing Interface (MPI) with two different schemes: the processor farming scheme and the migration scheme. Next, two bilevel parallel GAs are presented for large-scale structural optimization through judicious combination of shared memory data parallel processing using the OpenMP Application Programming Interface (API) and distributed memory message passing parallel processing using MPI. Speedup results are presented for parallel algorithms.}
}

@Article{yil01:mpi-app,
author = {E. Yilmaz and M. S. Kavsaoglu and H. U. Akay and I. S. Akmandor},
title = {Cell-vertex based parallel and adaptive explicit 3{D} flow solution on unstructured grids},
journal = {International Journal of Computational Fluid Dynamics},
year = 2001,
volume = 14,
number = 4,
pages = {271--286},
abstract = {A parallel adaptive Euler flow solution algorithm is developed for 3D applications on distributed memory computers. Significant contribution of this research is the development and implementation of a parallel grid adaptation scheme together with an explicit cell vertex-based finite volume 3D flow solver on unstructured tetrahedral grids. Parallel adaptation of grids is based on grid-regeneration philosophy by using an existing serial grid generation program. Then, a general partitioner repartitions the grid. An adaptive sensor value, which is a measure to refine or coarsen grids, is calculated considering the pressure gradients in all partitioned blocks of grids. The parallel performance of the present study was tested. Parallel computations were performed on Unix workstations and a Linux cluster using MPI communication library. The present results show that overall adaptation scheme developed in this study is applicable to any pair of a flow solver and grid generator with affordable cost. It is also proved that parallel adaptation is necessary for accurate and efficient flow solutions.}
}


@Article{cha01:mpi-app,
author = {H. Y. Chang and K. C. Huang and C. Y. Shen and S. C. Tcheng and C. Y. Chou},
title = {Parallel computation of a weather model in a cluster environment},
journal = {Computer-Aided Civil and Infrastructure Engineering},
year = 2001,
volume = 16,
number = 5,
pages = {365--373},
month = SEP,
abstract = {Recently, the superior and continuously improving cost-performance ratio of commodity hardware and software has made PC clustering a popular alternative for high-performance computing in both academic institutes and industrial organizations. The purpose of this work is to use PC clusters to solve a weather-prediction model in parallel mode, and the result also will be compared with those obtained on some conventional parallel platforms such as the Fujitsu VPP300, IBM SP2 (160 and 120 MHz), and HP SPP2200. Techniques of domain decomposition and data communication are used to exploit parallelism of the model. Interprocessor data communication is done by the Message Passing Interface communication library routines. Two versions of the parallel codes, one with longitude decomposition and the other with latitude decomposition, are tested and compared. Speedups of the parallel weather model on these machines with various numbers of processors show that substantial reductions in computation time can be achieved as compared with sequential runs.}
}

% NOTE(review): the key says 00 but the year field is 2001; the key is kept
% unchanged so that existing citations still resolve.
@Article{bgl00:mpi-impl,
author = {Ralph Butler and William Gropp and Ewing Lusk},
title = {Components and Interfaces of a Process Management System for Parallel Programs},
journal = {Parallel Computing},
month = OCT,
year = 2001,
volume = 27,
number = 11,
pages = {1417--1429},
abstract = {Parallel jobs are different from sequential jobs and require a different type of process management. We present here a process management system for parallel programs such as those written using MPI. A primary goal of the system, which we call MPD (for multipurpose daemon), is to be scalable. By this we mean that startup of interactive parallel jobs comprising thousands of processes is quick, that signals can be quickly delivered to processes, and that stdin, stdout, and stderr are managed intuitively. Our primary target is parallel machines made up of clusters of SMPs, but the system is also useful in more tightly integrated environments. We describe how MPD enables fast startup and convenient runtime management of parallel jobs. We show how close control of stdio can support the easy implementation of a number of convenient system utilities, even a parallel debugger. We describe a simple but general interface that can be used to separate any process manager from a parallel library, which we use to keep MPD separate from MPICH.}
}


@Article{myl01:mpi2-impl,
author = {S. Moh and C. S. Yu and B. Lee and H. Y. Youn and D. S. Han and D. Lee},
title = {Four-ary tree-based barrier synchronization for 2{D} meshes without nonmember involvement},
journal = {IEEE Transactions on Computers},
year = 2001,
volume = 50,
number = 8,
pages = {811--823},
month = AUG,
abstract = {This paper proposes a Barrier Tree for Meshes (BTM) to minimize the barrier synchronization latency for two-dimensional (2D) meshes. The proposed BTM scheme has two distinguishing features. First, the synchronization tree is 4-ary. The synchronization latency of the BTM scheme is asymptotically $\Theta(\log_4 n)$, while that of the fastest scheme reported in the literature is bounded between $\Omega(\log_3 n)$ and $O(n^{1/2})$, where n is the number of member nodes. Second, nonmember nodes are neither involved in the construction of a BTM nor actively participate in the synchronization operations, which avoids interference among different process groups during synchronization. This not only results in low setup overhead, but also reduces the synchronization latency. The low setup overhead is particularly effective for the dynamic process model provided in MPI-2. Extensive simulation study shows that, for up to 64 x 64 meshes, the BTM scheme results in about 40 to 70 percent shorter synchronization latency and is more scalable than conventional schemes.}
}


@Article{fbd01:mpi-impl,
author = {G. E. Fagg and A. Bukovsky and J. J. Dongarra},
title = {{HARNESS} and fault tolerant {MPI}},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 11,
pages = {1479--1495},
month = OCT,
abstract = {Initial versions of MPI were designed to work efficiently on multi-processors which had very little job control and thus static process models. Subsequently forcing them to support a dynamic process model would have affected their performance. As current HPC systems increase in size with greater potential levels of individual node failure, the need arises for new fault tolerant systems to be developed. Here we present a new implementation of MPI called fault tolerant MPI (FT-MPI) that allows the semantics and associated modes of failures to be explicitly controlled by an application via a modified MPI API. Given is an overview of the FT-MPI semantics, design, example applications, debugging tools and some performance issues. Also discussed is the experimental HARNESS core (G\_HCORE) implementation that FT-MPI is built to operate upon.}
}


@Article{kbg01:mpi-impl,
author = {T. Kielmann and H. E. Bal and S. Gorlatch and K. Verstoep and R. F. H. Hofman},
title = {Network performance-aware collective communication for clustered wide-area systems},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 11,
pages = {1431--1456},
month = OCT,
abstract = {Metacomputing infrastructures couple multiple clusters (or MPPs) via wide-area networks. A major problem in programming parallel applications for such platforms is their hierarchical network structure: latency and bandwidth of WANs often are orders of magnitude worse than those of local networks. Our goal is to optimize MPI's collective operations for such platforms. We use two techniques: selecting suitable communication graph shapes, and splitting messages into multiple segments that are sent in parallel over different WAN links. To optimize graph shape and segment size at runtime, we introduce a performance model called Parameterized LogP (P-LogP), a hierarchical extension of the LogP model that covers messages of arbitrary length. An experimental performance evaluation shows that the newly implemented collective operations have significantly improved performance for large messages, and that there is a close match between the theoretical model and the measured completion times.}
}


@Article{ll01:mpi-openmp,
author = {G. R. Luecke and W. H. Lin},
title = {Scalability and performance of {OpenMP} and {MPI} on a 128-processor {SGI} {Origin2000}},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 10,
pages = {905--928},
month = AUG,
abstract = {The purpose of this paper is to investigate the scalability and performance of seven, simple OpenMP test programs and to compare their performance with equivalent MPI programs on an SGI Origin 2000. Data distribution directives were used to make sure that the OpenMP implementation had the same data distribution as the MPI implementation. For the matrix-times-vector (test 5) and the matrix-times-matrix (test 7) tests, the syntax allowed in OpenMP 1.1 does not allow OpenMP compilers to be able to generate efficient code since the reduction clause is not currently allowed for arrays. (This problem is corrected in OpenMP 2.0.) For the remaining five tests, the OpenMP version performed and scaled significantly better than the corresponding MPI implementation, except for the right shift test (test 2) for a small message.}
}


@Article{pas01:mpi-app,
author = {G. Passoni and P. Cremonesi and G. Alfonsi},
title = {Analysis and implementation of a parallelization strategy on a {N}avier-{S}tokes solver for shear flow simulations},
journal = {Parallel Computing},
year = 2001,
volume = 27,
number = 13,
pages = {1665--1685},
month = DEC,
abstract = {A parallel computational solver for the unsteady incompressible three-dimensional Navier-Stokes equations implemented for the numerical simulation of shear flow cases is presented. The computational algorithms include Fourier expansions in the streamwise and spanwise directions, second-order centered finite differences in the direction orthogonal to the solid walls, third-order Runge-Kutta procedure in time in which both convective and diffusive terms are treated explicitly; the fractional step method is used for time marching. Based on the numerical algorithms implemented within the computational solver, three different (MPI based) parallelization strategies are devised. The three schemes are evaluated with particular attention to the impact of the communications onto the whole computational procedure, and one of them is implemented. Computations are executed on two different parallel machines and results are shown in terms of parallel performance. Processes using different number of processors combined with different number of computational grid points are tested.}
}


@Article{ber01:mpi-openmp,
author = {J. Y. Berthou and E. Fayolle},
title = {Comparing {OpenMP}, {HPF}, and {MPI} programming: A study case},
journal = {International Journal of High Performance Computing Applications},
year = 2001,
volume = 15,
number = 3,
pages = {297--309},
abstract = {This paper presents a comparison of three programming models---OpenMP, HPF, and MPI---applied to a diphasic compressible fluid mechanics code. The parallelization analysis is conducted, and the authors also present the experimental results obtained on various platforms: a Compaq Proliant 6000 (4 processors), a Cray T3E-750 (300 processors), an HP Class V (16 processors), a SGI Origin 2000 (32 processors), a cluster of PCs, and a COMPAQ SC 232 (232 processors). These experimental results will be discussed according to the following criteria: efficiency, scalability, maintainability, developing costs, and portability. As a conclusion, the authors present the parallelization strategy recommended for codes comparable to ECOSS.}
}


@Article{ber01:mpi-alg,
author = {L. Bergamaschi and I. Moret and G. Zilli},
title = {Inexact {Q}uasi-{N}ewton methods for sparse systems of nonlinear equations},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 18,
number = 1,
pages = {41--53},
month = SEP,
abstract = {In this paper, we present the results obtained by solving consistent sparse systems of n nonlinear equations F(x) = 0, by a Quasi-Newton method combined with a p block iterative row-projection linear solver of Cimmino type, $1 \le p \ll n$. Under weak regularity conditions for F, it is proved that this Inexact Quasi-Newton method has a local, linear convergence in the energy norm induced by the preconditioned matrix HA, where A is an initial guess of the Jacobian matrix, and it may converge too superlinearly. The matrix $H = [A_1^+, \ldots, A_i^+, \ldots, A_p^+]$, where $A_i^+ = A_i^T (A_i A_i^T)^{-1}$ is the Moore-Penrose pseudo-inverse of the $m_i \times n$ block $A_i$, the preconditioner. A simple partitioning of the Jacobian matrix was used for solving a set of nonlinear test problems with sizes ranging from 1024 to 131 072 on the CRAY T3E under the MPI environment.}
}



@Article{neo01:mpi-tool,
author = {N. Neophytou and P. Evripidou},
title = {{Net-dbx}: A web-based debugger of {MPI} programs over low-bandwidth lines},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 9,
pages = {986--995},
month = SEP,
abstract = {This paper describes Net-dbx, a tool that utilizes Java and other World Wide Web tools for the debugging of MPI programs from anywhere in the Internet. Net-dbx is a source-level interactive debugger with the full power of gdb (the GNU Debugger) augmented with the debug functionality of the public-domain MPI implementation environments. The main effort was on a low overhead, yet powerful, graphical interface supported by low-bandwidth connections. The portability of the tool is of great importance as well because it enables the tool to be used on heterogeneous nodes that participate in an MPI multicomputer. Both needs are satisfied a great deal by the use of WWW browsing tools and the Java programming language. The user of our system simply points his/her browser to the Net-dbx page, logs in to the destination system, and starts debugging by interacting with the tool, just as with any GUI environment. The user can dynamically select which MPI processes to view/debug. A special WWW-based environment has been designed and implemented to host the system prototype.}
}


@Article{ree01:mpi-alg,
author = {J. S. Reeve and A. D. Scurr and J. H. Merlin},
title = {Parallel versions of {S}tone's strongly implicit algorithm},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2001,
volume = 13,
number = 12,
pages = {1049--1062},
month = OCT,
abstract = {In this paper, we describe various methods of deriving a parallel version of Stone's Strongly Implicit Procedure (SIP) for solving sparse linear equations arising from finite difference approximation to partial differential equations (PDEs). Sequential versions of this algorithm have been very successful in solving semi-conductor, heat conduction and flow simulation problems and an efficient parallel version would enable much larger simulations to be run. An initial investigation of various parallelizing strategies was undertaken using a version of high performance Fortran (HPF) and the best methods were reprogrammed using the MPI message passing libraries for increased efficiency. Early attempts concentrated on developing a parallel version of the characteristic wavefront computation pattern of the existing sequential SIP code. However, a red-black ordering of grid points, similar to that used in parallel versions of the Gauss-Seidel algorithm, is shown to be far more efficient. The results of both the wavefront and red-black MPI based algorithms are reported for various size problems and number of processors on a sixteen node IBM SP2.}
}


@Article{kre01:mpi-app,
author = {H. Kremer and F. May and S. Wirtz},
title = {The influence of furnace design on the {NO} formation in high temperature processes},
journal = {Energy Conversion and Management},
year = 2001,
volume = 42,
number = {15--17},
pages = {1937--1952},
month = OCT # "--" # NOV,
abstract = {High temperature processes produce high NO emissions due to their elevated working temperatures. Strong regulations for emissions of pollutants [1] from industrial plants lead the operators to optimize their furnaces. In this paper a three-dimensional mathematical model for turbulent flow and combustion on the basis of turbulence-chemistry interactions and radiative heat transfer taking into account spectral effects of surrounding walls and combustion gases is described. The transport equation for radiative intensity was split into different wavelength ranges. A block-structured finite volume grid with local refinements was used to solve the governing equations. The calculation domain is subdivided into a number of subdomains which are linked within the solver based on the message passing interface (MPI) library. Computed distributions of velocity, temperature, species distribution and heat fluxes are given. Results of a parametric study in a producing horseshoe furnace by increasing the height of the furnace with regard to NO concentration distributions are presented.}
}


@Article{he01:mpi-alg,
author = {X. He and C. H. Huang},
title = {Communication efficient {BSP} algorithm for all nearest smaller values problem},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 10,
pages = {1425--1438},
month = OCT,
abstract = {We present a BSP (Bulk Synchronous Parallel) algorithm for solving the All Nearest Smaller Values Problem (ANSVP), a fundamental problem in both graph theory and computational geometry. Our algorithm achieves optimal sequential computation time and uses only three communication supersteps. In the worst case, each communication phase takes no more than an (n/p + p)-relation, where p is the number of the processors. In addition, our average-case analysis shows that, on random inputs, the expected communication requirements for all three steps are bounded above by a p-relation, which is independent of the problem size n. Experiments have been carried out on an SGI Origin 2000 with 32 R10000 processors and a SUN Enterprise 4000 multiprocessing server supporting 8 UltraSPARC processors, using the MPI libraries. The results clearly demonstrate the communication efficiency and load balancing for computation.}
}


@Article{bea01:mpi-app,
author = {O. Beaumont and V. Boudet and F. Rastello and Y. Robert},
title = {Matrix multiplication on heterogeneous platforms},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 10,
pages = {1033--1051},
month = OCT,
abstract = {In this paper, we address the issue of implementing matrix multiplication on heterogeneous platforms. We target two different classes of heterogeneous computing resources: heterogeneous networks of workstations and collections of heterogeneous clusters. Intuitively, the problem is to load balance the work with different speed resources while minimizing the communication volume. We formally state this problem in a geometric framework and prove its NP-completeness. Next, we introduce a (polynomial) column-based heuristic, which turns out to be very satisfactory: We derive a theoretical performance guarantee for the heuristic and we assess its practical usefulness through MPI experiments.}
}


@Article{ban01:mpi-impl,
author = {M. Banikazemi and R. K. Govindaraju and R. Blackmore and D. K. Panda},
title = {{MPI-LAPI}: An efficient implementation of {MPI} for {IBM} {RS/6000} {SP} systems},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2001,
volume = 12,
number = 10,
pages = {1081--1093},
month = OCT,
abstract = {The IBM RS/6000 SP system is one of the most cost-effective commercially available high performance machines. IBM RS/6000 SP systems support the Message Passing Interface standard (MPI) and LAPI. LAPI is a low level, reliable, and efficient one-sided communication API library implemented on IBM RS/6000 SP systems. This paper explains how the high performance of the LAPI library has been exploited in order to implement the MPI standard more efficiently than the existing MPI. It describes how to avoid unnecessary data copies at both the sending and receiving sides for such an implementation. The resolution of problems arising from the mismatches between the requirements of the MPI standard and the features of LAPI is discussed. As a result of this exercise, certain enhancements to LAPI are identified to enable an efficient implementation of MPI on LAPI. The performance of the new implementation of MPI is compared with that of the underlying LAPI itself. The latency (in polling and interrupt modes) and bandwidth of our new implementation is compared with that of the native MPI implementation on RS/6000 SP systems. The results indicate that the MPI implementation on LAPI performs comparably to or better than the original MPI implementation in most cases. Improvements of up to 17.3 percent in polling mode latency, 35.8 percent in interrupt mode latency, and 20.9 percent in bandwidth are obtained for certain message sizes. The implementation of MPI on top of LAPI also outperforms the native MPI implementation for the NAS Parallel Benchmarks.}
}


@Article{liRa01:mpi-app,
author = {M. Z. Li and O. F. Rana and D. W. Walker},
title = {Wrapping {MPI}-based legacy codes as {Java/CORBA} components},
journal = {Future Generation Computer Systems},
year = 2001,
volume = 18,
number = 2,
pages = {213--223},
month = OCT,
abstract = {Techniques for wrapping an MPI-based molecular dynamics (MD) simulation code as Java/CORBA components, for use within a distributed component based problem solving environment (CB-PSE), is presented. A legacy code for simulating a Lennard-Jones fluid is first wrapped as a single CORBA object, followed by division of the code into computational sub-units, where each sub-unit is wrapped as a CORBA object containing MPI calls, and run on a cluster of workstations - enabling different MPI implementations to inter-operate. Using a Java implementation, users can submit simulation tasks through a Web-based interface, without needing to know implementation details of the legacy code, or the exact interaction between sub-units within the code. We provide performance comparisons of wrapping the entire MD code as a single object versus wrapping sub-units within it, and offer a simple performance model to explain our findings.}
}

@Article{beau01:mpi-app,
author = {O. Beaumont and V. Boudet and A. Petitet and F. Rastello and Y. Robert},
title = {A proposal for a heterogeneous cluster {ScaLAPACK} (dense linear solvers)},
journal = {IEEE Transactions on Computers},
year = 2001,
volume = 50,
number = 10,
pages = {1052--1070},
month = OCT,
abstract = {In this paper, we study the implementation of dense linear algebra kernels, such as matrix multiplication or linear system solvers, on heterogeneous networks of workstations. The uniform block-cyclic data distribution scheme commonly used for homogeneous collections of processors limits the performance of these linear algebra kernels on heterogeneous grids to the speed of the slowest processor. We present and study more sophisticated data allocation strategies that balance the load on heterogeneous platforms with respect to the performance of the processors. When targeting unidimensional grids, the load-balancing problem can be solved rather easily. When targeting two-dimensional grids, which are the key to scalability and efficiency for numerical kernels, the problem turns out to be surprisingly difficult. We formally state the 2D load-balancing problem and prove its NP-completeness. Next, we introduce a data allocation heuristic, which turns out to be very satisfactory: Its practical usefulness is demonstrated by MPI experiments conducted with a heterogeneous network of workstations.}
}

@Article{corn01:mpi-app,
author = {C. F. Cornwell and L. T. Wille and Y. G. Wu and F. H. Sklar},
title = {Parallelization of an ecological landscape model by functional decomposition},
journal = {Ecological Modelling},
year = 2001,
volume = 144,
pages = {13--20},
month = OCT,
abstract = {A functional scheme is described to parallelize computer simulations of grid-based ecological landscape models. The method is implemented using the Message Passing Interface protocol and is applied to the Everglades Landscape Vegetation Model. On a two-processor system, the speed-up is satisfactory and the overall performance of the program is competitive with traditional parallelization techniques such as geometrical decomposition. The method is discussed, timing information is provided for three different parallel machines, and some further developments are indicated.}
}


@Article{sama01:mpi-app,
author = {M. Y. Saman and D. J. Evans},
title = {Distributed computing on cluster systems},
journal = {International Journal of Computer Mathematics},
year = 2001,
volume = 78,
number = 3,
pages = {383--397},
abstract = {Message Passing Interface (MPI) allows a group of computers in a network to be specified as a cluster system. It provides the routines for task activation and communication. Writing programs for a cluster system is a difficult job. In this paper, the Message Passing Interface is presented. Parallel programs using the WMPI, a version of MPI, to solve the pi ($\pi$) calculation, the quick sort algorithm and the Torsion problem are presented. The programs are written and compiled in Microsoft Visual C++.}
}


@Article{raas01:mpi-app,
author = {S. Raasch and M. Schr{\"o}ter},
title = {{PALM}---{A} large-eddy simulation model performing on massively parallel computers},
journal = {Meteorologische Zeitschrift},
year = 2001,
volume = 10,
number = 5,
pages = {363--372},
abstract = {An existing code of a large-eddy simulation (LES) model for the study of turbulent processes in the atmospheric and oceanic boundary layer has been completely recoded for use on massively parallel systems with distributed memory. Parallelization is achieved by two-dimensional domain decomposition and communication is realized by the message passing interface (MPI). Periodic boundary conditions, which are used in both horizontal directions, helped to minimize the parallelization effort. The performance of the new PArallelized LES Model (PALM) is excellent on SGI/Cray-T3E systems and an almost linear speed-up is achieved up to very large numbers of processors. Parallelization strategy and model performance is discussed and validation experiments as well as future applications are presented.}
}

@Article{lu01:mpi-app,
author = {P. Lu},
title = {Integrating bulk-data transfer into the {A}urora distributed shared data system},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 11,
pages = {1609--1632},
month = NOV,
abstract = {The Aurora distributed shared data system implements a shared-data abstraction on distributed-memory platforms, such as clusters, using abstract data types. Aurora programs are written in C++ and instantiate shared-data objects whose data-sharing behaviour can be optimized using a novel technique called scoped behaviour. Each object and each phase of the computation (i.e., use-context) can be independently optimized with per-object and per-context flexibility. Within the scoped behaviour framework, optimizations such as bulk-data transfer can be implemented and made available to the application programmer. Scoped behaviour carries semantic information regarding the specific data-sharing pattern through various layers of software. We describe how the optimizations are integrated from the uppermost application-programmer layers down to the lowest UDP-based layers of the Aurora system. A bulk-data transfer network protocol bypasses some bottlenecks associated with TCP/IP and achieves higher performance on an ATM network than either TreadMarks (distributed shared memory) or MPICH (message passing) for matrix multiplication and parallel sorting.}
}

@Article{brig01:mpi-impl,
author = {R. Brightwell and S. Plimpton},
title = {Scalability and performance of two large {L}inux clusters},
journal = {Journal of Parallel and Distributed Computing},
year = 2001,
volume = 61,
number = 11,
pages = {1546--1569},
month = NOV,
abstract = {In this paper, we present performance results from several parallel benchmarks and applications on two large Linux clusters at Sandia National Laboratories. We compare the results on the Linux clusters to performance obtained on a traditional distributed-memory massively parallel processing machine, the Intel TeraFLOPS. We discuss the characteristics of these machines that influence the performance results and identify the key components of the system that are important to allow for scalability of commodity-based PC clusters to hundreds and possibly thousands of processors.}
}


@Article{diPi01:mpi-app,
author = {M. Di Pierro},
title = {Matrix distributed processing: a set of {C++} tools for implementing generic lattice computations on parallel systems},
journal = {Computer Physics Communications},
year = 2001,
volume = 141,
number = 1,
pages = {98--148},
month = NOV,
abstract = {We present a set of programming tools (classes and functions written in C++ and based on Message Passing Interface) for fast development of generic parallel (and non-parallel) lattice simulations. They are collectively called MDP 1.2. These programming tools include classes and algorithms for matrices, random number generators, distributed lattices (with arbitrary topology), fields and parallel iterations. No previous knowledge of MPI is required in order to use them. Some applications in electromagnetism, electronics, condensed matter and lattice QCD are presented.}
}

% NOTE(review): the citation key prefix "ahan" does not match the first
% author listed below (Zhang). The key is left unchanged so that existing
% citations keep resolving; confirm the author list against the journal.
@Article{ahan01:mpi-app,
author = {X. Zhang and B. Wang and Z. Z. Ji},
title = {Performance of a parallel finite difference atmospheric general circulation model},
journal = {Advances in Atmospheric Sciences},
year = 2001,
volume = 18,
number = 6,
pages = {1175--1184},
abstract = {A new version of the Institute of Atmospheric Physics (IAP) 9-Layer (9L) atmospheric general circulation model (AGCM) suitable for Massively Parallel Processor (MPP) has been developed. This paper presents the principles of the parallel code design and examines its performance on a variety of state-of-the-art parallel computers in China. Domain decomposition strategy is used to achieve parallelism that is implemented by Message Passing Interface (MPI). Only the one dimensional domain decomposition algorithm is shown to scale favorably as the number of processors is increased.}
}

@Article{boul01:mpi-app,
author = {C. Bouldin and J. Sims and H. Hung and J. J. Rehr and A. L. Ankudinov},
title = {Rapid calculation of x-ray absorption near edge structure using parallel computation},
journal = {X-Ray Spectrometry},
year = 2001,
volume = 30,
number = 6,
pages = {431--434},
month = {Nov.-Dec.},
abstract = {Modeling x-ray absorption near edge structure (XANES) requires computationally intensive calculations. We show that parallel processing can reduce the time required for XANES calculations by a factor of up to 50 over standard desktop computers. Parallel processing is implemented in our codes using the Message Passing Interface (MPI) and is portable across most hardware and operating systems. We demonstrate the inverse scaling of the parallel algorithm with the number of processors, and discuss how this approach to parallel processing could be implemented in other multiple-scattering calculations. Faster calculations should improve the applicability of ab initio XANES studies to many materials science problems.}
}

@Article{behr01:mpi-app,
author = {M. Behr},
title = {Stabilized space-time finite element formulations for free-surface flows},
journal = {Communications in Numerical Methods in Engineering},
year = 2001,
volume = 17,
number = 11,
pages = {813--819},
month = NOV,
abstract = {Aspects of a method for 3D finite element computation of unsteady, incompressible free-surface flow are presented. The approach is based on the deformable-spatial-domain/stabilized space-time (DSD/SST) finite element formulation, which takes automatically into account the deformation of the elements in response to the motion of the free surface. The free-surface elevation is governed by a kinematic free-surface condition, which is also solved with a stabilized formulation. A new governing equation and stabilized formulation is derived for cases where the channel walls are not vertical. The parallel implementation based on MPI message-passing standard is fully portable, and have been demonstrated to be scalable on a range of architectures. A 3D computation of a flow past a spillway of a dam is shown as an example application.}
}


@Article{he02:mpi-app,
author = {F. S. He and H. Wu},
title = {An efficient parallel implementation of the {E}verglades {L}andscape {F}ire {M}odel using checkpointing},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {65--82},
month = JAN,
abstract = {This paper presents a low-communication overhead and high-performance data parallelism implementation of the Everglades Landscape Fire Model (ELFM) in a network of workstations (NOWs). ELFM is parallelized under Message Passing Interface (MPI). Checkpointing and rollback technologies are used to handle the spread of fire which is a dynamic and irregular component of the model. A parallel application model with the mixture of a variety of asynchronous and synchronous computation is developed. In this model, the asynchronous computation is dominant and synchronous computation is intermittent. The length of each synchronous computation also varies. Based on the developed model, a synchronous check-pointing mechanism is used in the parallel ELFM code under MPI. A simulation is conducted and results show that the performance of the ELFM under MPI is significantly enhanced by the application of checkpointing and rollback.}
}

@Article{soda02:mpi-app,
author = {A. C. Sodan},
title = {Applications on a multithreaded architecture: A case study with {EARTH-MANNA}},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {3--33},
month = JAN,
abstract = {Multithreading offers benefits with respect to the formulation of irregular dynamic programs and their dynamic scheduling, load balancing and interaction. Furthermore, low-cost communication on distributed-memory machines by remote-memory access is provided by some systems for efficient communication. EARTH is one of the few systems which combines both, while most other systems either focus on communication or provide multithreading in shared-memory environments. Dynamic irregular applications are often awkward to parallelize on distributed memory when using SPMD style programming via MPI and show different requirements for formulation. In addition, dynamic irregular applications also may show a fairly tight data coupling. Systems like EARTH are beneficial then, because they specifically support large number of small data exchanges by providing short startup times and the tolerance of even small latencies (offering very fine-grain threads). However, static regular applications with tight data coupling are supported too. On the example of EARTH, this paper investigates the benefits of low-cost communication and multithreading, parallelizing three AI applications with medium to high communication intensity. We present experimental results obtained on the MANNA machine.}
}

@Article{wang02:mpi-app,
author = {P. Wang and K. Y. Liu and T. Cwik and R. Green},
title = {{MODTRAN} on supercomputers and parallel computers},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {53--64},
month = JAN,
abstract = {To enable efficient reduction of large data sets such as is done in the Airborne Visible/Infrared Imaging Spectrometer (AVIRIS) project at the Jet Propulsion Laboratory (JPL), a high performance version of MODTRAN is essential. One means to accomplish this is to apply the computational resources of parallel computer systems. In our present work, a flexible, parallel version of MODTRAN has been implemented on the Cray T3E, the HP SPP2000, and a Beowulf-class cluster computer using domain decomposition techniques and the Message Passing Interface (MPI) library. In this paper, porting the sequential MODTRAN to various platforms is discussed; strategies of designing a parallel version of MODTRAN are developed; detailed implementation for a parallel MODTRAN is reported, and performance data of the parallel code on various computers are presented. Near linear scaling performance of parallel MODTRAN has been obtained, and comparisons of wallclock time are made among various supercomputers and parallel computers. The parallel version of MODTRAN gives excellent speedup, which dramatically reduces total data processing time for many applications such as the AVIRIS project at JPL.}
}

@Article{acac02:mpi-impl,
author = {M. Acacio and O. Canovas and J. M. Garcia and P. E. Lopez-de-Teruel},
title = {{MPI-Delphi}: an {MPI} implementation for visual programming environments and heterogeneous computing},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 3,
pages = {317--333},
month = JAN,
abstract = {The goal of a parallel program is to reduce the execution time, compared to the fastest sequential program solving the same problem. Parallel programming is growing due to the widespread use of network of workstations (NOWs) or powerful PCs in high-performance computing. Because the hardware components are all commodity devices, NOWs are much more cost-effective than custom machines with similar technology. In this environment, the typical programming model used has been message-passing and the MPI library has become the standard in the distributed-memory computing model. On the other hand, visual programming environments try to simplify the task of developing applications. They provide programmers with several standard components for creating programs. Delphi constitutes one of the most popular visual programming environments nowadays in the Windows market place. In this paper, we present MPI-Delphi, an implementation of MPI for writing parallel applications using Delphi visual programming environment. We show how MPI-Delphi has been developed, and how it makes possible to manage a cluster of homogeneous/heterogeneous PCs. Two examples of use of MPI-Delphi in a heterogeneous cluster of workstations with a mixture of Windows and Linux operating systems are also included. The MPI-Delphi interface is suitable for some specific kinds of problems, such as monitoring parallel programs of long execution time, or computationally intensive graphical simulations. In addition, MPI-Delphi has proven to be a good tool for research, as the development of new algorithms can be carried out quickly and, therefore, time spent on the debugging of such algorithms is reduced. Finally, we conclude by explaining some of the tasks we think MPI-Delphi is suitable for.}
}

@Article{thak02:mpi-impl,
author = {R. Thakur and W. Gropp and E. Lusk},
title = {Optimizing noncontiguous accesses in {MPI-IO}},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 1,
pages = {83--105},
month = JAN,
abstract = {The I/O access patterns of many parallel applications consist of accesses to a large number of small, noncontiguous pieces of data. If an application's I/O needs are met by making many small, distinct I/O requests, however, the I/O performance degrades drastically. To avoid this problem, MPI-IO allows users to access noncontiguous data with a single I/O function call, unlike in Unix I/O. In this paper, we explain how critical this feature of MPI-IO is for high performance and how it enables implementations to perform optimizations. We first provide a classification of the different ways of expressing an application's I/O needs in MPI-IO - we classify them into four levels, called levels 0-3. We demonstrate that, for applications with noncontiguous access patterns, the I/O performance improves dramatically if users write their applications to make level-3 requests (noncontiguous, collective) rather than level-0 requests (Unix style). We then describe how our MPI-IO implementation, ROMIO, delivers high performance for noncontiguous requests. We explain in detail the two key optimizations ROMIO performs: data sieving for noncontiguous requests from one process and collective I/O for noncontiguous requests from multiple processes. We describe how we have implemented these optimizations portably on multiple machines and file systems, controlled their memory requirements, and also achieved high performance. We demonstrate the performance and portability with performance results for three applications - an astrophysics-application template (DIST3D), the NAS BTIO benchmark, and an unstructured code (UNSTRUC) - on five different parallel machines: HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, and SGI Origin2000.}
}

@Article{hell02:mpi-impl,
author = {H. Hellwagner and M. Ohlenroth},
title = {{VI} architecture communication features and performance on the {G}iganet cluster {LAN}},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 3,
pages = {421--433},
month = JAN,
abstract = {The virtual interface (VI) architecture standard was developed to satisfy the need for a high throughput, low latency communication system required for cluster computing. VI architecture aims to close the performance gap between the bandwidths and latencies provided by the communication hardware and visible to the application, respectively, by minimizing the software overhead on the critical path of the communication. This paper presents the results of a performance study of one VI architecture hardware implementation, the Giganet cLAN (cluster LAN). The focus of the study is to assess and compare the performance of different VI architecture data transfer modes and specific features that are available to higher-level communication software like MPI in order to aid the implementor to decide which VI architecture options to employ for various communication scenarios. Examples of such options include the use of send/receive vs. RDMA data transfers, polling vs. blocking to check completion of communication operations, multiple VIs, completion queues and scatter capabilities of VI architecture.}
}

@Article{liLi01:mpi-app,
author = {Y. M. Li and J. L. Liu and T. S. Chao and S. M. Sze},
title = {A new parallel adaptive finite volume method for the numerical simulation of semiconductor devices},
journal = {Computer Physics Communications},
year = 2001,
volume = 142,
number = {1--3},
pages = {285--289},
month = DEC,
abstract = {Based on adaptive finite volume approximation, a posteriori error estimation, and monotone iteration, a novel system is proposed for parallel simulations of semiconductor devices. The system has two distinct parallel algorithms to perform a complete set of I-V simulations for any specific device model. The first algorithm is a domain decomposition on 1-irregular unstructured meshes whereas the second is a parallelization of multiple I-V points. Implemented on a Linux cluster using message passing interface libraries, both algorithms are shown to have excellent balances on dynamic loading and hence result in efficient speedup. Compared with measurement data, computational results of sub-micron MOSFET devices are given to demonstrate the accuracy and efficiency of the system.}
}

@Article{iovi01:mpi-app,
author = {M. Iovieno and C. Cavazzoni and D. Tordella},
title = {A new technique for a parallel dealiased pseudospectral {N}avier-{S}tokes code},
journal = {Computer Physics Communications},
year = 2001,
volume = 141,
number = 3,
pages = {365--374},
month = DEC,
abstract = {A novel aspect of a parallel procedure for the numerical simulation of the solution of the Navier-Stokes equations through the Fourier-Galerkin pseudospectral method is presented. It consists of a dealiased ("3/2" rule) transposition of the data that organizes the computations in the distributed direction in such a way that whenever a Fast Fourier Transform must be calculated, the algorithm will employ data stored solely an the proper memory of the processor which is computing it. This provide for the employment of standard routines for the computations of the Fourier transform. The aliasing removal procedure has been directly inserted into the transposition algorithm. The code is written for distributed memory computers, but not specifically for a peculiar architecture. The use on a variety of machines is allowed by the adoption of the Message Passing Interface library. The portability of the code is demonstrated by the similar performances, in particular the high efficiency, that all the machines tested show up to a number of parallel processors equal to 1/2 the truncation parameter N/2. Explicit time integration is used. The present code organization is relevant to physical and mathematical problems which require a three dimensional spectral treatment.}
}

@Article{kepk01:mpi-app,
author = {A. Kepkep and U. Ravaioli and B. Winstead},
title = {Cluster-based parallel 3-{D} {M}onte {C}arlo device simulation},
journal = {VLSI Design},
year = 2001,
volume = 13,
number = {1--4},
pages = {51--56},
abstract = {The recent improvements in the performance of commodity computer have created very favorable conditions for building high performance parallel machines from computer clusters. These are very attractive for 3-D device simulation, necessary to model properly carrier-carrier interaction and granular doping effects in deeply scaled silicon devices. We have developed a parallel 3-D Monte Carlo simulation environment customized for clusters using the Message Passing Library (MPI). The code has been tested on the supercluster of NCSA at the University of Illinois. We present here test results for an n-i-n diode structure, along with an analysis of performance for two different domain decomposition schemes.}
}

@Article{beck02:mpi-app,
author = {M. Be{\v{c}}ka and G. Ok{\v{s}}a and M. Vajter{\v{s}}ic},
title = {Dynamic ordering for a parallel block-{J}acobi {SVD} algorithm},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {243--262},
month = FEB,
abstract = {A new approach for the parallel computation of singular value decomposition (SVD) of matrix $A \in C^{m \times n}$ is proposed. Contrary to the known algorithms that use a static cyclic ordering of subproblems simultaneously solved in one iteration step, the proposed implementation of the two-sided block-Jacobi method uses a dynamic ordering of subproblems. The dynamic ordering takes into account the actual status of matrix A. In each iteration step, a set of the off-diagonal blocks is determined that reduces the Frobenius norm of the off-diagonal elements of A as much as possible and, at the same time, can be annihilated concurrently. The solution of this task is equivalent to the solution of the maximum-weight perfect matching problem. The greedy algorithm for the efficient solution of this problem is presented. The computational experiments with both types of ordering, incorporated into the two-sided block-Jacobi method, were performed on an SGI - Cray Origin 2000 parallel computer using the Message Passing Interface (MPI). The results confirm that the dynamic ordering is much more efficient with regard to the amount of work required for the computation of SVD of a given accuracy than the static cyclic ordering.}
}

@Article{lian02:mpi-app,
author = {Y. Liang and J. Weston and M. Szularz},
title = {Generalized least-squares polynomial preconditioners for symmetric indefinite linear equations},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {323--341},
month = FEB,
abstract = {Polynomial preconditioners are frequently used in a parallel environment for the computation of the solution of large-scale sparse linear equations (Ax = b) because of their easy implementation and trivial parallelization. With respect to symmetrical indefinite (SID) linear systems, the use of generalized least-squares (GLS) polynomial preconditioning is preferable to other polynomial preconditioning methods because of the ability to use a three-term recurrence relationship and the low implementation costs. The GLS preconditioning polynomial and its influence on the flexible generalized minimized residual (FGMRES) solver are discussed in this paper. The orthogonal polynomials required in the solution of the least-squares approximation problem are constructed using the Stieltjes procedure in multiple disjoint intervals which exclude the origin. The time-consuming numerical integration associated with this procedure is computed efficiently using Chebyshev polynomials of the first kind and the GLS polynomial preconditioned FGMRES algorithm is implemented using MPI in a highly parallel IBM SP2 environment. Experimental results using classical benchmark systems are presented and compared with those obtained using the recently developed SPAI preconditioned Bi-CGSTAB iterative method. The performance of the GLS preconditioned FGMRES solver is critically assessed.}
}

@Article{beka02:mpi-app,
author = {C. Bekas and E. Gallopoulos},
title = {Parallel computation of pseudospectra by fast descent},
journal = {Parallel Computing},
year = 2002,
volume = 28,
number = 2,
pages = {223--242},
month = FEB,
abstract = {The pseudospectrum descent method (PsDM) is proposed, a new parallel method for the computation of pseudospectra. The idea behind the method is to use points from an already existing pseudospectrum level curve $\partial\Lambda_{\epsilon}(A)$, to generate in parallel the points of a new level curve $\partial\Lambda_{\delta}(A)$ such that $\delta < \epsilon$. This process can be continued for several steps to approximate several pseudospectrum level curves lying inside the original curve. It is showed via theoretical analysis and experimental evidence that PsDM is embarrassingly parallel, like GRID, and that it adjusts to the geometric characteristics of the pseudospectrum; in particular it captures disconnected components. Results obtained on a parallel system using MPI validate the theoretical analysis and demonstrate interesting load-balancing issues.}
}

@Article{jian02:mpi-app,
author = {D. Jiang and W. Meleis and M. El-Shenawee and E. Mizan and A. Ashouei and C. Rappaport},
title = {Parallel implementation of the steepest descent fast multipole method ({SDFMM}) on a {B}eowulf cluster for subsurface sensing applications},
journal = {IEEE Microwave and Wireless Components Letters},
year = 2002,
volume = 12,
number = 1,
pages = {24--26},
month = JAN,
abstract = {We present the parallel, MPI-based implementation of the SDFMM computer code using a thirty two-node Intel Pentium-based Beowulf cluster. The SDFMM is a fast algorithm that is a hybridization of the method of moments (MoMs), the fast multipole method (FMM), and the steepest descent integration path (SDP), which is used to solve large-scale linear systems of equations produced in electromagnetic scattering problems. An overall speedup of 7.2 has been achieved on the 32-processor Beowulf cluster and a significant reduced runtime is achieved on the 4-processor 667 MHz Alpha workstation.}
}


@Article{dehn02:mpi-app,
author = {F. Dehne and T. Eavis and S. Hambrusch and A. Rau-Chaplin},
title = {Parallelizing the data cube},
journal = {Distributed and Parallel Databases},
year = 2002,
volume = 11,
number = 2,
month = MAR,
pages = {181--201},
abstract = {We have implemented our parallel top-down data cube construction method in C++ with the MPI message passing library for communication and the LEDA library for the required graph algorithms. We tested our code on an eight processor cluster, using a variety of different data sets with a range of sizes, dimensions, density, and skew. Comparison tests were performed on a SunFire 6800. The tests show that our partitioning strategies generate a close to optimal load balance between processors. The actual run times observed show an optimal speedup of p.}
}

@Article{dewa02:mpi-app,
author = {Y. K. Dewaraja and M. Ljungberg and A. Majumdar and A. Bose and K. F. Koral},
title = {A parallel {M}onte {C}arlo code for planar and {SPECT} imaging: implementation, verification and applications in {I-131 SPECT}},
journal = {Computer Methods and Programs in Biomedicine},
year = 2002,
volume = 67,
number = 2,
pages = {115--124},
month = FEB,
abstract = {This paper reports the implementation of the SIMIND Monte Carlo code on an IBM SP2 distributed memory parallel computer. Basic aspects of running Monte Carlo particle transport calculations on parallel architectures are described. Our parallelization is based on equally partitioning photons among the processors and uses the Message Passing Interface (MPI) library for interprocessor communication and the Scalable Parallel Random Number Generator (SPRNG) to generate uncorrelated random number streams. These parallelization techniques are also applicable to other distributed memory architectures. A linear increase in computing speed with the number of processors is demonstrated for up to 32 processors. This speed-up is especially significant in Single Photon Emission Computed Tomography (SPECT) simulations involving higher energy photon emitters, where explicit modeling of the phantom and collimator is required. For I-131, the accuracy of the parallel code is demonstrated by comparing simulated and experimental SPECT images from a heart/thorax phantom. Clinically realistic SPECT simulations using the voxel-man phantom are carried out to assess scatter and attenuation correction.}
}

@Article{slot02:mpi-app,
author = {J. Slottow and A. Shahriari and M. Stein and X. Chen and C. Thomas and P. B. Ender},
title = {Instrumenting and tuning {dataView} - a networked application for navigating through large scientific datasets},
journal = {Software-Practice \& Experience},
year = 2002,
volume = 32,
number = 2,
month = FEB,
pages = {165--190},
abstract = {This paper describes how we instrumented and tuned the code for improved performance in a networked environment. We report on how we measured network performance, first by inducing network delay and then by running the dataView client component in Washington DC and the compute components in Los Angeles. We report on the effect that tile size, level of detail, and client CPU speed have on performance. We analyze what happens when the geometry computation is performed in parallel using MPI (Message Passing Interface) vs. in serial, and discuss the effect on performance of adding additional computational nodes. }
}

@Article{shan02:mpi-openmp,
author = {H. Z. Shan and J. P. Singh and L. Oliker and R. Biswas},
title = {A comparison of three programming models for adaptive applications on the {Origin2000}},
journal = {Journal of Parallel and Distributed Computing},
year = 2002,
volume = 62,
number = 2,
pages = {241--266},
month = FEB,
abstract = {Adaptive applications have computational workloads and communication patterns that change unpredictably at runtime, requiring dynamic load balancing to achieve scalable performance on parallel machines. Efficient parallel implementations of such adaptive applications is therefore a challenging task. In this paper, we compare the performance of and the programming effort required for two major classes of adaptive applications under three leading parallel programming models on an SGI Origin2000 system, a machine that supports all three models efficiently. Results indicate that the three models deliver comparable performance; however, the implementations differ significantly beyond merely using explicit messages versus implicit loads/stores even though the basic parallel algorithms are similar. Compared with the message-passing (using MPI) and SHMEM programming models, the cache-coherent shared address space (CC-SAS) model provides substantial ease of programming at both the conceptual and program orchestration levels, often accompanied by performance gains. However, CC-SAS currently has portability limitations and may suffer from poor spatial locality of physically distributed shared data on large numbers of processors.}
}

@Article{tan02:mpi-app,
author = {C. J. K. Tan},
title = {Solving systems of linear equations with relaxed {M}onte {C}arlo method},
journal = {Journal of Supercomputing},
year = 2002,
volume = 22,
number = 1,
pages = {111--123},
month = MAY,
abstract = {The problem of solving systems of linear algebraic equations by parallel Monte Carlo numerical methods is considered. A parallel Monte Carlo method with relaxation is presented. This is a report of a research in progress, showing the effectiveness of this algorithm. Theoretical justification of this algorithm and numerical experiments are presented. The algorithms were implemented on a cluster of workstations using MPI.}
}

@Article{chen01:mpi-app,
author = {D. Chen and T. Aoki and N. Homma and T. Higuchi},
title = {Pragmatic method for the design of fast constant-coefficient combinational multipliers},
journal = {IEE Proceedings-Computers and Digital Techniques},
year = 2001,
volume = 148,
number = 6,
pages = {196--206},
month = NOV,
abstract = {To characterise and analyse the performance of evolutionary graph generation (EGG) on a cluster of PCs, a parallel version of the EGG system, called the distributed EGG (DEGG) system, has been developed using a message-passing interface (MPI). To demonstrate the capability of DEGG, it is applied to find the optimal design of various multipliers. Experimental results substantially clarify that the DEGG system consistently performs better than the EGG system. Moreover, the ability and solution quality of the DEGG system's search can be further enhanced by the use of the self-adaptation mechanism of operator probabilities.}
}

% NOTE(review): the surnames below may be truncated by the export (possibly
% "de la Fuente Marcos" for the first and third authors) -- confirm against
% the journal before changing the author field.
@Article{marc02:mpi-app,
author = {C. D. Marcos and P. Barge and R. D. Marcos},
title = {Dust dynamics in protoplanetary disks: Parallel computing with {PVM}},
journal = {Journal of Computational Physics},
year = 2002,
volume = 176,
number = 2,
pages = {276--294},
month = MAR,
abstract = {We describe a parallel version of our high-order-accuracy particle-mesh code for the simulation of collisionless protoplanetary disks. We use this code to carry out a massively parallel, two-dimensional, time-dependent, numerical simulation, which includes dust particles, to study the potential role of large-scale, gaseous vortices in protoplanetary disks. This noncollisional problem is easy to parallelize on message-passing multicomputer architectures. We performed the simulations on a cache-coherent nonuniform memory access Origin 2000 machine, using both the parallel virtual machine (PVM) and message-passing interface (MPI) message-passing libraries. Our performance analysis suggests that, for our problem, PVM is about 25\% faster than MPI. Using PVM and MPI made it possible to reduce CPU time and increase code performance. This allows for simulations with a large number of particles ($N \sim 10^5$--$10^6$) in reasonable CPU times. The performances of our implementation of the parallel code on an Origin 2000 supercomputer are presented and discussed. They exhibit very good speedup behavior and low load unbalancing. Our results confirm that giant gaseous vortices can play a dominant role in giant planet formation.}
}

@Article{ozyo02:mpi-app,
author = {Y. Ozyoruk},
title = {Parallel computation of forward radiated noise of ducted fans with acoustic treatment},
journal = {AIAA Journal},
year = 2002,
volume = 40,
number = 3,
pages = {450--455},
month = MAR,
abstract = {Forward radiated noise of ducted fans is computed numerically on parallel processors solving the three-dimensional, time-dependent Euler equations in body-conformed coordinates with a fourth-order-accurate, finite-difference, Runge-Kutta time-integration scheme. Sound attenuation effects of inlet wall acoustic treatment are included in computations employing a time-discrete form of the standard impedance condition. A distributed computing approach with domain decomposition is used for integrating the equations in parallel using the message passing interface library routines. The abilities of the method are demonstrated with hard- and soft-wall simulations of the JT15D inlet, including flow effects.}
}

@InProceedings{Jones97,
author = "Chris R. Jones and Ambuj K. Singh and Divyakant Agrawal",
title = "Low Latency {MPI} for {Meiko} {CS/2} and {ATM} Clusters",
booktitle = "Proceedings of the 11th International Parallel Processing Symposium (IPPS'97)",
publisher = "The Institute of Electrical and Electronics Engineers",
address = "Geneva, Switzerland",
month = apr,
year = "1997",
keywords = "CD-ROM, I/O and Message Passing,",
abstract = "Contains a good overview of existing MPI implementations. Uses a Direct Memory Access method. In order to minimize latency: overlap the transfer of data and send envelope. And this only if the message size is above a certain threshold. First, sending and match envelopes, then DMA.",
}

@InProceedings{Dowd96,
author = "P. W. Dowd and T. M. Carrozzi and F. A. Pellegrino and A. X. Chen",
title = "Native {ATM} Application Programmer Interface Testbed for Cluster-Based Computing",
booktitle = "Proc. 10th Int. Parallel Processing Symp. (IPPS'96) CD-ROM",
publisher = "IEEE",
address = "Honolulu, HI",
month = apr,
year = "1996",
keywords = "Clusters and Domain Decomposition,",
}

@Article{Cotronis:1998:DMA,
  author          = {Y. Cotronis},
  title           = {Developing Message-Passing Applications on {MPICH} under Ensemble},
  journal         = {Lecture Notes in Computer Science},
  year            = {1998},
  volume          = {1497},
  pages           = {145--??},
  ISSN            = {0302-9743},
  coden           = {LNCSD9},
  bibdate         = {Tue Jan 5 08:21:58 MST 1999},
  acknowledgement = ack-nhfb,
}

@InProceedings{Roy:2000:MGQ,
  author          = {Alain J. Roy and Ian Foster and William Gropp and Nicholas Karonis and Volker Sander and Brian Toonen},
  title           = {{MPICH-GQ}: Quality-of-Service for Message Passing Programs},
  booktitle       = {{SC2000}: High Performance Networking and Computing. Dallas Convention Center, Dallas, {TX}, {USA}, November 4--10, 2000},
  editor          = {{ACM}},
  year            = {2000},
  pages           = {54--54},
  publisher       = {ACM Press and IEEE Computer Society Press},
  address         = {New York, NY 10036, USA and 1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA},
  url             = {http://www.sc2000.org/proceedings/techpapr/papers/pap234.pdf},
  bibdate         = {Mon Feb 12 11:57:43 2001},
  acknowledgement = ack-nhfb,
}


@InProceedings{IPDPS01*51,
author = "Olivier Aumage and Guillaume Mercier and Raymond Namyst",
title = "{MPICH/Madeleine}: a True {Multi-Protocol} {MPI} for High Performance Networks",
pages = "51--51",
booktitle = "Proceedings of the 15th International Parallel \& Distributed Processing Symposium ({IPDPS}-01)",
month = apr # "~23--27",
publisher = "IEEE Computer Society",
address = "Los Alamitos, CA",
year = "2001",
}

@InProceedings{LINUX-00*353,
author = "Hong Ong and Paul A. Farrell",
title = "Performance Comparison of {LAM/MPI}, {MPICH}, and {MVICH} on a Linux Cluster Connected by a Gigabit Ethernet Network",
pages = "353--362",
booktitle = "Proceedings of the 4th Annual Showcase \& Conference ({LINUX}-00)",
month = oct # "~10--14",
publisher = "The USENIX Association",
address = "Berkeley, CA",
year = "2000",
}

@Article{Gropp:1997:SMC,
  author          = {W. Gropp and E. Lusk},
  title           = {Sowing {MPICH}: {A} Case Study in the Dissemination of a Portable Environment for Parallel Scientific Computing},
  journal         = {The International Journal of Supercomputer Applications and High Performance Computing},
  year            = {1997},
  month           = {Summer},
  volume          = {11},
  number          = {2},
  pages           = {103--114},
  ISSN            = {1078-3482},
  coden           = {IJSCFG},
  bibdate         = {Thu Jun 26 18:17:48 1997},
  acknowledgement = ack-nhfb,
}

@Article{Gropp97,
  author   = {William Gropp and Ewing Lusk},
  title    = {A high-performance {MPI} implementation on a shared-memory vector supercomputer},
  journal  = {Parallel Computing},
  year     = {1997},
  month    = jan,
  volume   = {22},
  number   = {11},
  pages    = {1513--1526},
  keywords = {practical aspects/experiences; message-passing interface; shared memory multiprocessor; NEC SX-4; MPICH; performance; implementation;},
}

@InProceedings{Foster98a,
author = "Ian Foster and Nicholas T. Karonis",
title = "A Grid-Enabled {MPI}: Message Passing in Heterogeneous Distributed Computing Systems",
booktitle = "Proceedings of Supercomputing'98 (CD-ROM)",
publisher = "ACM SIGARCH and IEEE",
address = "Orlando, FL",
month = nov,
year = "1998",
keywords = "metacomputing, Message Passing Interface, MPI, Globus, computational grids, MPICH,",
abstract = "Application development for high-performance distributed computing systems, or computational grids as they are sometimes called, requires ``grid-enabled'' tools that hide mundane aspects of the heterogeneous grid environment without compromising performance. As part of an investigation of these issues, we have developed MPICH-G, a grid-enabled implementation of the Message Passing Interface (MPI) that allows a user to run MPI programs across multiple computers at different sites using the same commands that would be used on a parallel computer. This library extends the Argonne MPICH implementation of MPI to use services provided by the Globus grid toolkit. In this paper, we describe the MPICH-G implementation and present preliminary performance results.",
note = "Argonne National Laboratory",
}

@TechReport{Nog96,
author = "Saurab Nog and David Kotz",
title = "A Performance Comparison of {TCP}/{IP} and {MPI} on {FDDI}, {Fast Ethernet}, and {Ethernet}",
institution = "Dartmouth College",
number = "PCS-TR95-273",
month = jan,
year = "1996",
keywords = "latency / bandwidth measurements for MPI/Ethernet,",
url = "http://www.cs.dartmouth.edu/reports/abstracts/PCS-TR95-273.html",
abstract = "Communication is a very important factor affecting distributed applications. Getting a close handle on network performance (both bandwidth and latency) is thus crucial to understanding overall application performance. We benchmarked some of the metrics of network performance using two sets of experiments, namely roundtrip and datahose. The tests were designed to measure a combination of network latency, bandwidth, and contention. We repeated the tests for two protocols (TCP/IP and MPI) and three networks (100 Mbit FDDI (Fiber Distributed Data Interface), 100 Mbit Fast Ethernet, and 10 Mbit Ethernet). The performance results provided interesting insights into the behavior of these networks under different load conditions and the software overheads associated with an MPI implementation (MPICH). This document presents details about the experiments, their results, and our analysis of the performance.\par Revised on 1/8/96 to emphasize our use of a particular MPI implementation, MPICH.",
}

@TechReport{ncstrl.cornell.tc//96-239,
type = "Technical Report",
number = "96-239",
title = "{MultiMATLAB}: {MATLAB} on Multiple Processors",
language = "English",
month = may,
notes = "PostScript",
pages = "16",
year = "1996",
bibdate = "May 30, 1996",
author = "Anne E. Trefethen and Vijay S. Menon and Chi-Chao Chang and Grzegorz J. Czajkowski and Chris Myers and Lloyd N. Trefethen",
abstract = "MATLAB(R), a commercial product of The MathWorks, Inc., has become one of the principal languages of desktop scientific computing. A system is described that enables one to run MATLAB conveniently on multiple processors. Using short, MATLAB-style commands like Eval, Send, Recv, Bcast, Min, and Sum, the user operating within one MATLAB session can start various processes in a fashion that maintains MATLAB's traditional user-friendliness. Multi-processor graphics is also supported. The system currently runs under MPICH on an IBM SP2 or a network of Unix workstations, and extensions are planned to networks of PCs. MultiMATLAB is potentially useful for education in parallel programming, for prototyping parallel algorithms, and for fast and convenient execution of easily parallelizable numerical computations on multiple processors.

Keywords: MATLAB, MultiMATLAB, SP2, message passing, MPI, MPICH",

institution = "Cornell Theory Center",
}

@InProceedings{Bhandarkar:1996:MPM,
  author          = {M. A. Bhandarkar and L. V. Kale},
  title           = {{MICE}: a prototype {MPI} implementation in {Converse} environment},
  booktitle       = {Proceedings. Second {MPI} Developer's Conference: Notre Dame, {IN}, {USA}, 1--2 July 1996},
  editor          = {{IEEE}},
  year            = {1996},
  pages           = {26--31},
  publisher       = {IEEE Computer Society Press},
  address         = {1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA},
  ISBN            = {0-8186-7533-0},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  acknowledgement = ack-nhfb,
  classification  = {C6110P (Parallel programming); C6115 (Programming support); C6150E (General utility programs); C6150N (Distributed systems software)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Dept. of Comput. Sci., Illinois Univ., Urbana, IL, USA},
  keywords        = {Abstract Device Interface; application program interfaces; communication; computations; Converse interoperable parallel programming environment; message managers; message passing; MICE; MPI modules; MPICH; multi-threaded MPI programs; open systems; parallel programming; programming environments; prototype MPI implementation; public-domain MPI implementation; PVM interoperation; thread objects; utility programs},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed Process},
  treatment       = {P Practical},
}

@TechReport{UTEXAS_CS//CS-TR-95-22,
  author      = {Prasenjit Mitra and David Payne and Lance Shuler and Robert van de Geijn and Jerrell Watts},
  title       = {Fast Collective Communication Libraries, Please},
  institution = {University of Texas, Austin},
  type        = {Technical Report},
  number      = {CS-TR-95-22},
  month       = jun # " 1,",
  year        = {1995},
  url         = {ftp://ftp.cs.utexas.edu/pub/techreports/tr95-22.ps.Z},
  bibdate     = {November 24, 98},
  abstract    = {It has been recognized that many parallel numerical algorithms can be effectively implemented by formulating the required communication as collective communications. Nonetheless, the efficiency of such communications has been suboptimal in many communication library implementations. In this paper, we give a brief overview of techniques that can be used to implement a high performance collective communication library, the iCC library, developed for the Intel family of parallel supercomputers as part of the InterCom project at the University of Texas at Austin. We compare the achieved performance on the Intel Paragon to those of three widely available libraries: Intel's NX collective communication library, the MPICH Message Passing Interface (MPI) implementation developed at Argonne and Mississippi State University and a Basic Linear Algebra Communication Subprograms (BLACS) implementation, developed at the University of Tennessee.},
}

@InProceedings{Skjellum:1996:TTM,
  author          = {A. Skjellum and B. Protopopov and S. Hebert},
  title           = {A thread taxonomy for {MPI}},
  booktitle       = {Proceedings. Second {MPI} Developer's Conference: Notre Dame, {IN}, {USA}, 1--2 July 1996},
  editor          = {{IEEE}},
  year            = {1996},
  pages           = {50--57},
  publisher       = {IEEE Computer Society Press},
  address         = {1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA},
  ISBN            = {0-8186-7533-0},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  acknowledgement = ack-nhfb,
  classification  = {C6110B (Software engineering techniques); C6110F (Formal methods); C6150E (General utility programs); C6150J (Operating systems); C6150N (Distributed systems software)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Dept. of Comput. Sci., Mississippi State Univ., MS, USA},
  keywords        = {API extensions; application program interfaces; Channel Device; computational unit; fine-grain concurrency; formal specification; message passing; minimal portable thread management; MPI; MPICH; multi-threaded thread-safe ADI; non-thread-safe MPI call semantics; resource container; software portability; synchronisation; synchronization mechanisms; thread models; thread safety; thread taxonomy; user-level mechanism; utility programs; Windows NT version},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed Process},
  treatment       = {P Practical},
}

@Article{Foster:1997:MMC,
  author          = {Ian Foster and Jonathan Geisler and Carl Kesselman and Steven Tuecke},
  title           = {Managing Multiple Communication Methods in High-Performance Networked Computing Systems},
  journal         = {Journal of Parallel and Distributed Computing},
  year            = {1997},
  month           = jan,
  day             = {10},
  volume          = {40},
  number          = {1},
  pages           = {35--48},
  doi             = {10.1006/jpdc.1996.1266},
  url             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref},
  ISSN            = {0743-7315},
  coden           = {JPDCER},
  bibdate         = {Thu Mar 9 09:19:01 MST 2000},
  acknowledgement = ack-nhfb,
  classification  = {B6150M (Protocols); B6210L (Computer communications); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing); C5640 (Protocols); C5670 (Network performance)},
  corpsource      = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL, USA},
  keywords        = {Argonne MPICH library; computer networks; computing systems; criteria; heterogeneous networked environment; high-performance networked; message passing; message passing interface; multimethod communication; multiple communication methods; multithreaded runtime system; networked computing environments; Nexus; Nexus-based MPI implementation; performance characteristics; performance evaluation; protocols; remote service request mechanisms; transport mechanisms; user-specified selection},
  treatment       = {P Practical},
}

@InProceedings{Foster:1996:MIW,
  author          = {I. Foster and J. Geisler and S. Tuecke},
  title           = {{MPI} on the {I-WAY}: a wide-area, multimethod implementation of the {Message Passing Interface}},
  booktitle       = {Proceedings. Second {MPI} Developer's Conference: Notre Dame, {IN}, {USA}, 1--2 July 1996},
  editor          = {{IEEE}},
  year            = {1996},
  pages           = {10--17},
  publisher       = {IEEE Computer Society Press},
  address         = {1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA},
  ISBN            = {0-8186-7533-0},
  bibdate         = {Sat Apr 19 16:34:54 MDT 1997},
  acknowledgement = ack-nhfb,
  classification  = {C5620W (Other computer networks); C6110B (Software engineering techniques); C6115 (Programming support); C6130S (Data security); C6150E (General utility programs); C6150N (Distributed systems software)},
  conftitle       = {Proceedings. Second MPI Developer's Conference},
  corpsource      = {Argonne Nat. Lab., IL, USA},
  keywords        = {application program interfaces; authentication; automatic configuration mechanisms; communication mechanisms; geographically distributed computing resources; geographically distributed database resources; geographically distributed graphics resources; geographically distributed networking; heterogeneous systems; high-speed wide-area networks; I-WAY distributed- computing experiment; message authentication; message passing; Message Passing Interface; MPICH; Nexus multithreaded runtime system; parallel programming; portable high-performance programming model; process creation; programming environments; software environment; software libraries; utility programs; wide area networks},
  sponsororg      = {IEEE Comput. Soc. Tech. Committee on Distributed Process},
  treatment       = {P Practical},
}

@InProceedings{EVL-1999-99,
year = "1999",
title = "Numerical Relativity in a Distributed Environment",
author = "W. Benger and I. Foster and J. Novotny and E. Seidel and J. Shalf and W. Smith and P. Walker",
url = "http://visinfo.zib.de/EVlib/Show?EVL-1999-99",
abstract = "The Cactus parallel simulation framework provides a modular and extensible set of components for solving relativity problems on parallel computers. In recent work, we have investigated techniques that would enable the execution of Cactus applications in wide area {"}computational grid{"} environments. In a first study, we investigated the feasibility of distributing a single simulation across multiple supercomputers, while in a second we studied techniques for reducing communication costs associated with remote visualization and steering. Distributed simulation was achieved by using MPICH-G, an implementation of the Message Passing Interface standard that uses mechanisms provided by the Globus grid toolkit to enable wide area execution. Experiments were performed across SGI Origins and Cray T3Es with geographical separations ranging from hundreds to thousands of kilometres. Total execution time when distributed increased by between 18\% and 133\%, depending on configuration. We view these results as encouraging as they were obtained with essentially no specialized algorithmic structures in the Cactus application. Work on remote visualization focused on the development of a Cactus module that computes isosurfaces inline with numerical relativity calculations. Experiments demonstrated that this technique can reduce network bandwidth requirements by a factor ranging from 2.5 to 114, depending on the nature of the problem.",
month = mar,
booktitle = "Proceedings of the Ninth SIAM Conference on Parallel Processing for Scientific Computing",
}

@Article{Gropp:1996:HPP,
author = "William Gropp and Ewing Lusk and Nathan Doss and Anthony Skjellum",
title = "High-performance, portable implementation of the {MPI} {Message Passing Interface Standard}",
journal = "Parallel Computing",
volume = "22",
number = "6",
pages = "789--828",
day = "20",
month = sep,
year = "1996",
coden = "PACOEJ",
ISSN = "0167-8191",
bibdate = "Fri Aug 6 10:15:01 MDT 1999",
url = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=6&aid=1075",
acknowledgement = ack-nhfb,
affiliation = "Argonne Natl Lab",
affiliationaddress = "Argonne, IL, USA",
classification = "722.2; 722.4; 723; 723.1; 723.2; 902.2; C6110B (Software engineering techniques); C6110P (Parallel programming); C6115 (Programming support); C6150N (Distributed systems software)",
corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL, USA",
journalabr = "Parallel Comput",
keywords = "applications; Computer programming; Computer software portability; Data communication systems; design goal; distribution; environments; free; future developments; high-performance portable implementation; Interfaces (computer); library writers; message passing; Message passing interface; MPI message; MPI-2; MPICH; parallel computer vendors; Parallel processing systems; parallel programming; Parallel programming environment; passing interface standard; portable parallel programming environment; programming; project management; software libraries; software performance evaluation; software portability; software standards; software tools; specialists; specification; standard library; Standards",
treatment = "P Practical",
}

@InProceedings{Husbands98,
author = "Parry J. Husbands and James C. Hoe",
title = "{MPI-StarT}: Delivering Network Performance to Numerical Applications",
booktitle = "Proceedings of Supercomputing'98 (CD-ROM)",
publisher = "ACM SIGARCH and IEEE",
address = "Orlando, FL",
month = nov,
year = "1998",
keywords = "networks, MPI, MPICH, MITMatlab, StarT-X, performance, clustering, SMP,",
abstract = "We describe an MPI implementation for a cluster of SMPs interconnected by a high-performance interconnect. This work is a collaboration between a numerical applications programmer and a cluster interconnect architect. The collaboration started with the modest goal of satisfying the communication needs of a specific numerical application, MITMatlab. However, by supporting the MPI standard MPI-StarT readily extends support to a host of applications. MPI-StarT is derived from MPICH by developing a custom implementation of the Channel Interface. Some changes in MPICH's ADI and Protocol Layers are also necessary for correct and optimal operation.\par MPI-StarT relies on the host SMPs' shared memory mechanism for intra-SMP communication. Inter-SMP communication is supported through StarT-X. The StarT-X NIU allows a cluster of PCI-equipped host platforms to communicate over the Arctic Switch Fabric. Currently, StarT-X is utilized by a cluster of SUN E5000 SMPs as well as a cluster of Intel Pentium-II workstations. On a SUN E5000 with StarT-X, a processor can send and receive a 64-byte message in less than 0.4 and 3.5 usec respectively and incur less than 5.6 usec user-to-user one-way latency. StarT-X's remote memory-to-memory DMA mechanism can transfer large data blocks at 60 MByte/sec between SUN E5000s.\par This paper outlines our effort to preserve and deliver this level of communication performance through MPI-StarT to user applications. We have studied the requirements of MITMatlab and the capabilities of StarT-X and have formulated an implementation strategy for the Channel Interface. In this paper, we discuss some performance and correctness issues and their resolutions in MPI-StarT. The correctness issues range from the handling of arbitrarily large message sizes to deadlock-free support of nonblocking MPI operations. 
Performance optimizations include a shared-memory-based transport mechanism for intra-SMP communication and a broadcast mechanism that is aware of the performance difference between intra-SMP and the slower inter-SMP communication.\par We characterize the performance of MPI-StarT on a cluster of SUN E5000s. On SUN E5000s, MPI processes within the same SMP can communicate at over 150 MByte/sec using shared memory. When communicating between SMPs over StarT-X, MPI-StarT has a peak bandwidth of 56 MByte/sec. While fine-tuning of MPI-StarT is ongoing, we demonstrate that MPI-StarT is effective in enabling the speedup of MITMatlab on a cluster of SMPs by reporting on the performance of some representative numerical operations.",
note = "Massachusetts Institute of Technology",
}

@TechReport{ncstrl.cornell.tc//95-228,
type = "Technical Report",
number = "95-228",
title = "{ARCH}, An Object-Oriented Library for Asynchronous and Loosely Synchronous System Programming",
language = "English",
month = dec,
notes = "PostScript",
pages = "144",
year = "1995",
bibdate = "December 28, 1995",
author = "Jean-Marc Adamo",
abstract = "ARCH is a C++-based library for asynchronous and loosely synchronous system programming. The current version offers a set of programming constructs that are outlined below:
  $\bullet$ Threads. The construct is presented as a class from which the user can derive his own classes. The class encapsulates a small set of status variables and offers a set of functions for declaration, initialization, scheduling, priority setting, yielding and stopping.
  $\bullet$ Processes. A process is a more regular and structured programming construct whose scheduling and termination obey additional synchronization rules. Together with the synchronous point-to-point communication system offered in the library (see below), processes favor a parallel programming style similar to OCCAM's (actually, an extension of it that removes most static features and allows processes to share data). The semantics of this model is well understood and will undoubtedly facilitate the development of correct large asynchronous code. The library has been designed so that the C++ compiler is able to check the static semantics of programs (complete type checking, send-recv correct matching, ...).
  $\bullet$ Synchronous communication. Threads and processes synchronize and communicate via communication channels. There are four types of communication channels for local or remote synchronization or synchronous point-to-point communication. Inter-processor channels are essentially tools for building virtual topologies. The channel classes offer functions to send to or receive from a channel and get the size of the latest received message. More specialized synchronization-communication tools can be derived from channels.
  $\bullet$ Global data and pointers. Beside threads, the library offers basic tools for developing distributed data abstractions. Global data are data that can be defined at given locations in the distributed memory but are visible from all processors. Global pointers are a generalization of C++ pointers that allow for addressing global data at any place over the distributed memory. As usual pointers, global pointers are subjected to arithmetic and logic manipulations (incrementation, dereferencing, indexing, comparison...). The library provides basic operators for global data and pointer definition.
  $\bullet$ Global read/write functions. Global pointer expressions provide global references over the distributed memory that can subsequently be used as arguments to global read/write functions. These functions allow the processors to get access to all global data regardless of their locations over the distributed memory. In their most complete form, the read/write functions operate as remote procedure calls. At the programmer's level, global read/write functions appear as {"}one-sided{"}: a read/write operation is executed on the processor that needs to read/write global data but need not be explicitly handled by the processor associated to the memory holding the data.
  $\bullet$ Spread and remote Arrays. Two basic distributed data structures have been built in the library. Spread arrays are arrays that have some of their dimensions spread over the distributed memory according to a given policy. Remote arrays are arrays that are defined at a given place in the distributed memory but can be accessed from any other. The spread and remote array classes (SpreadArray and RemoteArray) provide functions for global reference calculation. Global references can subsequently be used as arguments to global read/write functions. One can specialize global pointers to operate on spread or remote arrays. The global pointer class (Star class) offers distinct arithmetic and logic operator sets for unassigned, spread and remote global pointers.
The library encourages parallel code writing in a style that relies on the object-oriented approach: first, build the abstractions that the application at hand relies on; next, make an efficient implementation of the abstraction; and finally, develop the application on top of them. The abstractions can be distributed data types derived from those built in the library (spread and remote arrays: see code of the segmentation algorithm provided with the library) or new distributed types built in the same way or types reused from other applications. This approach should favor parallel code production with many desirable properties such as efficiency, portability, reusability, ... .

The library uses MPI as a communication interface. The current implementation runs on the IBM-SP2. Two versions of the library have currently been released. The first one is based on the IBM C++ compiler and MPI library. The second one makes use of the GNU g++ compiler and the MPICH public domain version of MPI. Porting the latter to any parallel machine supporting these two software systems should be straightforward.",

institution = "Cornell Theory Center",
}

@InProceedings{ThakurGroLus99,
  author       = {Rajeev Thakur and William Gropp and Ewing Lusk},
  title        = {Data Sieving and Collective {I/O} in {ROMIO}},
  booktitle    = {Proceedings of Frontiers '99: The 7th Symposium on the Frontiers of Massively Parallel Computation},
  organization = {IEEE Computer Society},
  address      = {Annapolis, Maryland},
  month        = feb # " 21--25,",
  year         = {1999},
  pages        = {182--189},
}

@InProceedings{Thakur98,
  author    = {Rajeev S. Thakur and William Gropp and Ewing Lusk},
  title     = {A Case for Using {MPI}'s Derived Datatypes to Improve {I}/{O} Performance},
  booktitle = {Proceedings of Supercomputing'98 (CD-ROM)},
  publisher = {ACM SIGARCH and IEEE},
  address   = {Orlando, FL},
  year      = {1998},
  month     = nov,
  note      = {Argonne National Laboratory},
  keywords  = {parallel I/O, MPI-IO,},
  abstract  = {MPI-IO, the I/O part of the MPI-2 standard, is a promising new interface for parallel I/O. A key feature of MPI-IO is that it allows users to access several noncontiguous pieces of data from a file with a single I/O function call by defining file views with derived datatypes. We explain how critical this feature is for high performance, why users must create and use derived datatypes whenever possible, and how it enables implementations to perform optimizations. In particular, we describe two optimizations our MPI-IO implementation, ROMIO, performs: data sieving and collective I/O. We demonstrate the performance and portability of the approach with performance results on five different parallel machines: HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, and SGI Origin2000.},
}

@InProceedings{thakur:mpi-io-implement,
author = "Rajeev Thakur and William Gropp and Ewing Lusk",
title = "On Implementing {MPI-IO} Portably and with High Performance",
booktitle = "Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems",
year = "1999",
month = may,
pages = "23--32",
earlier = "thakur:mpi-io-implement-tr",
url = "http://www.mcs.anl.gov/~thakur/papers/mpio-impl.ps",
keywords = "parallel I/O, multiprocessor file system interface, pario-bib",
abstract = "We discuss the issues involved in implementing MPI-IO portably on multiple machines and file systems and also achieving high performance. One way to implement MPI-IO portably is to implement it on top of the basic Unix I/O functions ({\tt open}, {\tt lseek}, {\tt read}, {\tt write}, and {\tt close}), which are themselves portable. We argue that this approach has limitations in both functionality and performance. We instead advocate an implementation approach that combines a large portion of portable code and a small portion of code that is optimized separately for different machines and file systems. We have used such an approach to develop a high-performance, portable MPI-IO implementation, called ROMIO. \par In addition to basic I/O functionality, we consider the issues of supporting other MPI-IO features, such as 64-bit file sizes, noncontiguous accesses, collective I/O, asynchronous I/O, consistency and atomicity semantics, user-supplied hints, shared file pointers, portable data representation, and file preallocation. We describe how we implemented each of these features on various machines and file systems. The machines we consider are the HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, SGI Origin2000, and networks of workstations; and the file systems we consider are HP HFS, IBM PIOFS, Intel PFS, NEC SFS, SGI XFS, NFS, and any general Unix file system (UFS). \par We also present our thoughts on how a file system can be designed to better support MPI-IO. We provide a list of features desired from a file system that would help in implementing MPI-IO correctly and with high performance.",
}

@TechReport{ercim.inria.publications//RR-3461,
pages = "36 p.",
type = "Technical Report",
number = "RR-3461",
institution = "Inria, Institut National de Recherche en Informatique et en Automatique",
title = "Application Interfaces to {BPFS}: a Basic Parallel File System",
year = "1998",
bibdate = "July 1, 1998",
author = "Robert D. Russell",
language = "A",
abstract = "Ce rapport d{\'e}crit trois interfaces de programmation de BPFS, un syst{\`e}me de fichiers distribu{\'e} modulaire con{\c{c}}u pour des grappes de stations de travail. Ces interfaces se nomment respectivement API0, CLI et MPI-IO. API0 est la premi{\`e}re d'une s{\'e}rie d'interfaces d'acc{\`e}s {\`a} BPFS de bas niveau. Cette interface est originale {\`a} plusieurs titres : elle n'ob{\'e}it pas {\`a} la philosophie classique des fichiers sous UNIX, elle op{\`e}re en mode bloc et non en mode caract{\`e}re, elle permet la lecture/{\'e}criture de tampons ``syst{\`e}mes'' et de donn{\'e}es utilisateurs et enfin elle est asynchrone. De plus, des op{\'e}rations de flux de donn{\'e}e periodique ansi que le param{\'e}trage des tampons du cot{\'e} serveurs par les clients sont disponibles. Bien que l'interface API0 puisse {\^e}tre utilis{\'e}e directement par n'importe quelle application, deux interfaces de niveau sup{\'e}rieur ont {\'e}t{\'e} d{\'e}finies pour une utilisation plus ais{\'e}e. CLI est une interface s'appuyant sur API0 qui fournit les primitives standards d'entr{\'e}e/sortie de la ``libc''. Ces primitives acc{\`e}dent aux fichiers parall{\`e}les g{\'e}r{\'e}s par BPFS et non aux fichiers s{\'e}quentiels UNIX traditionnels. La troiseme interface est une version de l'interface ROMIO (elle m{\^e}me sous-ensemble de l'interface standard MPI-IO) implant{\'e}e au-dessus d'API0. Cette interface permet donc aux applications d{\'e}velopp{\'e}es au-dessus de MPI de s'executer sans modification au-dessus de BPFS. This report describes three application program interfaces to BPFS, a distributed, modular parallel file system designed for use on clusters of workstations. These interfaces are called API0, CLI, and MPI-IO. API0 is the first of an anticipated series of low-level, experimental client interfaces to BPFS. It is an ``unconventional'' interface in many respects: it is not particularly ``UNIX-like'', it is block-oriented rather than byte-oriented, it reads and writes system buffers as well as user-defined data areas, and it is asynchronous. 
It also provides time-regulated ``data streaming'' operations and user-level control of both server-side caching and per-file striping onto disks. Although API0 can be used directly from a user application program, it can also be used ``under'' a more conventional interface, as has been done for the next two interfaces. CLI is a ``C Library Interface'' implemented on top of API0 that exactly mimics the Standard C I/O library interface, but accesses parallel files stored by BPFS rather than sequential files stored by the host file system. The third interface is the ROMIO version of the standard MPI-IO interface which has been implemented on top of API0 to support access to BPFS files from parallel programs that use the Message Passing Interface (MPI).",
}


@InProceedings{schmuck02:IO-GPFS,
author = {Frank Schmuck and Roger Haskin},
title = {{GPFS}: A Shared-Disk File System for Large Computing Clusters},
booktitle = {First Usenix Conference on File and Storage Technologies (FAST)},
year = 2002,
month = JAN,
annote = {Meeting held in Monterey, CA, January 28--30}
}


@Article{adve02:mpi-impl,
author = {V. S. Adve and R. Bagrodia and E. Deelman and R. Sakellariou},
title = {Compiler-optimized simulation of large-scale applications on high performance architectures},
journal = {Journal of Parallel and Distributed Computing},
year = 2002,
volume = 62,
number = 3,
pages = {393--426},
month = MAR,
abstract = {In this paper, we propose and evaluate practical, automatic techniques that exploit compiler analysis to facilitate simulation of very large message-passing systems. We use compiler techniques and a compiler-synthesized static task graph model to identify the subset of the computations whose values have no significant effect on the performance of the program, and to generate symbolic estimates of the execution times of these computations. For programs with regular computation and communication patterns, this information allows us to avoid executing or simulating large portions of the computational code during the simulation. It also allows us to avoid performing some of the message data transfers, while still simulating the message performance in detail. We have used these techniques to integrate the MPI-Sim parallel simulator at UCLA with the Rice dHPF compiler infrastructure. We evaluate the accuracy and benefits of these techniques for three standard message-passing benchmarks on a wide range of problem and system sizes. The optimized simulator has errors of less than 16\% compared with direct program measurement in all the cases we studied, and typically much smaller errors. Furthermore, it requires factors of 5 to 2000 less memory and up to a factor of 10 less time to execute than the original simulator. These dramatic savings allow us to simulate regular message-passing programs on systems and problem sizes 10 to 100 times larger than is possible with the original simulator, or other current state-of-the-art simulators.}
}


@Article{tana02:mpi-app,
author = {H. Tanaka and M. Takata and E. Nishibori and K. Kato and T. Iishi and M. Sakata},
title = {{ENIGMA}: maximum-entropy method program package for huge systems},
journal = {Journal of Applied Crystallography},
year = 2002,
volume = 35,
pages = {282--286},
month = APR,
abstract = {ENIGMA (Electron and Nuclear Image Generator by Max-ent Analysis) is a program package to evaluate three-dimensional electron and nuclear density from X-ray and neutron diffraction data by using the maximum-entropy method (MEM). Compared with the previous program package MEED, ENIGMA saves computing time and frees memory space at the same time by employing parallel data processing. The fast Fourier transformation (FFT) technique is also implemented. As a consequence of these improvements, the MEM analysis by ENIGMA becomes applicable to huge systems, such as proteins and polymers, when the phased structure factors are provided. The package is transferable to a wide variety of parallel computers, because it is written in Fortran 90 and a standard message-passing interface (MPI).}
}

@Article{bosi02:mpi-impl,
author = {G. Bosilca and G. Fedak and F. Cappello},
title = {{OVM}: Out-of-order execution parallel virtual machine},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 4,
pages = {525--537},
month = MAR,
abstract = {High performance computing on parallel architectures currently uses different approaches depending on the hardware memory model of the architecture, the abstraction level of the programming environment and the nature of the application. In this article, we introduce an original client-server execution model based on RPCs called out-of-order parallel virtual machine (OVM). OVM aims to provide three main features: portability through a unique memory model, load-balancing using a plug-in support and high performance provided by several optimizations. The main optimizations are: non-blocking RPCs, data-flow management, persistent and non-persistent data, static data set distribution, dynamic scheduling and asynchronous global operations. We present OVM general architecture and demonstrate high performance for regular parallel applications, a parallel application with load balancing needs and a parallel application with real-time constraints. We firstly compare the performance of OVM and MPI for three kernels of the NAS 2.3. Then we illustrate the performance capability of OVM for a large real-life application that needs a load balancing support called AIRES. Finally, we present the performance of a real-time version of the PovRay ray-tracer demonstrating the reactiveness of OVM.}
}

@Article{lars02:mpi-app,
author = {D. J. Larson and J. S. Nasstrom},
title = {Shared- and distributed-memory parallelization of a {L}agrangian atmospheric dispersion model},
journal = {Atmospheric Environment},
year = 2002,
volume = 36,
number = 9,
pages = {1559--1564},
month = MAR,
abstract = {This paper describes parallelization of a 3-D Lagrangian stochastic atmospheric dispersion model using both distributed- and shared-memory methods. Shared-memory parallelism is implemented through the use of OpenMP compiler directives. Distributed-memory parallelism relies on the MPI message-passing library. One or both (using MPI for inter-node and OpenMP for intra-node communication) of the parallel modes can be used depending upon the requirements of the problem and the computational platform available. The distributed-memory version achieves a nearly linear decrease in execution time as the number of processors is increased. As the number of particles per processor is lowered, performance is limited by the decrease in work per processor and by the need to produce one set of output files. The shared-memory version achieves a speedup factor of approximately 1.4 running on machines with four processors.}
}

@article{choe02:mpi-app,
  author   = {S. Choe and S. Muroya and A. Nakamura and C. Nonaka and T. Saito and R. Shoji},
  title    = {Lattice tool kit in {Fortran90}},
  journal  = {Nuclear Physics B-Proceedings Supplements},
  volume   = {106},
  pages    = {1037--1039},
  month    = mar,
  year     = {2002},
  abstract = {We report a project to provide a set of free source codes for lattice QCD. The programs may be used as fundamental blocks when one wants to construct his/her own QCD codes. They are written in Fortran 90 with use of MODULE, so that algorithms can transparently be seen. MPI is used for parallelization. We are also constructing a proto-type of QCD-GRID where one can try to run the code.},
}

@Article{bach02:mpi-app,
author = {F. W. Bach and H. Haferkamp and A. Kuhlmeyer and M. Niemeyer},
title = {{Monte-Carlo} simulation of dislocation networks using the message passing interface},
journal = {Modelling and Simulation in Materials Science and Engineering},
year = 2002,
volume = 10,
number = 2,
pages = {215--225},
month = MAR,
abstract = {The dynamic behaviour of one-dimensional lattice defects in metallic materials is responsible for most of the metals' mechanical properties. This behaviour is determined by all the interaction mechanisms of dislocations with other lattice defects such as further dislocations, precipitations and grain boundaries. A two-dimensional simulation model implementing these mechanisms based on a Monte-Carlo method for an N-body dislocation system is presented. The influences of temperature and external stress can also be taken into account. In this model, the different defects will be reduced to a polymorphic structure according to the object-oriented paradigm. To improve computing performance, the model was parallelized for multiprocessor machines in collaboration with the Edinburgh Parallel Computing Centre (EPCC) under the European Union TRACS programme (Training and Research on Advanced Computing Systems). Results are discussed under consideration of different parameters.}
}

@Article{bark02:mpi-app,
author = {K. Barker and N. Chrisochoides and J. Dobbelaere and D. Nave and K. Pingali},
title = {Data Movement and Control Substrate for parallel adaptive applications},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = 2,
pages = {77--101},
month = FEB,
abstract = {In this paper, we present the Data Movement and Control Substrate (DMCS), a library which implements low-latency one-sided communication primitives for use in parallel adaptive and irregular applications. DMCS is built on top of low-level, vendor-specific communication subsystems such as LAPI (Low-level Application Programme Interface) for IBM SP machines, as well as on widely available message-passing libraries like MPI for clusters of workstations and PCs. DMCS adds a small overhead to the communication operations provided by the lower communication system. In return, DMCS provides a flexible and easy to understand application program interface for one-sided communication operations. Furthermore, DMCS is designed so that it can be easily ported and maintained by non-experts.}
}

@Article{lei02:mpi-app,
author = {W. Lei and H. C. Yin and B. P. Wang and L. S. Tong},
title = {Optimization of a particle optical system in a multiprocessor environment},
journal = {Nuclear Instruments \& Methods in Physics Research Section A-Accelerators Spectrometers Detectors and Associated Equipment},
year = 2002,
volume = 479,
number = {2--3},
pages = {611--617},
month = MAR,
abstract = {In the design of a charged particle optical system, many geometrical and electric parameters have to be optimized to improve the performance characteristics. In every optimization cycle, the electromagnetic field and particle trajectories have to be calculated. Therefore, the optimization of a charged particle optical system is limited by the computer resources seriously. Apart from this, numerical errors of calculation may also influence the convergence of merit function. This article studies how to improve the optimization of charged particle optical systems. A new method is used to determine the gradient matrix. With this method, the accuracy of the Jacobian matrix can be improved. In this paper, the charged particle optical system is optimized with a Message Passing Interface (MPI). The electromagnetic field, particle trajectories and gradients of optimization variables are calculated on networks of workstations. Therefore, the speed of optimization has been increased largely. It is possible to design a complicated charged particle optical system with optimum quality on a MPI environment. Finally, an electron gun for a cathode ray tube has been optimized on a MPI environment to verify the method proposed in this paper.}
}

@Article{khas02:mpi-app,
author = {S. A. Khashan and D. O. Ogbe and T. M. Jiang},
title = {Development and optimization of parallel code for large-scale petroleum reservoir simulation},
journal = {Journal of Canadian Petroleum Technology},
year = 2002,
volume = 41,
number = 4,
pages = {33--38},
month = APR,
abstract = {This paper discusses the use of large field-scale reservoir simulation to model multiphase fluid flow processes that occur in giant oil reservoirs. The goal of large-scale studies is to model fluid flow with sufficient details to account for reservoir heterogeneity, various reservoir-wellbore configurations, and complex fluid-rock interactions. In this paper, we developed a black-oil reservoir simulator for distributed-memory parallel environment. We ported serial code of a black-oil model to the CRAY T3E and IBM SP2 systems. We analysed the code and benchmarked the performance. To parallelize the code, we used a domain decomposition algorithm, whereby the reservoir is divided into several subdomains, with each subdomain assigned to a separate processor element (PE). The message-passing interface (MPI) is used to exchange information across subdomains. We validated the parallel simulator using data from the Society of Petroleum Engineers comparative solution projects. Because the linear equation solver accounts for over 90\% of the CPU time in a typical reservoir simulation run, we evaluated the performance of several parallel algorithms in the project, including LSOR, Gauss-Seidel (GS) and strongly implicit procedure (SIP). We found that the convergence behaviour of the solver depends on the number of processors and on the permeability anisotropy of the reservoir. For the problems we tested, the SIP algorithms provided the best performance. We compared the computational efficiency of the parallel code against the serial code using models containing up to 350,000 grid blocks in 4-, 8-, 16-, 32-, 64-, and 80-PE environments. The paper discusses programming and computational performance issues in large-scale reservoir simulation for parallel systems.}
}


@Article{alia02a:mpi-app,
author = {S. Aliabadi and A. Johnson and B. Zellars and A. Abatan and C. Berger},
title = {Parallel simulation of flows in open channels},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 5,
pages = {627--637},
month = APR,
abstract = {In this project, we apply our advanced free-surface flow solver to simulate flow in open channels at supercritical conditions. The finite element method is used to discretize the governing equations over fixed meshes. The finite element formulations have been implemented in parallel using message passing interface (MPI) libraries. Linear speed up performance is achieved. The computations are carried out for a case study involving flow in contraction channel at supercritical condition. The numerical results compare very well with experimental data.}
}


@Article{berz02:mpi-app,
author = {P. K. Berzigiyarov and V. A. Zayets and I. Y. Ginzburg and V. F. Razumov and E. F. Sheka},
title = {{NANOPACK}: Parallel codes for semiempirical quantum chemical calculations of large systems in the sp- and spd-basis},
journal = {International Journal of Quantum Chemistry},
year = 2002,
volume = 88,
number = 4,
pages = {449--462},
month = JUN,
abstract = {A parallel implementation of the conventionally used NDDO (MNDO, AM1, PM3, CLUSTER-Z1) and modified NDDO-WF (CLUSTER-Z2) techniques for semiempirical quantum chemical calculations of large molecular systems in the sp- and spd-basis, respectively, is described. The atom-pair distribution of data over processors forms the basis of the parallelization. The technological aspects of designing scalable parallel calculations on supercomputers (using ScaLAPACK and MPI libraries) are discussed. The scaling of individual algorithms and the entire package was carried out for model systems with 894, 1920, and 2014 atomic orbitals. The package speed-up provided by different multiprocessor systems involving a cluster of Intel PIII processors, Alpha-21264-processor-built machine MBC-1000M, and Cray-T3E is analyzed. The effect of computer characteristics on the package performance is discussed.}
}

@Article{gosw02:mpi-use,
author = {D. Goswami and A. Singh and B. R. Preiss},
title = {From design patterns to parallel architectural skeletons},
journal = {Journal of Parallel and Distributed Computing},
year = 2002,
volume = 62,
number = 4,
pages = {669--695},
month = APR,
abstract = {The concept of design patterns has been extensively studied and applied in the context of object-oriented software design. Similar ideas are being explored in other areas of computing as well. Over the past several years, researchers have been experimenting with the feasibility of employing design-patterns related concepts in the parallel computing domain. In the past, several pattern-based systems have been developed with the intention to facilitate faster parallel application development through the use of preimplemented and reusable components that are based on frequently used parallel computing design patterns. However, most of these systems face several serious limitations such as limited flexibility, zero extensibility, and the ad hoc nature of their components. Lack of flexibility in a parallel programming system limits a programmer to using only the high-level components provided by the system. Lack of extensibility here refers to the fact that most of the existing pattern-based parallel programming systems come with a set of prebuilt patterns integrated into the system. However, the system provides no obvious way of increasing the repertoire of patterns when need arises. Also, most of these systems do not offer any generic view of a parallel computing pattern, a fact which may be at the root of several of their shortcomings. This research proposes a generic (i.e., pattern- and application-independent) model for realizing and using parallel design patterns. The term ``parallel architectural skeleton'' is used to represent the set of generic attributes associated with a pattern. The Parallel Architectural Skeleton Model (PASM) is based on the message-passing paradigm, which makes it suitable for a LAN of workstations and PCs. The model is flexible as it allows the intermixing of high-level patterns with low-level message-passing primitives.
An object-oriented and library-based implementation of the model has been completed using C++ and MPI, without necessitating any language extension. The generic model and the library-based implementation allow new patterns to be defined and included into the system. The skeleton-library serves as a framework for the systematic, hierarchical development of network-oriented parallel applications.}
}


@Article{anku02:mpi-app,
author = {A. L. Ankudinov and C. E. Bouldin and J. J. Rehr and J. Sims and H. Hung},
title = {Parallel calculation of electron multiple scattering using {L}anczos algorithms},
journal = {Physical Review B},
year = 2002,
volume = 65,
number = 10,
pages = {104107},
month = MAR,
abstract = {Real space multiple scattering calculations of the electronic density of states and x-ray spectra in solids typically scale as the cube of the system and basis set size, and hence are highly demanding computationally. For example, such x-ray absorption near edge structure (XANES) calculations typically require clusters of order $N_R$ atoms and s, p, and d states for convergence, with $N_R$ between about $10^2$--$10^3$; for this case about $10^2$ inversions of $9N_R \times 9N_R$ matrices are needed, one for each energy point. We discuss here two ways to speed up these calculations: (1) message passing interface (MPI) parallel processing and (2) fast, Lanczos multiple scattering algorithms. Together these algorithms can reduce computation times typically by two orders of magnitude. These are both implemented in a generalization of the ab initio self-consistent FEFF8 code, which thus makes practical XANES calculations in complex systems with of order $10^3$ atoms. The Lanczos algorithm also yields a natural crossover between full and finite-order multiple scattering with increasing energy, thus differentiating the extended and near-edge regimes.}
}

@Article{gall02:mpi-app,
author = {J. A. Gallud and J. Garcia-Consuegra and A. Martinez},
title = {Distributed processing of remotely sensed {Landsat-TM} imagery using {MPI}},
journal = {Cluster Computing},
year = 2002,
pages = {15--22}
}

@Article{alia02:mpi-app,
author = {S. Aliabadi and A. Abatan and A. Johnson and J. Abedi and Y. Yeboah and K. Bota},
title = {Stabilized finite element formulation of buoyancy driven incompressible flows},
journal = {Communications in Numerical Methods in Engineering},
year = 2002,
volume = 18,
number = 5,
pages = {315--324},
month = MAY,
abstract = {Streamline-upwind/Petrov-Galerkin finite element method is developed for buoyancy-driven incompressible flows with heat and mass transfer. The stabilized finite element formulations are implemented in parallel using message passing interface libraries. To measure the accuracy of the method, we solve a 2D numerical example of natural convection flows at moderate to high Rayleigh numbers. The 3D applications include the dispersion of smoke from a chimney and within a stadium.}
}

@Article{mo02:mpi-app,
author = {Z. Y. Mo and J. L. Zhang and Q. D. Cai},
title = {Dynamic load balancing for short-range parallel molecular dynamics simulations},
journal = {International Journal of Computer Mathematics},
year = 2002,
volume = 79,
number = 2,
pages = {165--177},
month = FEB,
abstract = {The iterative Multilevel Averaging Weight (MAW) algorithm presented in paper [1] is modified to solve the dynamic load imbalance problems arising from the two-dimensional short-range parallel molecular dynamics simulations in this paper. Firstly, five types of load balancing models are given which allows detailed studies of the algorithm. In particular, it shows that for strip decomposition, the number of iteration needs for the system to converge from an initially unbalanced state to a well balanced state is bounded by $2 \log P$, where P is the number of processors. This result can permit the algorithm to efficiently track fluctuations in the molecular density as the simulation progresses, and is much better than that of the Cellular Automaton Diffusion (CAD) scheme presented in paper [2]. Secondly, we apply MAW algorithm to solve the load imbalance problem in the parallel molecular dynamics simulation for higher speed wall collisions. At last, the numerical experimental results and parallel computing performance with MPI-1.2 under a PC-Cluster consists of 64 Pentium-III 500 MHz nodes connected by 100 Mbps switches are given in this paper.}
}


@Article{deit02:parallel-lang,
author = {S. J. Deitz and B. L. Chamberlain and L. Snyder},
title = {High-level language support for user-defined reductions},
journal = {Journal of Supercomputing},
year = 2002,
volume = 23,
number = 1,
pages = {23--37},
month = AUG,
abstract = {The optimized handling of reductions on parallel supercomputers or clusters of workstations is critical to high performance because reductions are common in scientific codes and a potential source of bottlenecks. Yet in many high-level languages, a mechanism for writing efficient reductions remains surprisingly absent. Further, when such mechanisms do exist, they often do not provide the flexibility a programmer needs to achieve a desirable level of performance. In this paper, we present a new language construct for arbitrary reductions that lets a programmer achieve a level of performance equal to that achievable with the highly flexible, but low-level combination of Fortran and MPI. We have implemented this construct in the ZPL language and evaluate it in the context of the initialization of the NAS MG benchmark. We show a 45 times speedup over the same code written in ZPL without this construct. In addition, performance on a large number of processors surpasses that achieved in the NAS implementation showing that our mechanism provides programmers with the needed flexibility.}
}


@Article{mohr02:mpi-openmp,
author = {B. Mohr and A. D. Malony and S. Shende and F. Wolf},
title = {Design and prototype of a performance tool interface for {OpenMP}},
journal = {Journal of Supercomputing},
year = 2002,
volume = 23,
number = 1,
pages = {105--128},
month = AUG,
abstract = {This paper proposes a performance tools interface for OpenMP, similar in spirit to the MPI profiling interface in its intent to define a clear and portable API that makes OpenMP execution events visible to runtime performance tools. We present our design using a source-level instrumentation approach based on OpenMP directive rewriting. Rules to instrument each directive and their combination are applied to generate calls to the interface consistent with directive semantics and to pass context information (e.g., source code locations) in a portable and efficient way. Our proposed OpenMP performance API further allows user functions and arbitrary code regions to be marked and performance measurement to be controlled using new OpenMP directives. To prototype the proposed OpenMP performance interface, we have developed compatible performance libraries for the Expert automatic event trace analyzer [17, 18] and the TAU performance analysis framework [13]. The directive instrumentation transformations we define are implemented in a source-to-source translation tool called OPARI. Application examples are presented for both Expert and TAU to show the OpenMP performance interface and OPARI instrumentation tool in operation. When used together with the MPI profiling interface (as the examples also demonstrate), our proposed approach provides a portable and robust solution to performance analysis of OpenMP and mixed-mode (OpenMP+MPI) applications.}
}

@Article{nak02:mpi-app,
author = {K. Nakajima and H. Okuda},
title = {Parallel iterative solvers for unstructured grids using a directive/{MPI} hybrid programming model for the {GeoFEM} platform on {SMP} cluster},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = {6--7},
pages = {411--429},
month = may # "--" # jun,
abstract = {In this paper, an efficient parallel iterative method for unstructured grids developed by the authors for shared memory symmetric multiprocessor (SMP) cluster architectures on the GeoFEM platform is presented. The method is based on a three-level hybrid parallel programming model, including message passing for inter-SMP node communication, loop directives for intra-SMP node parallelization and vectorization for each processing element (PE). Simple 3D elastic linear problems with more than $10^8$ degrees of freedom have been solved by 3 x 3 block ICCG(0) with additive Schwarz domain decomposition and PDJDS/CM-RCM reordering on 16 SMP nodes of a Hitachi SR8000 parallel computer, achieving a performance of 20 Gflops. The PDJDS/CM-RCM reordering method provides excellent vector and parallel performance in SMP nodes, and is essential for parallelization of forward/backward substitution in IC/ILU factorization with global data dependency. The method developed was also tested on an NEC SX-4 and attained 969 Mflops (48.5\% of peak performance) using a single processor. The additive Schwarz domain decomposition method provides robustness for the GeoFEM parallel iterative solvers with localized preconditioning.}
}


@Article{tri02:mpi-app,
author = {N. Trivedi and J. Bischof and S. Davis and K. Pedretti and T. E. Scheetz and T. A. Braun and C. A. Roberts and N. L. Robinson and V. C. Sheffield and A. B. Soares and T. L. Casavant},
title = {Parallel creation of non-redundant gene indices from partial {mRNA} transcripts},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 6,
pages = {863--870},
month = MAY,
abstract = {This paper describes the UIcluster software tool, which partitions expressed sequence tag (EST) sequences and other genetic sequences into ``clusters'' based on sequence similarity. Ideally, each cluster will contain sequences that all represent the same gene. UIcluster has been developed over the course of 4 years to solve this problem efficiently and accurately for large data sets consisting of tens or hundreds of thousands of EST sequences. The latest version of the application has been parallelized using the MPI standard. Both the computation and memory requirements of the program can be distributed among multiple (possibly distributed) UNIX processes.}
}


@Article{stan02:mpi-model,
author = {N. Stankovic and K. Zhang},
title = {A distributed parallel programming framework},
journal = {IEEE Transactions on Software Engineering},
year = 2002,
volume = 28,
number = 5,
pages = {478--493},
month = MAY,
abstract = {This paper presents Visper, a novel object-oriented framework that identifies and enhances common services and programming primitives, and implements a generic set of classes applicable to multiple programming models in a distributed environment. Groups of objects, which can be programmed in a uniform and transparent manner, and agent-based distributed system management, are also featured in Visper. A prototype system is designed and implemented in Java, with a number of visual utilities that facilitate program development and portability. As a use case, Visper integrates parallel programming in an MPI-like message-passing paradigm at a high level with services such as checkpointing and fault tolerance at a lower level. The paper reports a range of performance evaluation on the prototype and compares it to related works.}
}

@Article{ong02:mpi-impl,
author = {E. Ong},
title = {{MPI} {Ruby}: Scripting in a parallel environment},
journal = {Computing in Science \& Engineering},
year = 2002,
volume = 4,
number = 4,
pages = {78--82},
month = jul # "--" # aug
}

@Article{tad02:mpi-app,
author = {M. Tadjfar and R. Himeno},
title = {Time-accurate, parallel, multi-zone, multi-block solver to study the human cardio-vascular system},
journal = {Biorheology},
year = 2002,
volume = 39,
number = {3--4},
pages = {379--384},
abstract = {A parallel, time-accurate flow solver is devised to study the human cardio-vascular system. The solver is capable of dealing with moving boundaries and moving grids. It is designed to handle complex, three-dimensional vascular systems. The computational domain is divided into multiple block subdomains. At each cross section the plane is divided into twelve sub-zones to allow flexibility for handling complex geometries and, if needed, appropriate parallel data partitioning. The unsteady, three-dimensional, incompressible Navier-Stokes equations are solved numerically. A second-order in time and third-order upwind finite volume method for solving time-accurate incompressible flows based on pseudo-compressibility and dual time-stepping technique is used. For parallel execution, the flow domain is partitioned. Communication between the subdomains of the flow on Riken's VPP/700E supercomputer is implemented using MPI message-passing library. A series of numerical simulations of biologically relevant flows is used to validate this code.}
}


@article{anon02:mpi-models,
  author  = {Anonymous},
  title   = {Message passing},
  journal = {Parallel and Distributed Computing: A Survey of Models, Paradigms and Approaches},
  pages   = {95--109},
  year    = {2002},
}


@Article{sei02:mpi-tools,
author = {F. J. Seinstra and D. Koelma},
title = {{P-3PC}: {A} point-to-point communication model for automatic and optimal decomposition of regular domain problems},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2002,
volume = 13,
number = 7,
pages = {758--768},
month = JUL,
abstract = {One of the most fundamental problems automatic parallelization tools are confronted with is to find an optimal domain decomposition for a given application. For regular domain problems (such as simple matrix manipulations), this task may seem trivial. However, communication costs in message passing programs often significantly depend on the memory layout of data blocks to be transmitted. As a consequence, straightforward domain decompositions may be nonoptimal. In this paper, we introduce a new point-to-point communication model (called P-3PC, or the ``Parameterized model based on the Three Paths of Communication'') that is specifically designed to overcome this problem. In comparison with related models (e.g., LogGP) P-3PC is similar in complexity, but more accurate in many situations. Although the model is aimed at MPI's standard point-to-point operations, it is applicable to similar message passing definitions as well. The effectiveness of the model is tested in a framework for automatic parallelization of low level image processing applications. Experiments are performed on two Beowulf-type systems, each having a different interconnection network, and a different MPI implementation. Results show that, where other models frequently fail, P-3PC correctly predicts the communication costs related to any type of domain decomposition.}
}

@Article{vit02:mpi-app,
author = {J. E. Vitela and U. R. Hanebutte and J. L. Gordillo and L. M. Cortina},
title = {Comparative performance study of parallel programming models in a neural network training code},
journal = {International Journal of Modern Physics C},
year = 2002,
volume = 13,
number = 4,
pages = {429--452},
month = MAY,
abstract = {This paper discusses the performance studies of a coarse grained parallel neural network training code for control of nonlinear dynamical systems, implemented in the shared memory and message passing parallel programming environments OpenMP and MPI, respectively. In addition, these codes are compared to an implementation utilizing SHMEM the native data passing SGI/Cray environment for parallel programming. The multiprocessor platform used in the study is a SGI/Cray Origin 2000 with up to 32 processors, which supports all these programming models efficiently. The dynamical system used in this study is a nonlinear OD model of a thermonuclear fusion reactor with the EDA-ITER design parameters. The results show that OpenMP outperforms the other two environments when large number of processors are involved, while yielding a similar or a slightly poorer behavior for small number of processors. As expected the native SGI/Cray environment outperforms MPI for the entire range of processors used. Reasons for the observed performance are given. The parallel efficiency of the code is always greater than 60\% regardless of the parallel environment for the range of processors used in this study.}
}

@Article{wan02:mpi-app,
author = {Y. Wang and A. M. Cuitino},
title = {Full-field measurements of heterogeneous deformation patterns on polymeric foams using digital image correlation},
journal = {International Journal of Solids and Structures},
year = 2002,
volume = 39,
number = {13--14},
pages = {3777--3796},
month = JUN # "--" # JUL,
abstract = {The ability of a digital image correlation technique to capture the heterogeneous deformation fields appearing during compression of ultra-light open-cell foams is presented in this article. Quantitative characterization of these fields is of importance to understand the mechanical properties of the collapse process and the energy dissipation patterns in this type of materials. The present algorithm is formulated in the context of multi-variable non-linear optimization where a merit function based on a local average of the deformation mapping is minimized implicitly. A parallel implementation utilizing message passing interface for distributed-memory architectures is also discussed. Estimates for optimal size of the correlation window based on measurement accuracy and spatial resolution are provided. This technique is employed to reveal the evolution of the deformation texture on the surface of open-cell polyurethane foam samples of different relative densities. Histograms of the evolution of surface deformation are extracted, showing the transition from unimodal to bimodal and back to unimodal. These results support the interpretation that the collapse of light open-cell foams occurs as a phase transition phenomenon.}
}

@Article{mor02:mpi-app,
author = {H. Moritsch and S. Benkner},
title = {High-performance numerical pricing methods},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = {8--9},
pages = {665--678},
month = JUL # "--" # AUG,
abstract = {The pricing of financial derivatives is an important field in finance and constitutes a major component of financial management applications. The uncertainty of future events often makes analytic approaches infeasible and, hence, time-consuming numerical simulations are required. In the Aurora Financial Management System, pricing is performed on the basis of lattice representations of stochastic multidimensional scenario processes using the Monte Carlo simulation and Backward Induction methods, the latter allowing for the exploitation of shared-memory parallelism. We present the parallelization of a Backward Induction numerical pricing kernel on a cluster of SMPs using HPF+, an extended version of High-Performance Fortran. Based on language extensions for specifying a hierarchical mapping of data onto an SMP cluster, the compiler generates a hybrid-parallel program combining distributed-memory and shared-memory parallelism. We outline the parallelization strategy adopted by the VFC compiler and present an experimental evaluation of the pricing kernel on an NEC SX-5 vector supercomputer and a Linux SMP cluster, comparing a pure MPI version to a hybrid-parallel MPI/OpenMP version.}
}

@Article{deK02:mpi-app,
author = {J. de Kloe and A. van der Steen and H. Oksuzoglu and H. Dijkstra},
title = {A fully implicit parallel ocean model using {MUMPS}},
journal = {Journal of Supercomputing},
year = 2002,
volume = 23,
number = 2,
pages = {167--183},
month = SEP,
abstract = {The formulation, implementation and performance of a new fully implicit parallel model of the ocean circulation is presented. Within this model, steady states can be traced in one of the control parameters. In addition, transient flows can be computed using relatively (compared to traditional ocean models) large time steps such that long integration times can be reached. The discretized equations of the ocean model are solved by the Newton-Raphson technique and the emerging linear systems are solved by a (MPI) version of the MUltifrontal Massively Parallel Solver. The performance of the code on an SGI Origin 2000 platform is presented here using typical results for a sector ocean flow.}
}


@Article{Oku:mpi-hpf-app,
author = {H. Okuda and N. Anan},
title = {Optimization of element-by-element {FEM} in {HPF} 1.1},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = {8--9},
pages = {647--663},
month = JUL # "--" # AUG,
abstract = {In this study, Poisson's equation is numerically evaluated by the element-by-element (EBE) finite-element method in a parallel environment using HPF 1.1 (High-Performance Fortran). In order to achieve high parallel efficiency, the data structures have been altered to node-based data instead of mixtures of node- and element-based data, representing a node-based EBE finite-element scheme (nEBE). The parallel machine used in this study was the NEC SX-4, and experiments were performed on a single node having 32 processors sharing common memory. The HPF compiler used in the experiments is HPF/SX Rev 2.0 released in 1997 (unofficial), which supports HPF 1.1. Models containing approximately 200000 and 1500000 degrees of freedom were analyzed in order to evaluate the method. The calculation time, parallel efficiency, and memory used were compared. The performance of HPF in the conjugate gradient solver for the large model, using the NEC SX-4 compiler option -noshrunk, was about 85\% that of the message passing interface.}
}

@Article{eke02:mpi-app,
author = {T. Ekevid and N. E. Wiberg},
title = {A comparison of parallel implementation of explicit {DG} and central difference method},
journal = {Communications in Numerical Methods in Engineering},
year = 2002,
volume = 18,
number = 8,
pages = {585--597},
month = AUG,
abstract = {Massive parallel computers have become more attractive for advanced numerical simulations since standard libraries for communication and synchronization; for example MPI have facilitated program development. The present paper discusses two parallel explicit time integration methods for wave propagation problems; the central difference method and the explicit version of the PI-PI discontinuous Galerkin (DG) method. Based on the MIMD model, where data decomposition is accomplished by element-based grid partitioning, parallel versions of both algorithms have been implemented, using the same paradigm for inter-process communication. Numerical examples are illustrated and the achieved performance of the algorithms is discussed.}
}

@Article{LOlik2002,
author = {L. Oliker and X. Y. Li and P. Husbands and R. Biswas},
title = {Effects of ordering strategies and programming paradigms on sparse matrix computations},
journal = {SIAM Review},
year = 2002,
volume = 44,
number = 3,
pages = {373--393},
month = SEP,
abstract = {The conjugate gradient (CG) algorithm is perhaps the best-known iterative technique for solving sparse linear systems that are symmetric and positive definite. For systems that are ill conditioned, it is often necessary to use a preconditioning technique. In this paper, we investigate the effects of various ordering and partitioning strategies on the performance of parallel CG and ILU(0) preconditioned CG (PCG) using different programming paradigms and architectures. Results show that for this class of applications, ordering significantly improves overall performance on both distributed and distributed shared-memory systems, cache reuse may be more important than reducing communication, it is possible to achieve message-passing performance using shared-memory constructs through careful data ordering and distribution, and a hybrid MPI+OpenMP paradigm increases programming complexity with little performance gain. A multithreaded implementation of CG on the Cray MTA does not require special ordering or partitioning to obtain high efficiency and scalability, giving it a distinct advantage for adaptive applications; however, it shows limited scalability for PCG due to a lack of thread-level parallelism.}
}
@Article{SEMin2002,
author = {S. E. Minkoff},
title = {Spatial parallelism of a 3{D} finite difference velocity-stress elastic wave propagation code},
journal = {SIAM Journal on Scientific Computing},
year = 2002,
volume = 24,
number = 1,
pages = {1--19},
month = AUG,
abstract = {In three-dimensional isotropic elastic earth, the wave equation solution consists of three velocity components and six stresses. We discretize the partial derivatives using second order in time and fourth order in space staggered finite difference operators. The parallel implementation uses the message passing interface library for platform portability and spatial decomposition for efficiency. Most of the communication in the code consists of passing subdomain face information to neighboring processors. When the parallel communication is balanced against computation by allocating subdomains of reasonable size, we observe excellent scaled speedup. Allocating subdomains of size 25 x 25 x 25 on each node, we achieve efficiencies of 94\% on 128 processors of an Intel Paragon.}
}
@Article{JKlei2002,
author = {J. Kleinjung and N. Douglas and J. Heringa},
title = {Parallelized multiple alignment},
journal = {Bioinformatics},
year = 2002,
volume = 18,
number = 9,
pages = {1270--1271},
month = SEP,
abstract = {Multiple sequence alignment is a frequently used technique for analyzing sequence relationships. Compilation of large alignments is computationally expensive, but processing time can be considerably reduced when the computational load is distributed over many processors. Parallel processing functionality in the form of single-instruction multiple-data (SIMD) technology was implemented into the multiple alignment program Praline by using 'message passing interface' (MPI) routines. Over the alignments tested here, the parallelized program performed up to ten times faster on 25 processors compared to the single processor version.}
}
@Article{AAfsa2002,
author = {A. Afsahi and N. J. Dimopoulos},
title = {Efficient communication using message prediction for clusters of multiprocessors},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = 10,
pages = {859--883},
month = AUG,
abstract = {With the increasing uniprocessor and symmetric multiprocessor computational power available today, interprocessor communication has become an important factor that limits the performance of clusters of workstations/multiprocessors. Many factors including communication hardware overhead, communication software overhead, and the user environment overhead (multithreading, multiuser) affect the performance of the communication subsystems in such systems. A significant portion of the software communication overhead belongs to a number of message copying operations. Ideally, it is desirable to have a true zero-copy protocol where the message is moved directly from the send buffer in its user space to the receive buffer in the destination without any intermediate buffering. However, due to the fact that message-passing applications at the send side do not know the final receive buffer addresses, early arrival messages have to be buffered at a temporary area. In this paper, we show that there is a message reception communication locality in message-passing applications. We have utilized this communication locality and devised different message predictors at the receiver sides of communications. In essence, these message predictors can be efficiently used to drain the network and cache the incoming messages even if the corresponding receive calls have not yet been posted. The performance of these predictors, in terms of hit ratio, on some parallel applications are quite promising and suggest that prediction has the potential to eliminate most of the remaining message copies. We also show that the proposed predictors do not have sensitivity to the starting message reception call, and that they perform better than (or at least equal to) our previously proposed predictors.}
}
@Article{TBohl2002,
author = {T. Bohlen},
title = {Parallel 3-{D} viscoelastic finite difference seismic modelling},
journal = {Computers \& Geosciences},
year = 2002,
volume = 28,
number = 8,
pages = {887--899},
month = OCT,
abstract = {Computational power has advanced to a state where we can begin to perform wavefield simulations for realistic (complex) 3-D earth models at frequencies of interest to both seismologists and engineers. On serial platforms however, 3-D calculations are still limited to small grid sizes and short seismic wave traveltimes. To make use of the efficiency of network computers a parallel 3-D viscoelastic finite difference (FD) code is implemented which allows to distribute the work on several PCs or workstations connected via standard ethernet in an in-house network. By using the portable message passing interface standard (MPI) for the communication between processors, running times can be reduced and grid sizes can be increased significantly. Furthermore, the code shows good performance on massive parallel supercomputers which makes the computation of very large grids feasible. This implementation greatly expands the applicability of the 3-D elastic/viscoelastic finite-difference modelling technique by providing an efficient, portable and practical C-program. }
}
@Article{GRLue2002,
author = {G. R. Luecke and Y. Zou and J. Coyle and J. Hoekstra and M. Kraeva},
title = {Deadlock detection in {MPI} programs},
journal = {Concurrency and Computation-Practice \& Experience},
year = 2002,
volume = 14,
number = 11,
pages = {911--932},
month = SEP,
abstract = {The Message-Passing Interface (MPI) is commonly used to write parallel programs for distributed memory parallel computers. MPI-CHECK is a tool developed to aid in the debugging of MPI programs that are written in free or fixed format Fortran 90 and Fortran 77. This paper presents the methods used in MPI-CHECK 2.0 to detect many situations where actual and potential deadlocks occur when using blocking and non-blocking point-to-point routines as well as when using collective routines.}
}
@Article{ARMRa2002,
author = {A. R. M. Rao},
title = {A parallel mixed time integration algorithm for nonlinear dynamic analysis},
journal = {Advances in Engineering Software},
year = 2002,
volume = 33,
number = 5,
pages = {261--271},
month = MAY,
abstract = {This paper presents a parallel mixed time integration algorithm formulated by synthesising the implicit and explicit time integration techniques. The proposed algorithm is an extension of the mixed time integration algorithms [Comput. Meth. Appl. Mech. Engng 17/18 (1979) 259; Int. J. Numer. Meth. Engng 12 (1978) 1575] being successfully employed for solving media-structure interaction problems. The parallel algorithm for nonlinear dynamic response of structures employing mixed time integration technique has been devised within the broad framework of domain decomposition. Concurrency is introduced into this algorithm, by integrating interface nodes with explicit time integration technique and later solving the local submeshes with implicit algorithm. A flexible parallel data structure has been devised to implement the parallel mixed time integration algorithm. Parallel finite element code has been developed using portable Message Passing Interface software development environment. Numerical studies have been conducted on PARAM-10000 (Indian parallel supercomputer) to test the accuracy and also the performance of the proposed algorithm. Numerical studies indicate that the proposed algorithm is highly adaptive for parallel processing.}
}
@Article{PMieh2002,
author = {P. Miehe and A. Sandu and G. R. Carmichael and Y. H. Tang and D. Daescu},
title = {A communication library for the parallelization of air quality models on structured grids},
journal = {Atmospheric Environment},
year = 2002,
volume = 36,
number = 24,
pages = {3917--3930},
month = AUG,
abstract = {PAQMSG is an MPI-based, Fortran 90 communication library for the parallelization of air quality models (AQMs) on structured grids. It consists of distribution, gathering and repartitioning routines for different domain decompositions implementing a master-worker strategy. The library is architecture and application independent and includes optimization strategies for different architectures. This paper presents the library from a user perspective. Results are shown from the parallelization of STEM-III on Beowulf clusters. The PAQMSG library is available on the web. The communication routines are easy to use, and should allow for an immediate parallelization of existing AQMs. PAQMSG can also be used for constructing new models.}
}
@Article{YLian2002,
author = {Y. Li and S. M. Sze and T. S. Chao},
title = {A practical implementation of parallel dynamic load balancing for adaptive computing in {VLSI} device simulation},
journal = {Engineering with Computers},
year = 2002,
volume = 18,
number = 2,
pages = {124--137},
abstract = {We present a new parallel semiconductor device simulation using the dynamic load balancing approach. This semiconductor device simulation based on the adaptive finite volume method with a posteriori error estimation has been developed and successfully implemented on a 16-PC Linux cluster with a message passing interface library. A constructive monotone iterative technique is also applied for solution of the system of nonlinear algebraic equations. Two different parallel versions of the algorithm to perform a complete device simulation are proposed. The first is a dynamic parallel domain decomposition approach, and the second is a parallel current-voltage characteristic points simulation. This implementation shows that a well-designed load balancing simulation can significantly reduce the execution time up to an order of magnitude. Compared with the measured data, numerical results on various submicron VLSI devices are presented, to show the accuracy and efficiency of the method.}
}
@Article{WHLiu2002,
author = {W. H. Liu and C. L. Wang and V. K. Prasanna},
title = {Portable and scalable algorithm for irregular all-to-all communication},
journal = {Journal of Parallel and Distributed Computing},
year = 2002,
volume = 62,
number = 10,
pages = {1493--1526},
month = OCT,
abstract = {In irregular all-to-all communication, messages are exchanged between every pair of processors. The message sizes vary from processor to processor and are known only at run time. This is a fundamental communication primitive in parallelizing irregularly structured scientific computations. Our algorithm reduces the total number of message start-ups. It also reduces node contention by smoothing out the lengths of the messages communicated. As compared to the earlier approaches, our algorithm provides deterministic performance and also reduces the buffer space at the nodes during message passing. The performance of the algorithm is characterised using a simple communication model of high-performance computing (HPC) platforms. We show the implementation on T3D and SP2 using C and the message passing interface standard. These can be easily ported to other HPC platforms. The results show the effectiveness of the proposed technique as well as the interplay among the machine size, the variance in message length, and the network interface.}
}
@Article{CRDow2002,
author = {C. R. Dow and J. S. Chen and M. C. Hsieh},
title = {Checkpointing {MPI} applications on symmetric multi-processor machines using {SMPC}kpt},
journal = {Journal of Systems and Software},
year = 2002,
volume = 63,
number = 2,
pages = {137--150},
month = AUG,
abstract = {Researchers from many different areas have requirements for computational power to solve their specific problems. Symmetric multi-processor (SMP) machines are also widely available and their processing capacity is in demand particularly for applications in areas such as virtual reality and multimedia. Checkpointing provides the backbone for rollback recovery (fault-tolerance), playback debugging, process migration and job swapping. Numerous checkpointing tools have been designed and implemented but few are based on SMP machines for MPI applications. This work designs, develops, and implements SMPCkpt, a checkpointing system for symmetric multi-processor environments. SMPCkpt supports a range of facilities, including transparent checkpointing, fault detection, and rollback recovery. Two coordinated checkpointing algorithms, barrier and non-barrier, are developed and implemented in SMPCkpt that can be used to reduce the execution down time in the presence of failures.}
}
@Article{GEFag2002,
author = {G. E. Fagg and J. J. Dongarra},
title = {H{ARNESS} fault tolerant {MPI} design, usage and performance issues},
journal = {Future Generation Computer Systems},
year = 2002,
volume = 18,
number = 8,
pages = {1127--1142},
month = OCT,
abstract = {Initial versions of MPI were designed to work efficiently on multi-processors which had very little job control and thus static process models. Subsequently forcing them to support a dynamic process model suitable for use on clusters or distributed systems would have reduced their performance. As current HPC collaborative applications increase in size and distribution the potential levels of node and network failures increase. This is especially true when MPI implementations are used as the communication media for GRID applications where the GRID architectures themselves are inherently unreliable thus requiring new fault tolerant MPI systems to be developed. Here we present a new implementation of MPI called FT-MPI that allows the semantics and associated modes of failures to be explicitly controlled by an application via a modified MPI API. Given is an overview of the FT-MPI semantics, design, example applications and some performance issues such as efficient group communications and complex data handling. Also briefly described is the HARNESS g-hcore system that handles low-level system operations on behalf of the MPI implementation. This includes details of plug-in services developed and their interaction with the FT-MPI runtime library.}
}
@Article{SBolu2002,
author = {S. Boluriaan and P. J. Morris},
title = {Two-dimensional simulations of wake vortex detection using radio acoustic sounding systems},
journal = {AIAA Journal},
year = 2002,
volume = 40,
number = 11,
pages = {2247--2256},
month = NOV,
abstract = {A parallel code is developed to simulate numerically wake vortex detection using a radio acoustic sounding system (RASS). The code is written in FORTRAN 90 with the message passing interface for parallel implementation. The numerical simulation solves simultaneously the linearized Euler equations for a nonuniform mean flow and the Maxwell equations for a nonhomogeneous medium. The radar transmitter and receiver antennas are modeled using an array of point sources and a beam-forming technique, respectively. Many features of the RASS are explored using the numerical simulation. First, a uniform mean flow is considered, and the RASS simulation is performed for two different types of incident acoustic field: a short single-frequency acoustic pulse and a continuous broadband acoustic source. Both monostatic and bistatic configurations are examined, and their results are compared. Taylor and Oseen vortex velocity profiles are used as sample models, and their mean flowfields are reconstructed from the backscattered electromagnetic signal using the Abel transform. The effect of radar beam width is also considered, as are the issues of nonaxisymmetric and interacting vortices.}
}
@Article{KEkic2002,
author = {K. Ekici and A. S. Lyrintzis},
title = {Parallelization of rotorcraft aerodynamics {N}avier-{S}tokes codes},
journal = {AIAA Journal},
year = 2002,
volume = 40,
number = 5,
pages = {887--896},
month = MAY,
abstract = {The modification of unsteady three-dimensional Navier-Stokes codes for application on massively parallel and distributed computing environments is investigated. Previously, the Euler mode of the Navier-Stokes code TURNS has been parallelized. For the efficient implementation of the Navier-Stokes mode of TURNS on parallel computing systems, several algorithmic changes should be made. The main modification is done on the implicit operator, lower-upper symmetric Gauss-Seidel. Two new implicit operators are used because of convergence problems of traditional operators with high cell aspect ratio grids needed for viscous calculations. Results for Navier-Stokes cases are presented for various operators. The message passing interface protocol is used because of its portability to various parallel architectures.}
}
@Article{PHave2002,
author = {P. Have},
title = {Easy{MSG}: {T}ools and techniques for an adaptive overlapping in {SPMD} programming},
journal = {Esaim-Mathematical Modelling and Numerical Analysis-Modelisation Mathematique Et Analyse Numerique},
year = 2002,
volume = 36,
number = 5,
pages = {863--882},
month = SEP # "--" # OCT,
abstract = {During the development of a parallel solver for Maxwell equations by integral formulations and Fast Multipole Method (FMM), we needed to optimize a critical part including a lot of communications and computations. Generally, many parallel programs need to communicate, but choosing explicitly the way and the instant may decrease the efficiency of the overall program. So, the overlapping of computations and communications may be a way to reduce this drawback. We will see a implementation of this techniques using dynamic and adaptive overlapping based on the EasyMSG high level C++ library over MPI, a case of SPMD programming.}
}
@Article{VESon2002,
author = {V. E. Sonzogni and A. M. Yommi and N. M. Nigro and M. A. Storti},
title = {A parallel finite element program on a {B}eowulf cluster},
journal = {Advances in Engineering Software},
year = 2002,
volume = 33,
number = {7--10},
pages = {427--443},
month = JUL # "--" # OCT,
abstract = {Some experiences on writing a parallel finite element code on a Beowulf cluster are shown. This cluster is made up of seven Pentium III processors connected by Fast Ethernet. The code was written in C++ making use of MPI as message passing library and parallel extensible toolkit for scientific computations. The code presented here is a general framework where specific applications may be written. In particular CFD applications regarding Laplace equations, Navier-Stokes and shallow water flows have been implemented. The parallel performance of this application code is assessed and several numerical results are presented.}
}
@Article{GEise2002,
author = {G. Eisenhauer and F. E. Bustamante and K. Schwan},
title = {Native data representation: {A}n efficient wire format for high-performance distributed computing},
journal = {IEEE Transactions on Parallel and Distributed Systems},
year = 2002,
volume = 13,
number = 12,
pages = {1234--1246},
month = DEC,
abstract = {New trends in high-performance software development such as tool- and component-based approaches have increased the need for flexible and high-performance communication systems. When trying to reap the well-known benefits of these approaches, the question of what communication infrastructure should be used to link the various components arises. In this context, flexibility and high-performance seem to be incompatible goals. Traditional HPC-style communication libraries, such as MPI, offer good performance, but are not intended for loosely-coupled systems. Object- and metadata-based approaches like XML offer the needed plug-and-play flexibility, but with significantly lower performance. We observe that the flexibility and baseline performance of data exchange systems are strongly determined by their wire formats, or by how they represent data for transmission in heterogeneous environments. After examining the performance implications of using a number of different wire formats, we propose an alternative approach for flexible high-performance data exchange, Native Data Representation, and evaluate its current implementation in the Portable Binary I/O library.}
}
@Article{MFeil2002,
author = {M. Feil and A. Uhl},
title = {Wavelet packet image decomposition on {MIMD} architectures},
journal = {Real-Time Imaging},
year = 2002,
volume = 8,
number = 5,
pages = {399--412},
month = OCT,
abstract = {In this work, we describe and analyze algorithms for 2D wavelet packet (WP) decomposition for multicomputers and multiprocessors. In the case of multicomputers, the main goal is the generalization of former parallel WP algorithms which are constrained to a number of processor elements equal to a power of 4. For multiprocessors, we discuss several optimizations of shared-memory algorithms and finally we compare the results obtained on multicomputers and multi-processors employing the message passing (MPI) and shared-memory programming (OpenMP) paradigm, respectively.}
}
@Article{MSala2002,
author = {M. Sala},
title = {An algebraic 2-level domain decomposition preconditioner with applications to the compressible {E}uler equations},
journal = {International Journal for Numerical Methods in Fluids},
year = 2002,
volume = 40,
number = 12,
pages = {1551--1560},
month = DEC,
abstract = {Two possible schemes to introduce the coarse grid operator will be described. Both cases have been implemented and tested in a distributed parallel environment, using the MPI library. It will be shown that for suitable values of the rank of the coarse grid operator it is possible to obtain a considerable reduction in the number of iterations compared to the Schwarz preconditioner without coarse operator.}
}
@Article{SBenk2003,
author = {S. Benkner and V. Sipkova},
title = {Exploiting distributed-memory and shared-memory parallelism on clusters of {SMP}s with data parallel programs},
journal = {International Journal of Parallel Programming},
year = 2003,
volume = 31,
number = 1,
pages = {3--19},
month = FEB,
abstract = {Clusters of SMPs are hybrid-parallel architectures that combine the main concepts of distributed-memory and shared-memory parallel machines. Although SMP clusters are widely used in the high performance computing community, there exists no single programming paradigm that allows exploiting the hierarchical structure of these machines. Most parallel applications deployed on SMP clusters are based on MPI, the standard API for distributed-memory parallel programming, and thus may miss a number of optimization opportunities offered by the shared memory available within SMP nodes. In this paper we present extensions to the data parallel programming language HPF and associated compilation techniques for optimizing HPF programs on clusters of SMPs. The proposed extensions enable programmers to control key aspects of distributed-memory and shared-memory parallelization at a high-level of abstraction. Based on these language extensions, a compiler can adopt a hybrid parallelization strategy which closely reflects the hierarchical structure of SMP clusters by automatically exploiting shared-memory parallelism based on OpenMP within cluster nodes and distributed-memory parallelism utilizing MPI across nodes. We describe the implementation of these features in the VFC compiler and present experimental results which show the effectiveness of these techniques.}
}
@Article{MSMul2002,
  author   = {M. S. Muller and E. Gabriel and M. M. Resch},
  title    = {A software development environment for {G}rid computing},
  journal  = {Concurrency and Computation-Practice \& Experience},
  year     = 2002,
  volume   = 14,
  number   = {13--15},
  month    = nov # "--" # dec,
  pages    = {1543--1551},
  abstract = {Grid computing has become a popular concept in the last few years. While in the beginning the driving force was metacomputing, the focus has now shifted towards resource management issues and concepts like ubiquitous computing. For the High-Performance Computing Center Stuttgart (HLRS) the key challenges of Grid computing have come from the demands of its users and customers. With high-speed networks in place, programmers expect to be able to exploit the overall performance of several instruments and highspeed systems for their applications. In order to meet these demands, HLRS has set out a research effort to provide these users with the necessary tools to develop and run their codes on clusters of supercomputers. This has resulted in the development of a basic Grid-computing environment for technical and scientific computing. In this paper we describe the building blocks of this software development environment and focus specifically on communication and debugging. We present the Grid-enabled MPI implementation PACX-MPI and the MPI debugger MARMOT.}
}
@Article{GMahi2002,
  author   = {G. Mahinthakumar and F. Saied},
  title    = {A hybrid {MPI}-{OpenMP} implementation of an implicit finite-element code on parallel architectures},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 2002,
  volume   = 16,
  number   = 4,
  month    = {Winter},
  pages    = {371--393},
  abstract = {The hybrid MPI-OpenMP model is a natural parallel programming paradigm for emerging parallel architectures that are based on symmetric multiprocessor (SMP) clusters. This paper presents a hybrid implementation adapted for an implicit finite-element code developed for groundwater transport simulations. The original code was parallelized for distributed memory architectures using MPI (Message Passing Interface) using a domain decomposition strategy. OpenMP directives were then added to the code (a straightforward loop-level implementation) to use multiple threads within each MPI process. To improve the OpenMP performance, several loop modifications were adopted. The parallel performance results are compared for four modern parallel architectures. The results show that for most of the cases tested, the pure MPI approach outperforms the hybrid model. The exceptions to this observation were mainly due to a limitation in the MPI library implementation on one of the architectures. A general conclusion is that while the hybrid model is a promising approach for SMP cluster architectures, at the time of this writing, the payoff may not be justified for converting all existing MPI codes to hybrid codes. However, improvements in OpenMP compilers combined with potential MPI limitations in SMP nodes may make the hybrid approach more attractive for a broader set of applications in the future.}
}
@Article{DJMav2002,
  author   = {D. J. Mavriplis},
  title    = {Parallel performance investigations of an unstructured mesh {N}avier-{S}tokes solver},
  journal  = {International Journal of High Performance Computing Applications},
  year     = 2002,
  volume   = 16,
  number   = 4,
  month    = {Winter},
  pages    = {395--407},
  abstract = {The implementation and performance of a hybrid OpenMP/MPI parallel communication strategy for an unstructured mesh computational fluid dynamics code is described. The solver is cache efficient and fully vectorizable, and is parallelized using a two-level hybrid MPI-OpenMP implementation suitable for shared and/or distributed memory architectures, as well as clusters of shared memory machines. Parallelism is obtained through domain decomposition for both communication models. Single processor computational rates as well as scalability curves are given on various architectures. For the architectures studied in this work, the OpenMP or hybrid OpenMP/MPI communication strategies achieved no appreciable performance benefit over an exclusive MPI communication strategy.}
}
@Article{IAhma2003,
  author   = {I. Ahmad},
  title    = {{HARD}: {A} hypercube embedding algorithm for state assignment of finite state machines},
  journal  = {Computers \& Electrical Engineering},
  year     = 2003,
  volume   = 29,
  number   = 2,
  month    = mar,
  pages    = {327--356},
  abstract = {To minimize the area of the combinational circuit, required to realize a finite state machine (FSM), an efficient assignment of states of the FSM to a set of binary codes is required. As to find an optimal state assignment is NP-hard, therefore heuristic approaches have been taken. One approach generates an adjacency graph from the FSM model and then tries to embed the adjacency graph onto a hypercube with an objective to minimize the cost of mapping. However, hypercube embedding itself is an NP-complete problem. In this paper we present a solution to the hypercube embedding problem by designing a new technique, designated as HARD, that is a hybrid combination of non-linear programming method and a local search. We have transformed our problem from discrete space to continuous space and have applied logarithmic barrier function method, that in turn uses gradient projection approach to minimize the objective function. Each iteration of the gradient projection method produces a valid solution. Local search is performed around solution to improve its quality by using a Kernighan-Lin style algorithm. Two distributed algorithms for the HARD, have also been designed and implemented on network of workstations under message passing interface, to speed up the search. We have carried out a large number of experiments to determine the efficiency of the HARD in terms of solution quality over many other techniques, and have obtained very promising results.}
}
@Article{AJGar2003,
  author   = {A. J. Garcia-Loureiro and J. M. Lopez-Gonzalez and T. F. Pena},
  title    = {A parallel {3D} semiconductor device simulator for gradual heterojunction bipolar transistors},
  journal  = {International Journal of Numerical Modelling-Electronic Networks Devices and Fields},
  year     = 2003,
  volume   = 16,
  number   = 1,
  month    = jan # "--" # feb,
  pages    = {53--66},
  abstract = {In this paper, we present a parallel three-dimensional semiconductor device simulator for gradual heterojunction bipolar transistor. This simulator uses the drift-diffusion transport model. The Poisson equation and continuity equations were discretized using a finite element method (FEM) on an unstructured tetrahedral mesh. Fermi-Dirac statistics is considered in our model and a compact formulation is used that makes it easy to take into account other effects such as the non-parabolic nature of the bands or the presence of various subbands in the conduction process. Domain decomposition methods were tested to solve the linear systems. We have applied this simulator to a gradual heterojunction bipolar transistor (HBT), and we present some measures of the parallel execution time for several solvers and some electrical results. This code has been implemented for distributed memory multicomputers, making use of the MPI message passing standard library and a parallel solver library.}
}
@Article{HZSha2003,
  author   = {H. Z. Shan and J. P. Singh and L. Oliker and R. Biswas},
  title    = {Message passing and shared address space parallelism on an {SMP} cluster},
  journal  = {Parallel Computing},
  year     = 2003,
  volume   = 29,
  number   = 2,
  month    = feb,
  pages    = {167--186},
  abstract = {Currently, message passing (MP) and shared address space (SAS) are the two leading parallel programming paradigms. MP has been standardized with MPI, and is the more common and mature approach; however, code development can be extremely difficult, especially for irregularly structured computations. SAS offers substantial ease of programming, but may suffer from performance limitations due to poor spatial locality and high protocol overhead. In this paper, we compare the performance of and the programming effort required for six applications under both programming models on a 32-processor PC-SMP cluster, a platform that is becoming increasingly attractive for high-end scientific computing. Our application suite consists of codes that typically do not exhibit scalable performance under shared-memory programming due to their high communication-to-computation ratios and/or complex communication patterns. Results indicate that SAS can achieve about half the parallel efficiency of MPI for most of our applications, while being competitive for the others. A hybrid MPI + SAS strategy shows only a small performance advantage over pure MPI in some cases. Finally, improved implementations of two MPI collective operations on PC-SMP clusters are presented.}
}
@Article{DBLei2003,
  author   = {D. B. Leineweber and A. Schafer and H. G. Bock and J. P. Schloder},
  title    = {An efficient multiple shooting based reduced {SQP} strategy for large-scale dynamic process optimization - {P}art {II}: {S}oftware aspects and applications},
  journal  = {Computers \& Chemical Engineering},
  year     = 2003,
  volume   = 27,
  number   = 2,
  month    = feb,
  pages    = {167--174},
  abstract = {As model based optimization techniques play a more and more important role in the chemical process industries, there is a great demand for ever more efficient and reliable process optimization software. In the first part of this paper, the theoretical aspects of a tailored multiple shooting based solution strategy for dynamic process optimization have been presented (Leineweber, Bauer, Bock \& Schloder, 2002. An efficient multiple shooting based reduced SQP strategy for large-scale dynamic process optimization-part 1: theoretical aspects). The current second part describes software aspects of the specific implementation MUSCOD-II and provides numerical results for several application examples. MUSCOD-II has been coupled with the dynamic process modeling software gPROMS via the standard equation set object (ESO) interface of CAPE-OPEN. Thereby, an advanced dynamic optimization platform for integrated batch processes has been created, where each process stage is separately modeled in gPROMS, and the multistage dynamic optimization problem is assembled and solved with MUSCOD-II. The code has also been parallelized based on the portable MPI standard. It is shown that the use of directional sensitivities becomes very important for larger problems with many algebraic variables, leading to drastically reduced computing times compared with strategies with complete constraint linearization. In addition, gPROMS ESO models are compared with classical Fortran models in terms of computational performance, and it is found that only a moderate loss of performance occurs if so-called in-process ESOs are employed. Finally, it is demonstrated that a significant speed-up can be obtained through parallel function and gradient evaluations.}
}
@Article{GLeuc2003,
  author   = {G. Luecke and H. Chen and J. Coyle and J. Hoekstra and M. Kraeva and Y. Zou},
  title    = {{MPI-CHECK}: a tool for checking {F}ortran 90 {MPI} programs},
  journal  = {Concurrency and Computation-Practice \& Experience},
  year     = 2003,
  volume   = 15,
  number   = 2,
  month    = feb,
  pages    = {93--100},
  abstract = {MPI is commonly used to write parallel programs for distributed memory parallel computers. MPI-CHECK is a tool developed to aid in the debugging of MPI programs that are written in free or fixed format Fortran 90 and Fortran 77. MPI-CHECK provides automatic compile-time and run-time checking of MPI programs. MPI-CHECK automatically detects the following problems in the use of MPI routines: (i) mismatch in argument type, kind, rank or number; (ii) messages which exceed the bounds of the source/destination array; (iii) negative message lengths; (iv) illegal MPI calls before MPI\_INIT or after MPI\_FINALIZE; (v) inconsistencies between the declared type of a message and its associated DATATYPE argument; and (vi) actual arguments which violate the INTENT attribute.}
}
@Article{ILirk2003,
  author   = {I. Lirkov},
  title    = {{MPI} solver for {3D} elasticity problems},
  journal  = {Mathematics and Computers in Simulation},
  year     = 2003,
  volume   = 61,
  number   = {3--6},
  month    = jan,
  pages    = {509--516},
  abstract = {A portable MPI parallel FEM code is developed. Numerical tests for real-life engineering problems of the geomechanics in geosciences on a number of modern parallel computers are presented. The reported speed-up and parallel efficiency well illustrate the parallel features of the proposed method and its implementation.}
}
@Article{MGole2003,
  author   = {M. Golebiewski and H. Ritzdorf and J. L. Traff and F. Zimmermann},
  title    = {The {MPI}/{SX} implementation of {MPI} for {NEC}'s {SX}-6 and other {NEC} platforms},
  journal  = {NEC Research \& Development},
  year     = 2003,
  volume   = 44,
  number   = 1,
  month    = jan,
  pages    = {69--74},
  abstract = {MPI is the standard communication interface for programming parallel applications in the message passing paradigm. MPI/SX is a dedicated, efficient and highly optimized implementation of the full MPI-2 standard for the NEC SX-series of parallel vector supercomputers. MPI/SX is also the basis for implementations of MPI for other NEC parallel platforms, for instance MPI/EX for AzusA and AsAmA, and for the Earth Simulator. This paper gives an overview of the key features and recent developments of MPI/SX. Among these are: highly optimized point-to-point and one-sided communications both within a single, shared-memory node and across nodes; optimized collective operations; efficient, vectorized handling of non-contiguous user data; and a non-trivial implementation of the MPI topology functionality. Although particular attention has been paid to efficient utilization of the vector-capabilities of the SX-machines, the architecture and optimizations of MPI/SX are immediately applicable to other NEC architectures.}
}
@Article{HUeha2003,
  author   = {H. Uehara and M. Tamura and M. Yokokawa},
  title    = {{MPI} performance measurement on the {E}arth {S}imulator},
  journal  = {NEC Research \& Development},
  year     = 2003,
  volume   = 44,
  number   = 1,
  month    = jan,
  pages    = {75--79},
  abstract = {MPI (Message Passing Interface) performance on the Earth Simulator is presented. Performance of MPI\_Send, MPI\_Barrier, MPI RMA functions, and some programs of the exchange pattern have been evaluated on the Earth Simulator using the MPI benchmark program library. Regarding MPI performance on the Earth Simulator, the maximum throughput of the intranode communication is 14.8GB/s, and that of the internode communication is 11.8GB/s. The cost of MPI\_Barrier call on the condition that the number of MPI-processes per PN is 1 is about 3.3 microseconds, and MPI\_Barrier has excellent scalability. It has been also confirmed that programming using MPI RMA functions is suitable to program complicated communication patterns such as the exchange pattern.}
}
@Article{WOhfu2003,
  author   = {W. Ohfuchi and S. Shingu and H. Fuchigami and M. Yamada},
  title    = {Dependence of the parallel performance of the atmospheric general circulation model for the {E}arth {S}imulator on problem size},
  journal  = {NEC Research \& Development},
  year     = 2003,
  volume   = 44,
  number   = 1,
  month    = jan,
  pages    = {99--103},
  abstract = {An atmospheric general circulation model (AGCM), named AFES, was extensively optimized for the Earth Simulator (ES), and achieved sustained performance of 26.58TFLOPS or 65\% of the peak performance with the full configuration of the ES under more or less ideal conditions. The sensitivity of AFES's parallel performance on problem size is measured under more practical conditions in this study. Even though the amount of computational operation of Legendre transform increases as $O(M^3)$, where $M$ is the truncated wavenumber, while that of physical parameterization does as $O(M^2)$, the $M$ dependence of the computational cost does not behave as expected from the amount of computational operation due to the vector efficiency. Some aspects of the cost of communication are also discussed. The results suggest the following. 1) AFES has been developed mainly for super-high resolution, and its coding is very effective only at high resolution. It may be difficult to make AFES very efficient at any, especially low, resolution. 2) In order to maintain high parallel efficiency on the ES, it is essential to employ MPI coding that keeps message size sufficiently large for efficient utilization of the ES's communication ability, and to keep vector length sufficiently large for efficient vector processing.}
}
@Article{PBode2003,
  author   = {P. Bode and J. P. Ostriker},
  title    = {Tree particle-mesh: {A}n adaptive, efficient, and parallel code for collisionless cosmological simulation},
  journal  = {Astrophysical Journal Supplement Series},
  year     = 2003,
  volume   = 145,
  number   = 1,
  month    = mar,
  pages    = {1--13},
  abstract = {An improved implementation of an N-body code for simulating collisionless cosmological dynamics is presented. TPM (tree particle-mesh) combines the PM method on large scales with a tree code to handle particle-particle interactions at small separations. After the global PM forces are calculated, spatially distinct regions above a given density contrast are located; the tree code calculates the gravitational interactions inside these denser objects at higher spatial and temporal resolution. The new implementation includes individual particle time steps within trees, an improved treatment of tidal forces on trees, new criteria for higher force resolution and choice of time step, and parallel treatment of large trees. TPM is compared to P$^3$M and a tree code (GADGET) and is found to give equivalent results in significantly less time. The implementation is highly portable (requiring a FORTRAN compiler and MPI) and efficient on parallel machines. The source code can be found on the World Wide Web.}
}
@Article{JPSch2003,
  author   = {J. P. Schulze and U. Lang},
  title    = {The parallelized perspective shear-warp algorithm for volume rendering},
  journal  = {Parallel Computing},
  year     = 2003,
  volume   = 29,
  number   = 3,
  month    = mar,
  pages    = {339--354},
  abstract = {In this paper, we present a new parallelized version of the perspective shear-warp algorithm. The parallelized algorithm was designed for distributed memory machines using MPI. The new algorithm takes advantage of the idea that the warp can be done in most computers' graphics hardware very fast, so that the remote parallel computer only needs to do the compositing. Our algorithm uses this idea to do the compositing on the remote machine, which transfers the resulting 2D intermediate image to the display machine. Even though the display machine can be a mid range PC or laptop computer, it can be used to display complex volumetric data, provided there is a network connection to a high performance parallel computer. Furthermore, remote rendering could be used to drive virtual environments, which typically require perspective projection and high frame rates for stereo projection and multiple screens.}
}
@Article{LChen2003,
  author   = {L. Chen and I. Fujishiro and K. Nakajima},
  title    = {Optimizing parallel performance of unstructured volume rendering for the {E}arth {S}imulator},
  journal  = {Parallel Computing},
  year     = 2003,
  volume   = 29,
  number   = 3,
  month    = mar,
  pages    = {355--371},
  abstract = {A scalable and high-performance parallel visualization subsystem has been developed in GeoFEM for the Earth Simulator (ES). As part of the ES project in Japan, the proposed subsystem is effective for the visualization of huge-scale unstructured datasets, and can be concurrent with computation on the same high-performance parallel computer. Moreover, some parallel visualization modules have obtained a good parallel performance, covering scalar, vector and tensor fields. This paper will take volume rendering method as an example to describe a number of efficient parallel performance optimization strategies we adopted for large-scale unstructured data visualization on SMP cluster machines, including suitable design of visualization method, the three-level hybrid parallelization which means message passing for inter-SMP node communication, loop directives for intra-SMP node parallelization, and vectorization for each processing element, plus dynamic load balancing. Good visualization images and high parallel performance have been achieved on the ES, thus demonstrating the feasibility and effectiveness of the proposed method.}
}
@Article{EAJoh2003,
  author   = {E. A. Johnson and C. Proppe and B. F. Spencer and L. A. Bergman and G. S. Szekely and G. I. Schueller},
  title    = {Parallel processing in computational stochastic dynamics},
  journal  = {Probabilistic Engineering Mechanics},
  year     = 2003,
  volume   = 18,
  number   = 1,
  month    = jan,
  pages    = {37--60},
  abstract = {Studying large complex problems that often arise in computational stochastic dynamics (CSD) demands significant computer power and data storage. Parallel processing can help meet these requirements by exploiting the computational and storage capabilities of multiprocessing computational environments. The challenge is to develop parallel algorithms and computational strategies that can take full advantage of parallel machines. This paper reviews some of the characteristics of parallel computing and the techniques used to parallelize computational algorithms in CSD. The characteristics of parallel processor environments are discussed, including parallelization through the use of message passing and parallelizing compilers. Several applications of parallel processing in CSD are then developed: solutions of the Fokker-Planck equation, Monte Carlo simulation of dynamical systems, and random eigenvector problems. In these examples, parallel processing is seen to be a promising approach through which to resolve some of the computational issues pertinent to CSD.}
}
@Article{ANeli2003,
  author   = {A. Nelisse and J. Maassen and T. Kielmann and H. E. Bal},
  title    = {{CCJ}: object-based message passing and collective communication in {J}ava},
  journal  = {Concurrency and Computation-Practice \& Experience},
  year     = 2003,
  volume   = 15,
  number   = {3--5},
  month    = mar # "--" # apr,
  pages    = {341--369},
  abstract = {CCJ is a communication library that adds MPI-like message passing and collective operations to Java. Rather than trying to adhere to the precise MPI syntax, CCJ aims at a clean integration of communication into Java's object-oriented framework. For example, CCJ uses thread groups to support Java's multithreading model and it allows any data structure (not just arrays) to be communicated. CCJ is implemented entirely in Java, on top of RMI, so it can be used with any Java virtual machine. The paper discusses three parallel Java applications that use collective communication. It compares the performance (on top of a Myrinet cluster) of CCJ, RMI and mpiJava versions of these applications and also compares their code complexity. A detailed performance comparison between CCJ and mpiJava is given using the Java Grande Forum MPJ benchmark suite. The results show that neither CCJ's object-oriented design nor its implementation on top of RMI impose a performance penalty on applications compared to their mpiJava counterparts. The source of CCJ is available from our Web site http://www.cs.vu.nl/manta.}
}
@Article{LTang2003,
  author   = {L. Tang and R. E. Bartels and P. C. Chen and D. D. Liu},
  title    = {Numerical investigation of transonic limit cycle oscillations of a two-dimensional supercritical wing},
  journal  = {Journal of Fluids and Structures},
  year     = 2003,
  volume   = 17,
  number   = 1,
  month    = jan,
  pages    = {29--41},
  abstract = {CFD-based aeroelastic computations are performed to investigate the effect of nonlinear aerodynamics on transonic limit cycle oscillation (LCO) characteristics of the NLR7301 airfoil section. It is found that the LCO solutions from Navier-Stokes computations deviate less from the experiment than an Euler solution but strongly depend on the employed turbulence model. The Degani-Schiff modification to the Baldwin-Lomax turbulence model provokes spurious vorticity spots causing multiple shocks which might be unphysical, while the Spalart-Allmaras turbulence model yields physically reasonable unsteady shocks. In the cases examined, smaller initial perturbations lead to larger LCO amplitudes and vice versa, in contradiction to what one might expect. The amplitude of the initial perturbation is also found to have an impact on the mean position of LCO. Also addressed in the paper are aspects of multiblock message passing interface (MPI) parallel computation techniques as related to the present problem.}
}

@Article{JHGuo2003,
  author   = {J. H. Guo and T. R. Taha},
  title    = {Parallel implementation of the split-step and the pseudospectral methods for solving higher {KdV} equation},
  journal  = {Mathematics and Computers in Simulation},
  year     = 2003,
  volume   = 62,
  number   = {1--2},
  month    = feb,
  pages    = {41--51},
  abstract = {Numerical simulations show that higher order KdV equation under certain conditions has a self-focusing singularity, which means that the solution of the equation blows up in finite time. In this paper, two numerical schemes: the split-step Fourier transform and the pseudospectral methods are used to investigate this self-focusing singularity problem. Parallel algorithms for the proposed schemes are designed and implemented. FFTW-MPI algorithm designed by Matteo Frigo and Steven Johnson is used for parallel implementation of the discrete Fourier transform (DFT). The parallel algorithms are implemented on an SGI Origin 2000 multiprocessor computer and experiments show that a considerable speedup is attained.}
}

@Article{DBore2003,
  author   = {D. Borello and A. Corsini and F. Rispoli},
  title    = {A finite element overlapping scheme for turbomachinery flows on parallel platforms},
  journal  = {Computers \& Fluids},
  year     = 2003,
  volume   = 32,
  number   = 7,
  month    = aug,
  pages    = {1017--1047},
  abstract = {Two- and three-dimensional turbomachinery flows in stationary and rotating compressor cascades are studied by using a one-level inexact explicit Schwarz method, and a cubic eddy viscosity turbulence closure. The message passing paradigm is used for the parallel implementation of the domain decomposition algorithm, allowing the solver portability on different parallel platforms. A convergence accelerator is proposed, based on a condensed cycle structure that merges the additive Schwarz iterations with the fixed point non-linear ones. The use of a stable finite element formulation on higher-order elements Q2-Q1 is addressed as a mean for retaining non-oscillatory and accurate solutions. Furthermore, the elementwise quadratic approximation is used to enable the exact implementation of higher-order integrals arising in the anisotropic turbulence closure adopted. Numerical campaigns are carried out on IBM SP2 and SP3, and CRAY T3E architectures, in order to demonstrate the portability. The accompanying performance improvement is assessed. Finally, the predicting capabilities are discussed with reference to challenging turbomachinery test cases: a transitional linear compressor cascade, and an isolated compressor rotor designed for non-free vortex operation. Convergence speed-up in such configurations is discussed.}
}
@Article{TRabc2003,
  author   = {T. Rabczuk and J. Eibl},
  title    = {Simulation of high velocity concrete fragmentation using {SPH}/{MLSPH}},
  journal  = {International Journal for Numerical Methods in Engineering},
  year     = 2003,
  volume   = 56,
  number   = 10,
  month    = mar,
  pages    = {1421--1444},
  abstract = {The simulation of concrete fragmentation under explosive loading by a meshfree Lagrangian method, the smooth particle hydrodynamics method (SPH) is described. Two improvements regarding the completeness of the SPH-method are examined, first a normalization developed by Johnson and Beissel (NSPH) and second a moving least square (MLS) approach as modified by Scheffer (MLSPH). The SPH-Code is implemented in FORTRAN 90 and parallelized with MPI. A macroscopic constitutive law with isotropic damage for fracture and fragmentation for concrete is implemented in the SPH-Code. It is shown that the SPH-method is able to simulate the fracture and fragmentation of concrete slabs under contact detonation. The numerical results from the different SPH-methods are compared with the data from tests. The good agreement between calculation and experiment suggests that the SPH-program can predict the correct maximum pressure as well as the damage of the concrete slabs. Finally the fragment distributions of the tests and the numerical calculations are compared.}
}
@Article{ACris2003,
  author   = {A. Cristobal-Salas and A. Tchernykh and J. L. Gaudiot and W. Y. Lin},
  title    = {Non-strict execution in parallel and distributed computing},
  journal  = {International Journal of Parallel Programming},
  year     = 2003,
  volume   = 31,
  number   = 2,
  month    = apr,
  pages    = {77--105},
  abstract = {This paper surveys and demonstrates the power of non-strict evaluation in applications executed on distributed architectures. We present the design, implementation, and experimental evaluation of single assignment, incomplete data structures in a distributed memory architecture and Abstract Network Machine (ANM). Incremental Structures (IS), Incremental Structure Software Cache (ISSC), and Dynamic Incremental Structures (DIS) provide non-strict data access and fully asynchronous operations that make them highly suited for the exploitation of fine-grain parallelism in distributed memory systems. We focus on split-phase memory operations and non-strict information processing under a distributed address space to improve the overall system performance. A novel technique of optimization at the communication level is proposed and described. We use partial evaluation of local and remote memory accesses not only to remove much of the excess overhead of message passing, but also to reduce the number of messages when some information about the input or part of the input is known. We show that split-phase transactions of IS, together with the ability of deferring reads, allow partial evaluation of distributed programs without losing determinacy. Our experimental evaluation indicates that commodity PC clusters with both IS and a caching mechanism, ISSC, are more robust. The system can deliver speedup for both regular and irregular applications. We also show that partial evaluation of memory accesses decreases the traffic in the interconnection network and improves the performance of MPI IS and MPI ISSC applications.}
}
@Article{VNAle2003,
  author   = {V. N. Alexandrov and I. T. Dimov and A. Karaivanova and C. J. K. Tan},
  title    = {Parallel {M}onte {C}arlo algorithms for information retrieval},
  journal  = {Mathematics and Computers in Simulation},
  year     = 2003,
  volume   = 62,
  number   = {3--6},
  month    = mar,
  pages    = {289--295},
  abstract = {The algorithms are running on a cluster of workstations under MPI and results of the experiments arising in textual retrieval of Web documents as well as comparison of the stochastic methods proposed are presented.}
}
@Article{YMLia2003,
  author   = {Y. M. Li and H. M. Lu and T. W. Tang and S. M. Sze},
  title    = {A novel parallel adaptive {M}onte {C}arlo method for nonlinear {P}oisson equation in semiconductor devices},
  journal  = {Mathematics and Computers in Simulation},
  year     = 2003,
  volume   = 62,
  number   = {3--6},
  month    = mar,
  pages    = {413--420},
  abstract = {We present a parallel adaptive Monte Carlo (MC) algorithm for the numerical solution of the nonlinear Poisson equation in semiconductor devices. Based on a fixed random walk MC method, 1-irregular unstructured mesh technique, monotone iterative method, a posterior error estimation method, and dynamic domain decomposition algorithm, this approach is developed and successfully implemented on a 16-processors (16-PCs) Linux-cluster with message-passing interface (MPI) library. To solve the nonlinear problem with MC method, monotone iterative method is applied in each adaptive loop to obtain the final convergent solution. This approach fully exploits the inherent parallelism of the monotone iterative as well as MC methods. Numerical results for p-n diode and MOSFET devices are demonstrated to show the robustness of the method. Furthermore, achieved parallel speedup and related parallel performances are also reported in this work.}
}
@Article{SCast2003,
author = "S. Castellaro and F. Mulargia",
title = "Implementing cellular automata models for earthquakes on parallel computers",
journal = "Geophysical Research Letters",
volume = "30",
number = "5",
pages = "1204",
month = MAR,
year = "2003",
abstract = "Cellular automata models require simulations on both large grids, to avoid border effects, and on a large number of realizations, to study the system properties under stationarity. Implementing the cellular automata codes on parallel computers would appear as an ideal solution. Unfortunately, the cellular automata models which are appropriate for earthquakes can only be partially parallelized because they have an intrinsically sequential component. Under extensive modeling using MPI on CRAY T3 and Origin 8300 supercomputers we show that a substantial speed-up can nevertheless be achieved by coarsening the system and making a few mild assumptions on the logical flow of the interactions among macrocells."
}
@Article{RBrig2003,
author = "R. Brightwell and R. Riesen and A. B. Maccabe",
title = "Design, implementation, and performance of {MPI} on {Portals} 3.0",
journal = "International Journal of High Performance Computing Applications",
volume = "17",
number = "1",
pages = "7--20",
month = "Spring",
year = "2003",
abstract = "This paper describes an implementation of the Message Passing Interface (MPI) on the Portals 3.0 data movement layer. Portals 3.0 provides low-level building blocks that are flexible enough to support higher-level message passing layers, such as MPI, very efficiently. Portals 3.0 is also designed to allow for programmable network interface cards to offload message processing from the host processor, allowing for the ability to overlap computation and MPI communication. We describe the basic building blocks in Portals 3.0, show how they can be put together to implement MPI, and describe the protocols of our MPI implementation. We look at several key operations within the implementation and describe the effects that a Portals 3.0 implementation has on scalability and performance. We also present preliminary performance results from our implementation for Myrinet."
}
@Article{FGarc2003,
author = "F. Garcia-Carballeira and A. Calderon and J. Carretero and J. Fernandez and J. M. Perez",
title = "The design of the {Expand} parallel file system",
journal = "International Journal of High Performance Computing Applications",
volume = "17",
number = "1",
pages = "21--37",
month = "Spring",
year = "2003",
abstract = "This article describes an implementation of MPI-IO using a new parallel file system, called Expand (Expandable Parallel File System), which is based on NFS servers. Expand combines multiple NFS servers to create a distributed partition where files are striped. Expand requires no changes to the NFS server and uses RPC operations to provide parallel access to the same file. Expand is also independent of the clients, because all operations are implemented using RPC and NFS protocols. Using this system, we can join heterogeneous servers (Linux, Solaris, Windows 2000, etc.) to provide a parallel and distributed partition. The article describes the design, implementation and evaluation of Expand with MPI-IO. This evaluation has been made in Linux clusters and compares Expand and PVFS."
}
@Article{RRabe2003,
author = "R. Rabenseifner and G. Wellein",
title = "Communication and optimization aspects of parallel programming models on hybrid architectures",
journal = "International Journal of High Performance Computing Applications",
volume = "17",
number = "1",
pages = "49--62",
month = "Spring",
year = "2003",
abstract = "Most HPC systems are clusters of shared memory nodes. Parallel programming must combine the distributed memory parallelization on the node interconnect with the shared memory parallelization inside each node. The hybrid MPI+OpenMP programming model is compared with pure MPI, compiler based parallelization, and other parallel programming models on hybrid architectures. The paper focuses on bandwidth and latency aspects, and also on whether programming paradigms can separate the optimization of communication and computation. Benchmark results are presented for hybrid and pure MPI communication. This paper analyzes the strengths and weaknesses of several parallel programming models on clusters of SMP nodes."
}
@Article{SLafl2003,
author = "S. Laflamme and J. Dompierre and F. Guibault and R. Roy",
title = "Applying {PARMETIS} to structured remeshing for industrial {CFD} applications",
journal = "International Journal of High Performance Computing Applications",
volume = "17",
number = "1",
pages = "63--76",
month = "Spring",
year = "2003",
abstract = "This paper presents the current strategy used in IP-OORT an ongoing project to extend the application domain of a C++ toolkit library for iterative mesh adaptation. OORT is a class library for sequential structured, unstructured and hybrid mesh adaptation used mainly in the context of CFD computations, that performs iterative mesh refinement, coarsening and smoothing in 3D. Extensions to parallelize mesh adaptation using PARMETIS for domain decomposition and MPI high-level communication schemes are investigated here. Numerical simulations on realistic cases show that the parallel strategy scales with problem size and the number of processors, but singular behaviors are sometimes encountered at subdomain interfaces when conflicting instructions collide."
}
@Article{MPern2003,
author = "M. Pernpointner and L. Visscher",
title = "Parallelization of four-component calculations. {II}. {Symmetry}-driven parallelization of the 4-spinor {CCSD} algorithm",
journal = "Journal of Computational Chemistry",
volume = "24",
number = "6",
pages = "754--759",
month = APR,
year = "2003",
abstract = "Given the importance of the Coupled-cluster (CC) method as an efficient and accurate way to take electron correlation into account, we extend the parallelization technique in the second part of this series also to the 4-Spinor CCSD algorithm implemented in the Dirac-Fock packages DIRAC and MOLFDIR. The present implementation is based on the availability of the transformed molecular two-electron integrals on an external storage medium. The linearity of the CC equations in these two-electron integrals is used in a parallelization strategy that is based on distribution of the two largest integral classes that carry three or four virtual spinor indices. The corresponding partial contributions to the T-1 and T-2 amplitudes are calculated on each node and added using Message Passing Interface (MPI) library calls. Although we did not employ a master/slave principle, one specific node was assigned to also perform the remaining serial parts of the algorithm. In the critical sections considerable savings in storage requirements and computer time could be achieved, and this allows for computations on larger systems in the framework of four-component theory."
}
@Article{YPana2003,
author = "Y. Pan and J. J. S. Shang and M. Guo",
title = "A scalable {HPF} implementation of a finite-volume computational electromagnetics application on a {CRAY} {T3E} parallel system",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "6",
pages = "607--621",
month = MAY,
year = "2003",
abstract = "The time-dependent Maxwell equations are one of the most important approaches to describing dynamic or wide-band frequency electromagnetic phenomena. A sequential finite-volume, characteristic-based procedure for solving the time-dependent, three-dimensional Maxwell equations has been successfully implemented in Fortran before. Due to its need for a large memory space and high demand on CPU time, it is impossible to test the code for a large array. Hence, it is essential to implement the code on a parallel computing system. In this paper, we discuss an efficient and scalable parallelization of the sequential Fortran time-dependent Maxwell equations solver using High Performance Fortran (HPF). The background to the project, the theory behind the efficiency being achieved, the parallelization methodologies employed and the experimental results obtained on the Cray T3E massively parallel computing system will be described in detail. Experimental runs show that the execution time is reduced drastically through parallel computing. The code is scalable up to 98 processors on the Cray T3E and has a performance similar to that of an MPI implementation. Based on the experimentation carried out in this research, we believe that a high-level parallel programming language such as HPF is a fast, viable and economical approach to parallelizing many existing sequential codes which exhibit a lot of parallelism."
}
@Article{FIsai2003,
author = "F. Isaila and W. F. Tichy",
title = "Clusterfile: a flexible physical layout parallel file system",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "7--8",
pages = "653--679",
month = jun # "--" # jul,
year = "2003",
abstract = "This paper presents Clusterfile, a parallel file system that provides parallel file access on a cluster of computers. We introduce a file partitioning model that has been used in the design of Clusterfile. The model uses a data representation that is optimized for multidimensional array partitioning while allowing arbitrary partitions. The paper shows how the file model can be employed for file partitioning into both physical subfiles and logical views. We also present how the conversion between two partitions of the same file is implemented using a general memory redistribution algorithm. We show how we use the algorithm to optimize non-contiguous read and write operations. The experimental results include performance comparisons with the Parallel Virtual File System (PVFS) and an MPI-IO implementation for PVFS."
}
@Article{FYuas2003,
author = "F. Yuasa and K. Tobimatsu and S. Kawabata",
title = "Parallelization of the multidimensional integration package: {DICE}",
journal = "Nuclear Instruments \& Methods in Physics Research Section A-Accelerators Spectrometers Detectors and Associated Equipment",
volume = "502",
number = "2--3",
pages = "599--601",
month = APR,
year = "2003",
abstract = "We have parallelized the multidimensional integration package DICE by distributing sample points into the processors using MPI and evaluated its performance."
}

@Article{SCDon2003,
author = "S. C. Dong and G. E. Karniadakis",
title = "P-refinement and {P}-threads",
journal = "Computer Methods in Applied Mechanics and Engineering",
volume = "192",
number = "19",
pages = "2191--2201",
year = "2003",
abstract = {P-type refinement leads to exponential decay of numerical errors for sufficiently smooth solutions and has been used effectively in turbulence and structural mechanics simulations in the context of spectral and hp finite element discretizations. However, it induces a computational cost of $O(P^{d+1})$ in $d$ dimensions, which is higher than lower-order methods. In this paper, we demonstrate that by employing multi-threading within MPI processes we manage to counter-balance the cost increase associated with P-refinement. This approach reduces effectively the wall clock time, and keeps it essentially constant as the polynomial order is increased while achieving exponential convergence rate. Since the number of threads within MPI processes can be dynamically adjusted through thread library functions, the algorithm can be readily adapted for dynamic P-refinement. The resulting hybrid MPI/threads dual-level parallelism is particularly suitable for modern supercomputers consisting of ``symmetric multiprocessor'' nodes. We demonstrate this approach in simulations of two three-dimensional fluid dynamics problems.}
}
@Article{KEkic2003,
author = "K. Ekici and A. S. Lyrintzis",
title = "A parallel {Newton-Krylov} method for {Navier-Stokes} rotorcraft codes",
journal = "International Journal of Computational Fluid Dynamics",
volume = "17",
number = "3",
pages = "225--230",
month = MAY,
year = "2003",
abstract = "The application of Krylov subspace iterative methods to unsteady three-dimensional Navier-Stokes codes on massively parallel and distributed computing environments is investigated. Previously, the Euler mode of the Navier-Stokes flow solver Transonic Unsteady Rotor Navier-Stokes (TURNS) has been coupled with a Newton-Krylov scheme which uses two Conjugate-Gradient-like (CG) iterative methods. For the efficient implementation of Newton-Krylov methods to the Navier-Stokes mode of TURNS, efficient preconditioners must be used. Parallel implicit operators are used and compared as preconditioners. Results are presented for two-dimensional and three-dimensional viscous cases. The Message Passing Interface (MPI) protocol is used, because of its portability to various parallel architectures."
}
@Article{PAmic2003,
author = "P. Amico and L. Bosi and C. Cattuto and L. Gammaitoni and F. Marchesoni and Punturo",
title = "A parallel {Beowulf}-based system for the detection of gravitational waves in interferometric detectors",
journal = "Computer Physics Communications",
volume = "153",
number = "2",
pages = "179--189",
month = JUN,
year = "2003",
abstract = "The detection, in a modern interferometric detector like Virgo, of a gravitational wave signal from a coalescing binary stellar system is an intensive computational task both for the on-line and off-line computer systems. A parallel computing scheme using the Message Passing Interface (MPI) is described. Performance results on a small scale cluster are reported."
}
@Article{SWGao2003,
author = "S. W. Gao",
title = "Linear-scaling parallelization of the {WIEN} package with {MPI}",
journal = "Computer Physics Communications",
volume = "153",
number = "2",
pages = "190--198",
month = JUN,
year = "2003",
abstract = "A parallel version of the WIEN package, the full-potential linearized Augmented Planewave (FP-LAPW) code for ab initio electron structure calculation, has been developed using the message passing interface (MPI). All time-consuming parts of the self-consistent cycle, namely, the matrix setting, the eigen-solver, and the charge density and potential generators, have been parallelized on the level of the plane-wave basis, wherever possible, and/or of atomic loops. Test calculations done on Linux commodity cluster and the IBM power3 supercomputers show that the parallel code attains nearly linear scaling for almost all the time-consuming calculations. It opens the possibility to handle large systems with the full-potential method on the parallel platforms."
}

@Article{VBlan2003,
author = "V. Blanco and P. Gonzalez and J. C. Cabaleiro and D. B. Heras and T. F. Pena and Pombo",
title = "{AVISPA}: visualizing the performance prediction of parallel iterative solvers",
journal = "Future Generation Computer Systems",
volume = "19",
number = "5",
pages = "721--733",
month = JUL,
year = "2003",
abstract = "The selection of the best method and preconditioner for solving a sparse linear system is as determinant as the efficient parallelization of the selected method. We propose a tool for helping to solve both problems on distributed memory multiprocessors using iterative methods. Based on a previously developed library of HPF and message-passing interface (MPI) codes, a performance prediction is developed and a visualization tool (AVISPA) is proposed. The tool combines theoretical features of the methods and preconditioners with practical considerations and predictions about aspects of the execution performance (computational cost, communications overhead, etc.). It offers detailed information about all the topics that can be useful for selecting the most suitable method and preconditioner. Another capability is to offer information on different parallel implementations of the code (HPF and MPI) varying the number of available processors."
}

@Article{RHReu2003,
author = "R. H. Reussner",
title = "Using {SKaMPI} for developing high-performance {MPI} programs with performance portability",
journal = "Future Generation Computer Systems",
volume = "19",
number = "5",
pages = "749--759",
month = JUL,
year = "2003",
abstract = "The current practice of developing high-performance software for parallel computers includes a tuning phase where the software's performance is optimised for a specific hardware platform. This tuning phase often is costly and results in machine-specific, hence, less portable software. In this paper we present a publicly available database providing performance data for operations of the message-passing-interface (MPI) measured on several different platforms. This allows to design MPI programs for performance and portability in early stages of software development. Considering the performance of MPI operations while designing programmes allows the software developer (a) to select the fastest implementation alternative, (b) to write performance portable software (i.e., software showing high performance on several platforms without platform-specific tuning), if possible, and (c) to quantify the tradeoff between ultimate performance and performance portability for different platforms."
}
@Article{DLazz2003,
author = "D. Lazzaro",
title = "A parallel multivariate interpolation algorithm with radial basis functions",
journal = "International Journal of Computer Mathematics",
volume = "80",
number = "7",
pages = "907--919",
month = JUL,
year = "2003",
abstract = {This paper presents an efficient and highly scalable parallel version of the Modified RBF Shepard's method presented in [5]. This method maintains the ``metric'' nature and the advantages of Shepard's method and, at the same time, improves its accuracy by exploiting the characteristics of flexibility and accuracy which have made the radial basis functions a well-established tool for multivariate interpolation. Due to its locality, this method can be easily and efficiently parallelized on a distributed memory parallel architecture. The performance of the parallel algorithm has been studied theoretically and the experimental results obtained by running its implementation on a Cray T3E parallel machine, using the MPI interface, confirm the theoretical efficiency.}
}
@Article{DShir2003,
author = "D. Shires and R. Mohan",
title = "Optimization and performance of a {Fortran} 90 {MPI}-based unstructured code on large-scale parallel systems",
journal = "Journal of Supercomputing",
volume = "25",
number = "2",
pages = "131--141",
month = JUN,
year = "2003",
abstract = "The message-passing interface (MPI) has become the standard in achieving effective results when using the message passing paradigm of parallelization. Codes written using MPI are extremely portable and are applicable to both clusters and massively parallel computing platforms. Since MPI uses the single program, multiple data (SPMD) approach to parallelism, good performance requires careful tuning of the serial code as well as careful data and control flow analysis to limit communication. We discuss optimization strategies used and their degree of success to increase performance of an MPI-based unstructured finite element simulation code written in Fortran 90. We discuss performance results based on implementations using several modern massively parallel computing platforms including the SGI Origin 3800, IBM Nighthawk 2 SMP, and Cray T3E-1200."
}
@Article{PAFar2003,
author = "P. A. Farrell and H. Ong",
title = "Factors involved in the performance of computations on {Beowulf} clusters",
journal = "Electronic Transactions on Numerical Analysis",
volume = "15",
pages = "211--224",
year = "2003",
abstract = "We comment on the relative performance of LAM, MPICH, and MVICH on a Linux cluster connected by a Gigabit Ethernet network. Since LAM and MPICH use the TCP/IP socket interface for communicating messages, it is critical to have high TCP/IP performance for these to give satisfactory results. Despite many efforts to improve TCP/IP performance, the performance graphs presented here indicate that the overhead incurred in protocol stack processing is still high. We discuss the Virtual Interface Architecture (VIA) which is intended to provide low latency, high bandwidth message-passing between user processes. Developments such as the VIA-based MPI implementation MVICH can improve communication throughput and thus give the promise of enabling distributed applications to improve performance. Finally we present some examples of how these various choices can impact the performance of an example multigrid code."
}

@Article{ARubi2003,
author = "A. Rubinstein and F. Rachidi and M. Rubinstein and B. Reusser",
title = "A parallel implementation of {NEC} for the analysis of large structures",
journal = "IEEE Transactions on Electromagnetic Compatibility",
volume = "45",
number = "2",
pages = "177--188",
month = MAY,
year = "2003",
abstract = "We present a new, parallel version of the numerical electromagnetics code (NEC). The parallelization is based on a bidimensional block-cyclic distribution of matrices on a rectangular processor grid, assuring a theoretically optimal load balance among the processors. The code is portable to any platform supporting message passing parallel environments such as message passing interface and parallel virtual machine, where it could even be executed on heterogeneous clusters of computers running on different operating systems. The developed parallel NEC was successfully implemented on two parallel supercomputers featuring different architectures to test portability. Large structures containing up to 24000 segments, which exceeds currently available computer resources were successfully executed and timing and memory results are presented. The code is applied to analyze the penetration of electromagnetic fields inside a vehicle. The computed results are validated using other numerical methods and experimental data obtained using a simplified model of a vehicle (consisting essentially of the body shell) illuminated by an electromagnetic pulse (EMP) simulator."
}
@Article{JCNoa2003,
author = "J. C. No and R. Thakur and A. Choudhary",
title = "High-performance scientific data management system",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "4",
pages = "434--447",
month = APR,
year = "2003",
abstract = "Many scientific applications have large I/O requirements, in terms of both the size of data and the number of files or data sets. Management, storage, efficient access, and analysis of this data present an extremely challenging task. Traditionally, two different solutions have been used for this task: file I/O or databases. File I/O can provide high performance but is tedious to use with large numbers of files and large and complex data sets. Databases can be convenient, flexible, and powerful but do not perform and scale well for parallel supercomputing applications. We have developed a software system, called Scientific Data Manager (SDM), that combines the good features of both file I/O and databases. SDM provides a high-level application programming interface to the user and, internally, uses a parallel file system to store real data (using various I/O optimizations available in MPI-IO) and a database to store application-related metadata. In order to support I/O in irregular applications, SDM makes extensive use of MPI-IO's noncontiguous collective I/O functions. Moreover, SDM uses the concept of a history file to optimize the cost of the index distribution using the metadata stored in database. We describe the design and implementation of SDM and present performance results with two regular applications, ASTRO3D and an Euler solver, and with two irregular applications, a CFD code called FUN3D and a Rayleigh-Taylor instability code."
}
@Article{PLina2003,
author = "P. Lin and Q. P. Guo and X. Q. Chen",
title = "A fully explicit method for incompressible flow computation",
journal = "Computer Methods in Applied Mechanics and Engineering",
volume = "192",
number = "22--24",
pages = "2555--2564",
year = "2003",
abstract = "A new formulation of the Navier-Stokes equations is introduced to solve incompressible flow problems. It keeps the benefits of the penalty method, that is, velocity and pressure can be obtained separately and no pressure-Poisson equation is involved. Unlike the penalty method the formulation is more stable or less stiff and then explicit time integration can be applied for easy implementation. No linear or nonlinear system need be solved in the method. In the case that a large number of time steps are needed a parallelization based on domain decomposition is applied to reduce the computational time. With the explicit time integration the parallel implementation and its message passing are very simple as well."
}
@Article{SDShe2003,
author = "S. D. Shellman and J. P. Lewis and K. R. Glaesemann and K. Sikorski and G. A. Voth",
title = "Massively parallel linear-scaling algorithm in an ab initio local-orbital total-energy method",
journal = "Journal of Computational Physics",
volume = "188",
number = "1",
pages = "1--15",
month = JUN,
year = "2003",
abstract = {Similar to the manner of S. Itoh et al. [Comp. Phys. Commun. 88 (1995) 173], we report implementation of a massively parallel linear-scaling algorithm into an ab initio tight-binding method called FIREBALL [Phys. Rev. B (2001)]. The use of local-orbitals yields a very sparse Hamiltonian matrix which facilitates using a linear-scaling algorithm to obtain the electronic band-structure energy. The general functional form of Kim et al. [Phys. Rev. B 52 (1995) 1640], which minimizes a functional to obtain the electronic band-structure energy, has been parallelized utilizing the conjugate gradient method. The results of this approach are reported here. In addition, the use of ``fireball'' wavefunctions, where the wavefunctions are explicitly zero beyond some cutoff, allows for pre-generating all integrals describing two- and three-center interactions. The computation of these integrals is then an easily parallelizable problem for which the results are reported. Both integral generation and the linear-scaling optimization procedures are parallelized using the standard MPI message passing interface mixed with an OpenMP strategy.}
}
@Article{NTKar2003,
author = "N. T. Karonis and B. Toonen and I. Foster",
title = "{MPICH-G2}: {A} {Grid}-enabled implementation of the {Message} {Passing} {Interface}",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "5",
pages = "551--563",
month = MAY,
year = "2003",
abstract = {Application development for distributed-computing ``Grids'' can benefit from tools that variously hide or enable application-level management of critical aspects of the heterogeneous environment. As part of an investigation of these issues, we have developed MPICH-G2, a Grid-enabled implementation of the Message Passing Interface (MPI) that allows a user to run MPI programs across multiple computers, at the same or different sites, using the same commands that would be used on a parallel computer. This library extends the Argonne MPICH implementation of MPI to use services provided by the Globus Toolkit for authentication, authorization, resource allocation, executable staging, and I/O, as well as for process creation, monitoring, and control. Various performance-critical operations, including startup and collective operations, are configured to exploit network topology information. The library also exploits MPI constructs for performance management; for example, the MPI communicator construct is used for application-level discovery of, and adaptation to, both network topology and network quality-of-service mechanisms. We describe the MPICH-G2 design and implementation, present performance results, and review application experiences, including record-setting distributed simulations.}
}
@Article{RLGra2003,
author = "R. L. Graham and S. E. Choi and D. J. Daniel and N. N. Desai and R. G. Minnich and Rasmussen",
title = "A network-failure-tolerant message-passing system for terascale clusters",
journal = "International Journal of Parallel Programming",
volume = "31",
number = "4",
pages = "285--303",
month = AUG,
year = "2003",
abstract = "The Los Alamos Message Passing Interface (LA-MPI) is an end-to-end network-failure-tolerant message-passing system designed for terascale clusters. LA-MPI is a standard-compliant implementation of MPI designed to tolerate network-related failures including I/O bus errors, network card errors, and wire-transmission errors. This paper details the distinguishing features of LA-MPI, including support for concurrent use of multiple types of network interface, and reliable message transmission utilizing multiple network paths and routes between a given source and destination. In addition, performance measurements on production-grade platforms are presented."
}
@Article{JLoho2003,
author = "J. Lohout and A. D. George",
title = "A high-performance communication service for parallel computing on distributed {DSP} systems",
journal = "Parallel Computing",
volume = "29",
number = "7",
pages = "851--878",
month = JUL,
year = "2003",
abstract = "Rapid increases in the complexity of algorithms for real-time signal processing applications have led to performance requirements exceeding the capabilities of conventional digital signal processor (DSP) architectures. Many applications, such as autonomous sonar arrays, are distributed in nature and amenable to parallel computing on embedded systems constructed from multiple DSPs networked together. However, to realize the full potential of such applications, a lightweight service for message-passing communication and parallel process coordination is needed that is able to provide high throughput and low latency while minimizing processor and memory utilization. This paper presents the design and analysis of such a service, based on the message passing interface specification, for unicast and collective communications."
}
@Article{XYZhu2003,
author = "X. Y. Zhu and L. Carin and T. Dogaru",
title = "Parallel implementation of the biorthogonal multiresolution time-domain method",
journal = "Journal of the Optical Society of America A-Optics Image Science and Vision",
volume = "20",
number = "5",
pages = "844--855",
month = MAY,
year = "2003",
abstract = "The three-dimensional biorthogonal multiresolution time-domain (Bi-MRTD) method is presented for both free-space and half-space scattering problems. The perfectly matched layer (PML) is used as an absorbing boundary condition. It has been shown that improved numerical-dispersion properties can be obtained with the use of smooth, compactly supported wavelet functions as the basis, whereas we employ the Cohen-Daubechies-Fouveau (CDF) biorthogonal wavelets. When a CDF-wavelet expansion is used, the spatial-sampling rate can be reduced considerably compared with that of the conventional finite-difference time-domain (FDTD) method, implying that larger targets can be simulated without sacrificing accuracy. We implement the Bi-MRTD on a cluster of allocated-memory machines, using the message-passing interface (MPI), such that very large targets can be modeled. Numerical results are compared with analytical ones and with those obtained by use of the traditional FDTD method."
}
@Article{PRAme2003,
author = "P. R. Amestoy and I. S. Duff and J. Y. L'Excellent and X. Y. S. Li",
title = "Impact of the implementation of {MPI} point-to-point communications on the performance of two general sparse solvers",
journal = "Parallel Computing",
volume = "29",
number = "7",
pages = "833--849",
month = JUL,
year = "2003",
abstract = "We examine the send and receive mechanisms of MPI and show how to implement message passing robustly so that performance is not significantly affected by changes to the MPI system. We discuss this within the context of two different parallel algorithms for sparse Gaussian elimination: a multifrontal solver (MUMPS), and a supernodal one (SuperLU). The performance of our initial strategies based on simple MPI point-to-point communication primitives is very sensitive to the MPI system, particularly the way MPI buffers are used. Using nonblocking communication primitives improves the performance and robustness, but at the cost of increased code complexity."
}
@article{WSZha2003,
  author   = {W. S. Zhang and G. Q. Zhang},
  title    = {Factorization synthesized-shot prestack depth migration in the helical coordinate system},
  journal  = {Chinese Journal of Geophysics-Chinese Edition},
  year     = {2003},
  month    = jul,
  volume   = {46},
  number   = {4},
  pages    = {520--525},
  abstract = {Based on the synthesized-shot prestack depth migration, a new high efficient method for the synthesized-shot prestack depth migration is proposed. It is a hybrid technique which implements wavefield extrapolation with factorization in the helical coordinate system. The wavefield extrapolation is divided into two explicit solving processes. One is a causal process, and the other is an anticausal process. Such explicit solving processes in the helical coordinate system can improve the wavefield extrapolation efficiency. Moreover, based on the phase encoding principle, the synthesized-wavefield corresponding to multiple ray parameters is encoded and stacked. Then the calculations are implemented with MPI parallel algorithm based on the ray parameters. Thus the calculation efficiency is enhanced further. After deriving the relevant formulae and analyzing the computation cost quantitatively, numerical calculations for the Marmousi complex model are carried out and results comparisons are made. The imaging results show that the method presented in this paper has the advantages of high precision and good efficiency. So it can be applied to practical data processing.}
}
@Article{GRLue2003,
author = "G. R. Luecke and M. Kraeva and L. L. Ju",
title = "Comparing the performance of {MPICH} with {C}ray's {MPI} and with {SGI}'s {MPI}",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "9",
pages = "779--802",
month = AUG,
year = "2003",
abstract = "The purpose of this paper is to compare the performance of MPICH with the vendor Message Passing Interface (MPI) on a Cray T3E-900 and an SGI Origin 3000. Seven basic communication tests which include basic point-to-point and collective MPI communication routines were chosen to represent commonly-used communication patterns. Cray's MPI performed better (and sometimes significantly better) than Mississippi State University's (MSU's) MPICH for small and medium messages. They both performed about the same for large messages, however for three tests MSU's MPICH was about 20\% faster than Cray's MPI. SGI's MPI performed and scaled better (and sometimes significantly better) than MPICH for all messages, except for the scatter test where MPICH outperformed SGI's MPI for 1 kbyte messages. The poor scalability of MPICH on the Origin 3000 suggests there may be scalability problems with MPICH."
}
@Article{JHKri2003,
author = "J. H. Kristensen and I. Farnan",
title = "Efficient solid state {NMR} powder simulations using {SMP} and {MPP} parallel computation",
journal = "Journal of Magnetic Resonance",
volume = "161",
number = "2",
pages = "183--190",
month = APR,
year = "2003",
abstract = "Methods for parallel simulation of solid state NMR powder spectra are presented for both shared and distributed memory parallel supercomputers. For shared memory architectures the performance of simulation programs implementing the OpenMP application programming interface is evaluated. It is demonstrated that the design of correct and efficient shared memory parallel programs is difficult as the performance depends on data locality and cache memory effects. The distributed memory parallel programming model is examined for simulation programs using the MPI message passing interface. The results reveal that both shared and distributed memory parallel computation are very efficient with an almost perfect application speedup and may be applied to the most advanced powder simulations."
}
% NOTE(review): the last author is recorded by surname only ("Link"); the
% initials are missing from this record -- confirm against the published paper.
@Article{NTKar2003b,
author = "N. T. Karonis and M. E. Papka and J. Binns and J. Bresnahan and J. A. Insley and D. Jones and Link",
title = "High-resolution remote rendering of large datasets in a collaborative environment",
journal = "Future Generation Computer Systems",
volume = "19",
number = "6",
pages = "909--917",
month = AUG,
year = "2003",
abstract = "In a time when computational and data resources are distributed around the globe, users need to interact with these resources and each other easily and efficient. The Grid, by definition, represents a connection of distributed resources that can be used regardless of the user's location. We have built a prototype visualization system using the Globus Toolkit, MPICH-G2, and the Access Grid in order to explore how future scientific collaborations may occur over the Grid. We describe our experience in demonstrating our system at iGrid2002, where the United States and the Netherlands were connected via a high-latency, high-bandwidth network. In particular, we focus on issues related to a Grid-based application that couples a collaboration component (including a user interface to the Access Grid) with a high-resolution remote rendering component."
}
@Article{RVald2003,
author = "R. Valdarnini",
title = "Parallelization of a treecode",
journal = "New Astronomy",
volume = "8",
number = "7",
pages = "691--710",
month = SEP,
year = "2003",
abstract = "I describe here the performance of a parallel treecode with individual particle timesteps. The code is based on the Barnes-Hut algorithm and runs cosmological N-body simulations on parallel machines with a distributed memory architecture using the MPI message-passing library. For a configuration with a constant number of particles per processor the scalability of the code was tested up to P = 128 processors on an IBM SP4 machine. In the large P limit the average CPU time per processor necessary for solving the gravitational interactions is similar to 10\% higher than that expected from the ideal scaling relation. The processor domains are determined every large timestep according to a recursive orthogonal bisection, using a weighting scheme which takes into account the total particle computational load within the timestep. The results of the numerical tests show that the load balancing efficiency L of the code is high (greater than or similar to 90\%) up to P = 32, and decreases to L similar to 80\% when P = 128. In the latter case it is found that some aspects of the code performance are affected by machine hardware, while the proposed weighting scheme can achieve a load balance as high as L similar to 90\% even in the large P limit."
}
@article{THeya2003,
  author   = {T. Hey and A. Trefethen},
  title    = {e-science and its implications},
  journal  = {Philosophical Transactions of the Royal Society of London Series A-Mathematical Physical and Engineering Sciences},
  year     = {2003},
  month    = aug,
  volume   = {361},
  number   = {1809},
  pages    = {1809--1825},
  abstract = {After a definition of e-science and the Grid, the paper begins with an overview of the technological context of Grid developments. NASA's Information Power Grid is described as an early example of a 'prototype production Grid'. The discussion of e-science and the Grid is then set in the context of the UK e-Science Programme and is illustrated with reference to some UK e-science projects in science, engineering and medicine. The Open Standards approach to Grid middleware adopted by the community in the Global Grid Forum is described and compared with community-based standardization processes used for the Internet, MPI, Linux and the Web. Some implications of the imminent data deluge that will arise from the new generation of e-science experiments in terms of archiving and curation are then considered. The paper concludes with remarks about social and technological issues posed by Grid-enabled 'collaboratories' in both scientific and commercial contexts.}
}
@article{FRonq2003,
  author   = {F. Ronquist and J. P. Huelsenbeck},
  title    = {Mr{B}ayes 3: {B}ayesian phylogenetic inference under mixed models},
  journal  = {Bioinformatics},
  year     = {2003},
  month    = aug,
  volume   = {19},
  number   = {12},
  pages    = {1572--1574},
  abstract = {MrBayes 3 performs Bayesian phylogenetic analysis combining information from different data partitions or subsets evolving under different stochastic evolutionary models. This allows the user to analyze heterogeneous data sets consisting of different data types-e.g. morphological, nucleotide, and protein- and to explore a wide variety of structured models mixing partition-unique and shared parameters. The program employs MPI to parallelize Metropolis coupling on Macintosh or UNIX clusters.}
}

@article{KBLi2003,
  author   = {K. B. Li},
  title    = {Clustal{W}-{MPI}: {C}lustal{W} analysis using distributed and parallel computing},
  journal  = {Bioinformatics},
  year     = {2003},
  month    = aug,
  volume   = {19},
  number   = {12},
  pages    = {1585--1586},
  abstract = {ClustalW is a tool for aligning multiple protein or nucleotide sequences. The alignment is achieved via three steps: pairwise alignment, guide-tree generation and progressive alignment. ClustalW-MPI is a distributed and parallel implementation of ClustalW. All three steps have been parallelized to reduce the execution time. The software uses a message-passing library called MPI (Message Passing Interface) and runs on distributed workstation clusters as well as on traditional parallel computers.}
}
@Article{SGoed2003,
author = "S. Goedecker and M. Boulet and T. Deutsch",
title = "An efficient 3-dim {FFT} for plane wave electronic structure calculations on massively parallel machines composed of multiprocessor nodes",
journal = "Computer Physics Communications",
volume = "154",
number = "2",
pages = "105--110",
month = AUG,
year = "2003",
abstract = "Three-dimensional Fast Fourier Transforms (FFTs) are the main computational task in plane wave electronic structure calculations. Obtaining a high performance on a large numbers of processors is non-trivial on the latest generation of parallel computers that consist of nodes made up of a shared memory multiprocessors. A non-dogmatic method for obtaining high performance for such 3-dim FFTs in a combined MPI/OpenMP programming paradigm will be presented. Exploiting the peculiarities of plane wave electronic structure calculations, speedups of up to 160 and speeds of up to 130 Gflops were obtained on 256 processors."
}
@Article{BGLar2003,
author = "B. G. Larwood and N. P. Weatherill and O. Hassan and K. Morgan",
title = "Domain decomposition approach for parallel unstructured mesh generation",
journal = "International Journal for Numerical Methods in Engineering",
volume = "58",
number = "2",
pages = "177--188",
month = SEP,
year = "2003",
abstract = "In this paper, a method to generate large unstructured meshes on parallel computers is demonstrated. Using the Message Passing Interface, a coarse-grained parallel harness has been developed, that allows the use of sequential generators in a parallel environment. Meshes of over 500 million elements will be shown."
}
@Article{RGJia2004,
author = "R. G. Jia and B. Sunden",
title = "Parallelization of a multi-blocked {CFD} code via three strategies for fluid flow and heat transfer analysis",
journal = "Computers \& Fluids",
volume = "33",
number = "1",
pages = "57--80",
month = JAN,
year = "2004",
abstract = "This paper reports on a parallel implementation of a general 3D multi-block CFD code. The parallelization is achieved by using three strategies. Firstly, it is done on dual-processor PC-clusters where Windows NT systems are running. A multi-thread programming model is adopted for the multi-block code, where one thread corresponds to a block. Shared-memory is used for the exchange of inner-boundaries between neighboring blocks (threads) on the same node, while WinSockets are employed for those on different nodes. Secondly, the parallelization is extended to UNIX operating system. MPI is applied for all the message passing between different processors, including those on the same node. Thirdly, Pthreads (POSIX threads), a standardized application interface for threads, are adopted to take the advantage of the shared-memory feature of the SMP nodes, while MPI is only applied for the message passing between processors on different nodes. In all the strategies, a static load-balancing method is employed for equitable distribution of computational work to specified nodes. The parameters of the present code is studied in detail to facilitate the explanation of the speedup results. Two examples are provided to show the speedup and load balancing of the parallel calculation. Detailed comparison is made to evaluate the efficiency of different strategies."
}
@Article{HLTru2003,
author = "H. L. Truong and T. Fahringer",
title = "S{CALEA}: a performance analysis tool for parallel programs",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "11--12",
pages = "1001--1025",
month = SEP,
year = "2003",
abstract = "In this paper we present SCALEA, which is a performance instrumentation, measurement, analysis, and visualization tool for parallel programs that supports post-mortem performance analysis. SCALEA currently focuses on performance analysis for OpenMP, MPI, HPF, and mixed parallel programs. It computes a variety of performance metrics based on a novel classification of overhead. SCALEA also supports multi-experiment performance analysis that allows one to compare and to evaluate the performance outcome of several experiments. A highly flexible instrumentation and measurement system is provided which can be controlled by command-line options and program directives. SCALEA can be interfaced by external tools through the provision of a full Fortran90 OpenMP/MPI/HPF frontend that allows one to instrument an abstract syntax tree at a very high-level with C-function calls and to generate source code. A graphical user interface is provided to view a large variety of performance metrics at the level of arbitrary code regions, threads, processes, and computational nodes for single- and multi-experiment performance analysis."
}
@Article{MLang2003,
author = "M. Langlais and G. Latu and J. Roman and P. Silan",
title = "Performance analysis and qualitative results of an efficient parallel stochastic simulator for a marine host-parasite system",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "11--12",
pages = "1133--1150",
month = SEP,
year = "2003",
abstract = "We are interested in a host-parasite system, i.e. the sea bass-Diplectanum aequans system. A discrete mathematical model is used to describe the dynamics of both populations. Our goal is notably to validate the model in the context of aquaculture. A deterministic numerical simulator and, recently, a stochastic simulator were developed to study this biological system. Parallelization is required because the execution times are too long. The Monte Carlo algorithm of the stochastic simulator and its three levels of parallelism are described. Analysis and performances, up to 256 processors, of a hybrid MPI/OpenMP code are then presented for a cluster of symmetric multi-processor (SMP) nodes. Qualitative results are given for the host-macroparasite system simulation."
}
@Article{PDMic2003,
author = "P. D. Michailidis and K. G. Margaritis",
title = "Performance evaluation of load balancing strategies for approximate string matching application on an {MPI} cluster of heterogeneous workstations",
journal = "Future Generation Computer Systems",
volume = "19",
number = "7",
pages = "1075--1104",
month = OCT,
year = "2003",
abstract = "In this paper, we present three parallel approximate string matching methods on a parallel architecture with heterogeneous workstations to gain supercomputer power at low cost. The first method is the static master-worker with uniform distribution strategy, the second one is the dynamic master-worker with allocation of subtexts and the third one is the dynamic master-worker with allocation of text pointers. Further, we propose a hybrid parallel method that combines the advantages of static and dynamic parallel methods in order to reduce the load imbalance and communication overhead. This hybrid method is based on the following optimal distribution strategy: the text collection is distributed proportional to workstation's speed. We evaluated and compared the performance of the four methods with clusters one, two, four, six and eight heterogeneous workstations. The experimental results demonstrate that dynamic allocation of text pointers and hybrid methods achieve better performance than the two original ones. We also present an analytical performance model for the four methods that confirms the actual behaviour of the experimental results."
}
@Article{MGove2003,
author = "M. Govett and L. Hart and T. Henderson and J. Middlecoff and D. Schaffer",
title = "The {S}calable {M}odeling {S}ystem: directive-based code parallelization for distributed and shared memory computers",
journal = "Parallel Computing",
volume = "29",
number = "8",
pages = "995--1020",
month = AUG,
year = "2003",
abstract = "A directive-based parallelization tool called the Scalable Modeling System (SMS) is described. The user inserts directives in the form of comments into existing Fortran code. SMS translates the code and directives into a parallel version that runs efficiently on shared and distributed memory high-performance computing platforms including the SGI Origin, IBM SP2, Cray T3E, Sun, and Alpha and Intel clusters. Twenty directives are available to support operations including array re-declarations, inter-process communications, loop translations, and parallel I/O operations. SMS also provides tools to support incremental parallelization and debugging that significantly reduces code parallelization time from months to weeks of effort. SMS is intended for applications using regular structured grids that are solved using finite difference approximation or spectral methods. It has been used to parallelize 10 atmospheric and oceanic models, but the tool is sufficiently general that it can be applied to other structured grids codes. Recent performance comparisons demonstrate that the Eta, Hybrid Coordinate Ocean model and Regional Ocean Modeling System model, parallelized using SMS, perform as well or better than their OpenMP or Message Passing Interface counterparts."
}
@Article{MYama2003,
author = "M. Yamashita and K. Fujisawa and M. Kojima",
title = "S{DPARA}: {S}emi{D}efinite {P}rogramming {A}lgorithm pa{RA}llel version",
journal = "Parallel Computing",
volume = "29",
number = "8",
pages = "1053--1067",
month = AUG,
year = "2003",
abstract = "The SDPA (SemiDefinite Programming Algorithm) is known as efficient computer software based on the primal-dual interior-point method for solving SDPs (SemiDefinite Programs). In many applications, however, some SDPs become larger and larger, too large for the SDPA to solve on a single processor. In execution of the SDPA applied to large scale SDPs, the computation of the so-called Schur complement matrix and its Cholesky factorization consume most of the computational time. The SDPARA (SemiDefinite Programming Algorithm paRAllel version) is a parallel version of the SDPA on multiple processors and distributed memory, which replaces these two parts by their parallel implementation using MPI and ScaLAPACK. Through numerical results, we show that the SDPARA on a PC cluster consisting of 64 processors attains high scalability for large scale SDPs without losing the stability of the SDPA."
}
% NOTE(review): the year field was empty in this record; 2003 is taken from the
% venue name ("CARS 2003"). The volume/number pair (2003/1256) may really be
% year/series-volume -- confirm against the publisher record.
@Article{NSaka,
author = "N. Sakai and N. Hata and H. Liao and T. Dohi",
title = "High performance computing for parallel rendering in surgical auto stereoscopic display and navigation",
journal = "CARS 2003: Computer Assisted Radiology and Surgery",
volume = "2003",
number = "1256",
pages = "403--407",
OPTnote = "Proceedings",
year = "2003",
abstract = "The three-dimensional medical information obtained from Magnetic Resonance Image (MRI), Xray Computed Tomography (CT), etc. is used for an operation-supporting image, and operations under image guidance have also been conducted. One of the stereoscopic methods is Integral Videography (IV), which is an animated extension of Integral Photography, reproducing a computer-generated graphical object. Though the advantage of IV has been proven in both feasibility studies and clinical applications, one of the issues still unsolved is the notable quantity of calculation that causes significant delay in rendering. Then using parallel processing method integrating Message Passing Interface (MPI) on High Performance Computer (HPC), we shortened the calculating time of IV picture by the shortest at about 0.2 s. Furthermore, by using socket communication, it transmitted to another portable note PC, which is for a display. Then, we achieved the fast presentation of autostereoscopic images seen from an arbitrary direction that was specified with the mouse from the note PC side at anywhere we can access to the network."
}
% The two-month issue is built by concatenating the standard month macros;
% a bare JUL-AUG token is read as a single undefined macro and yields no month.
@Article{XZFen2003,
author = "X. Z. Feng and D. A. Buell and J. R. Rose and P. J. Waddell",
title = "Parallel algorithms for {B}ayesian phylogenetic inference",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "7--8",
pages = "707--718",
month = JUL # "--" # AUG,
year = "2003",
abstract = "This paper describes parallel algorithms and their MPI-based parallel implementation for MCMC-based Bayesian phylogenetic inference. Bayesian phylogenetic inference is computationally expensive both in time and in memory requirements. Our variations on MCMC and their implementation were done to permit the study of large phylogenetic problems. In our approach, we can distribute either entire chains or parts of a chain to different processors, since in current models the columns of the data are independent. Evaluations on a 32-node Beowulf cluster suggest the problem scales well. A number of important points are identified, including a superlinear speedup due to more effective cache usage and the point at which additional processors slow down the process due to communication overhead."
}
% The two-month issue is built by concatenating the standard month macros;
% a bare JUL-AUG token is read as a single undefined macro and yields no month.
@Article{CFerr2003,
author = "C. Ferrari and C. Guerra and G. Zanotti",
title = "A grid-aware approach to protein structure comparison",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "7--8",
pages = "728--737",
month = JUL # "--" # AUG,
year = "2003",
abstract = "This paper concentrates on the grid implementation of software tools for the comparison of protein structures. We have developed comparison algorithms based on indexing techniques that store transformation invariant properties of the 3D protein structures into tables. The method has large memory requirements and is computationally intensive. Furthermore, the dataset needs frequent updates as new proteins are added to the Protein Data Bank. Thus a significant advantage is obtained from a computational framework such as a grid. We report on a distributed implementation of the matching procedures on a grid using Globus MPI-CH, focusing on the data partition strategy to achieve good load balancing and to minimize the number of secondary memory accesses of the out-of-core computation."
}
% The two-month issue is built by concatenating the standard month macros;
% a bare SEP-OCT token is read as a single undefined macro and yields no month.
@Article{YHuan2003,
author = "Y. Huang and H. G. Sung and S. Y. Hsieh and V. G. Yang",
title = "Large-eddy simulation of combustion dynamics of lean-premixed swirl-stabilized combustor",
journal = "Journal of Propulsion and Power",
volume = "19",
number = "5",
pages = "782--794",
month = SEP # "--" # OCT,
year = "2003",
abstract = "A comprehensive numerical study of the combustion dynamics in a lean-premixed swirl-stabilized combustor is described. The analysis treats the conservation equations in three dimensions and takes into account finite-rate chemical reactions and variable thermophysical properties. Turbulence closure is achieved using a large-eddy-simulation technique. The compressible-flow version of the Smagorinsky model is employed to describe subgrid-scale turbulent motions and their effect on large-scale structures. A level-set flamelet library approach is used to simulate premixed turbulent combustion. The governing equations and the associated boundary conditions are solved by means of a four-step Runge-Kutta scheme along with implementation of the message passing interface parallel computing architecture. The analysis allows for a detailed investigation into the interaction between turbulent flow motions and oscillatory combustion of a swirl-stabilized combustor. Several physical processes responsible for driving combustion instabilities in the chamber have been identified and quantified, including the mutual coupling between acoustic wave motions, vortex shedding, and flame oscillations. In particular, the mechanisms of energy transfer from chemical reactions in the flame zone to acoustic motions in the bulk of chamber are carefully studied."
}
% The month of issue is not recorded for this entry; the empty month field
% that was present has been dropped rather than left blank.
@Article{JHara2003,
author = "J. Har and R. E. Fulton",
title = "A parallel finite element procedure for contact-impact problems",
journal = "Engineering with Computers",
volume = "19",
number = "2--3",
pages = "67--84",
year = "2003",
abstract = "An efficient parallel finite element procedure for contact-impact problems is presented within the framework of explicit finite element analysis with the penalty method. The procedure concerned includes a parallel Belytschko-Lin-Tsay shell element generation algorithm and a parallel contact-impact algorithm based on the master-slave slideline algorithm. An element-wise domain decomposition strategy and a communication minimization strategy are featured to achieve almost perfect load balancing among processors and to show scalability of the parallel performance. Throughout this work, a prototype code, named GT-PARADYN, is developed on the IBM SP2 to implement the procedure presented, under message-passing paradigm. Some examples are provided to demonstrate the timing results of the algorithms, discussing the accuracy and efficiency of the code."
}
@Article{AGupt2003,
author = "A. Gupta and R. Ganguly and S. Chakraborty and C. Mazumdar and D. Popovic",
title = "Simulating thermal power plant processes on a message passing environment",
journal = "ISA Transactions",
volume = "42",
number = "4",
pages = "615--630",
month = OCT,
year = "2003",
abstract = "Simulators play a very important role in the operation of thermal power plants and also in the design of control systems for these plants. To cater to this requirement elaborate methodologies have been developed to simulate thermal power plant processes in an interactive way. Due to the intensive computations involved, such simulators use one or more, high through-put computers known as the simulation computers. This paper puts forward a method where parallel processing on a low latency message passing environment has been used to simulate thermal power plant processes following a modular approach. This eliminates the need of an expensive high through-put simulation computer, thus cutting down the hardware cost associated with a simulator and increasing the system reliability manifold."
}
@Article{MBeck2003,
author = "M. Becka and G. Oksa",
title = "On variable blocking factor in a parallel dynamic block-{J}acobi {SVD} algorithm",
journal = "Parallel Computing",
volume = "29",
number = "9",
pages = "1153--1174",
month = SEP,
year = "2003",
abstract = "The parallel two-sided block-Jacobi singular value decomposition (SVD) algorithm with dynamic ordering, originally proposed in [Parallel Comput. 28 (2002) 243-262], has been extended with respect to the blocking factor. Unlike the unique blocking factor l = 2p in the original algorithm running on p processors, the current blocking factor is a variable parameter that covers the values in two different regions-namely, l = p/k and l = 2kp for some integer k. Two new parallel two-sided block-Jacobi SVD algorithms with dynamic ordering are described in detail. They arise in those two regions and differ in the logical data arrangement and communication complexity of the reordering step. For the case of l = 2kp, it is proved that a designed point-to-point communication algorithm is optimal with respect to the amount of communication required per processor as well as to the amount of overall communication. Using the message passing programming model for distributed memory machines, new parallel block-Jacobi SVD algorithms were implemented on an SGI-Cray Origin 2000 parallel computer. Numerical experiments were performed on p = 12 and 24 processors using a set of six matrices of order 4000 and blocking factors l, 2 less than or equal to l less than or equal to 192. To achieve the minimal total parallel execution time, the use of a blocking factor l is an element of \{2,p,2p\} can be recommended for matrices with distinct singular values. However, for matrices with a multiple minimal singular value, the total parallel execution time may monotonically increase with l. In this case, the recommended Jacobi method with l = 2 is just the ScaLAPACK routine with some additional matrix multiplications, and it computes the SVD in one parallel iteration step."
}
@Article{IHida2003,
author = "I. Hidajat and M. Singh and K. K. Mohanty",
title = "{NMR} response of porous media by random walk algorithm: {A} parallel implementation",
journal = "Chemical Engineering Communications",
volume = "190",
number = "12",
pages = "1661--1680",
month = DEC,
year = "2003",
abstract = "NMR well logging is a popular tool in the petroleum industry to estimate porosity, specific surface area, and permeability of porous media. In this study, a random walk algorithm is used to simulate the NMR response of porous, water-saturated media, which, in turn, probes the relation between microstructure and transport. The serial implementation of the random walk algorithm is computationally very intensive for large porous samples. A parallel random walk code is developed using Message Passing Interface (MPI) in Fortran. Various domain decomposition techniques are implemented. The walker distribution across processors without domain decomposition gives the best speedup. The domain decomposition with overlapped layers requires smaller processor memory. Increasing the overlap between adjacent domains lowers the interprocessor communication and leads to improved speedup. For the given parameters, an overlap of two layers was found to be optimal. Domain decomposition along the z direction was found to be more effective than decomposition along either the x or y direction. By using the parallel random walk code, we are able to solve a 256x256x256 voxel system in less than 8 h using 32 processors on an IBM SP2 machine."
}
@Article{HHaba2003,
author = "H. Habata and M. Yokokawa and S. Kitawaki",
title = "The development of the {E}arth {S}imulator",
journal = "IEICE Transactions on Information and Systems",
volume = "E86D",
number = "10",
pages = "1947--1954",
month = OCT,
year = "2003",
abstract = {The Earth Simulator (ES), developed by the Japanese government's initiative "Earth Simulator project," is a highly parallel vector supercomputer system. In May 2002, the ES was proven to be the most powerful computer in the world by achieving 35.86 teraflops on the LINPACK benchmark and 26.58 teraflops for a global atmospheric circulation model with the spectral method. Three architectural features enabled these great achievements; vector processor, shared-memory and high-bandwidth non-blocking interconnection crossbar network. In this paper, an overview of the ES, the three architectural features and the result of performance evaluation are described particularly with its hardware realization of the interconnection among 640 processor nodes.}
}
@Article{ATChr2003,
author = "A. T. Chronopoulos and D. Grosu and A. M. Wissink and M. Benche and J. Y. Liu",
title = "An efficient 3{D} grid based scheduling for heterogeneous systems",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "9",
pages = "827--837",
month = SEP,
year = "2003",
abstract = "The cost/performance ratio of networks of workstations has been constantly improving. This trend is expected to continue in the near future. The aggregate peak rate of such systems often matches or exceeds the peak rate offered by the fastest parallel computers. This has motivated research toward using a network of computers, interconnected via a fast network (cluster system) or a simple Local Area Network (LAN) (distributed system), for high performance concurrent computations. Some of the important research issues arise such as (i) Problem partitioning and virtual interconnection topology mapping; (ii) Execution scheduling and load balancing. Past results exist for grid partitioning (into subdomains) and mapping to parallel and distributed systems. In our work we consider the problem of grid partitioning of a 3D domain arising in aircraft CFD simulations in order to schedule tasks for load balanced execution on a heterogeneous distributed system. This problem has additional restrictions on how to partition the grid. Past work for this problem were on parallel systems with only few processor configurations. We derive heuristic algorithms for: (1) homogeneous systems with any number of processors; (2) heterogeneous systems taking into account the processor speed and memory capacity. We implement our algorithms on a dedicated network of workstations (using MPI) and test them with a CFD simulation code (TURNS-Transonic Unsteady Rotor Navier Stokes)."
}
@Article{JSVet2003,
author = "J. S. Vetter and F. Mueller",
title = "Communication characteristics of large-scale scientific applications for contemporary cluster architectures",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "9",
pages = "853--865",
month = SEP,
year = "2003",
abstract = "This paper examines the explicit communication characteristics of several sophisticated scientific applications, which, by themselves, constitute a representative suite of publicly available benchmarks for large cluster architectures. By focusing on the message passing interface (MPI) and by using hardware counters on the microprocessor, we observe each application's inherent behavioral characteristics: point-to-point and collective communication, and floating-point operations. Furthermore, we explore the sensitivities of these characteristics to both problem size and number of processors. Our analysis reveals several striking similarities across our diverse set of applications including the use of collective operations, especially those collectives with very small data payloads. We also highlight a trend of novel applications parting with regimented, static communication patterns in favor of dynamically evolving patterns, as evidenced by our experiments on applications that use implicit linear solvers and adaptive mesh refinement. Overall, our study contributes a better understanding of the requirements of current and emerging paradigms of scientific computing in terms of their computation and communication demands."
}
% The particle "van" is lowercase in every author name so that BibTeX
% assigns it to the von part of the name rather than to the given names.
@Article{FvanH2003,
author = "F. van Hees and A. J. van der Steen and P. J. van Leeuwen",
title = "A parallel data assimilation model for oceanographic observations",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "15",
number = "13",
pages = "1191--1204",
month = NOV,
year = "2003",
abstract = "In this paper we describe the development of a program that aims at achieving the optimal integration of observed data in an oceanographic model describing the water transport phenomena in the Agulhas area at the tip of South Africa. Two parallel implementations, MPI and OpenMP, are described and experiments with respect to speed and scalability on a Compaq AlphaServer SC and an SGI Origin3000 are reported."
}
@Article{MBern2003,
author = "M. Bernaschi and G. Iannello and M. Lauria",
title = "Efficient implementation of reduce-scatter in {MPI}",
journal = "Journal of Systems Architecture",
volume = "49",
number = "3",
pages = "89--108",
month = AUG,
year = "2003",
abstract = "Our study has been carried out on two different platforms: an SP2 and a Myrinet interconnected cluster of Pentium PRO. However, most of the results reported here are not specific for either MPI or the platforms used, and they hold in general for any message passing programming system."
}
@Article{FJSei2003,
author = "F. J. Seinstra and D. Koelma",
title = "Incorporating memory layout in the modeling of message passing programs",
journal = "Journal of Systems Architecture",
volume = "49",
number = "3",
pages = "109--121",
month = AUG,
year = "2003",
abstract = "The effectiveness of the model is tested in a framework for automatic parallelization of image processing applications. Experiments are performed on two Beowulf-type commodity clusters, each having a different interconnection network, and a different MPI implementation. Results show that, where other models frequently fail, P-3PC correctly predicts the communication costs related to any type of domain decomposition."
}
% Abstract-only notice (see the title suffix); an entry with the same
% authors and title but different journal/pages exists as LWLia42.
% The empty month and abstract fields were dropped: empty values only
% produce "empty field" warnings and carry no information.
@Article{LWLia2003,
author = "L. W. Li and Y. J. Wang and E. P. Li",
title = "M{PI}-based parallelized precorrected {FFT} algorithm for analyzing scattering by arbitrarily shaped three-dimensional objects - {A}bstract",
journal = "Journal of Electromagnetic Waves and Applications",
volume = "17",
number = "10",
pages = "1489--1491",
year = "2003"
}
Parallel unsteady incompressible viscous flow computations using an
unstructured multigrid method
Tai, CH, JOURNAL OF COMPUTATIONAL PHYSICS, NOV 20 2003, 192, 1, pp.
277-311.

Tai, CH;Zhao, Y

Nanyang Technol Univ/Singapore/Singapore/639798
ANL libraries: 203

Abstract:
The development and validation of a parallel unstructured
non-nested multigrid method for simulation of unsteady
incompressible viscous flow is presented. The Navier-Stokes
solver is based on the artificial compressibility approach and a
higher-order characteristics-based finite-volume scheme on an
unstructured multigrid. Unsteady flow is calculated with an
implicit dual time-stepping scheme. The parallelization of the
solver is achieved by multigrid domain decomposition approach
(MG-DD), using the single program multiple data (SPMD)
programming paradigm and message-passing interface (MPI)
for communication of data. The parallel codes using single grid
and multigrid are used to simulate steady and unsteady
incompressible viscous flows over a circular cylinder for
validation and performance evaluation purposes. The speedups and
parallel efficiencies obtained by both the parallel single grid
and multigrid solvers are reasonably good for both test cases,
using up to 32 processors on the SGI Origin 2000. A maximum
speedup of 12 could be achieved on 16 processors for
high-Reynolds number unsteady viscous flow. The parallel results
obtained were compared with those using serial single grid and
multigrid codes and it remains the same as those obtained by
serial solvers and agrees well with numerical solutions obtained
by other researchers as well as experimental measurements.

@Article{NKozi2003,
author = "N. Koziris and A. Sotiropoulos and G. Goumas",
title = "A pipelined schedule to minimize completion time for loop tiling with computation and communication overlapping",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "11",
pages = "1138--1151",
month = NOV,
year = "2003",
abstract = "This paper proposes a new method for the problem of minimizing the execution time of nested for-loops using a tiling transformation. In our approach, we are interested not only in tile size and shape according to the required communication to computation ratio, but also in overall completion time. We select a time hyperplane to execute different tiles much more efficiently by exploiting the inherent overlapping between communication and computation phases among successive, atomic tile executions. We assign tiles to processors according to the tile space boundaries, thus considering the iteration space bounds. Our schedule considerably reduces overall completion time under the assumption that some part from every communication phase can be efficiently overlapped with atomic, pure tile computations. The overall schedule resembles a pipelined datapath where computations are not anymore interleaved with sends and receives to nonlocal processors. We survey the application of our schedule to modern communication architectures. We performed two sets of experimental results, one using MPI primitives over FastEthernet and one using the SISCI API over an SCI network. In both cases, the total completion time is significantly reduced."
}
@Article{MLian2003,
author = "M. Li and D. W. Walker and O. F. Rana and Y. Huang and P. T. Williams and R. C. Ward",
title = "Engineering high-performance legacy codes as {CORBA} components for problem-solving environments",
journal = "Journal of Parallel and Distributed Computing",
volume = "63",
number = "11",
pages = "1152--1163",
month = NOV,
year = "2003",
abstract = "This paper describes techniques used to leverage high-performance legacy codes as CORBA components to a distributed problem-solving environment. It first briefly introduces the software architecture adopted by the environment. Then it presents a CORBA oriented wrapper generator (COWG) which can be used to automatically wrap high-performance legacy codes as CORBA components. Two legacy codes have been wrapped with COWG. One is an MPI-based molecular dynamic simulation (MDS) code, the other is a finite element-based computational fluid dynamics (CFD) code for simulating incompressible Navier-Stokes flows. Performance comparisons between runs of the MDS CORBA component and the original MDS legacy code on a cluster of workstations and on a parallel computer are also presented. Wrapped as CORBA components, these legacy codes can be reused in a distributed computing environment. The first case shows that high-performance can be maintained with the wrapped MDS component. The second case shows that a Web user can submit a task to the wrapped CFD component through a Web page without knowing the exact implementation of the component. In this way, a user's desktop computing environment can be extended to a high-performance computing environment using a cluster of workstations or a parallel computer."
}
@Article{YChen2003,
author = "Y. Cheng and F. S. Lien and E. Yee and R. Sinclair",
title = "A comparison of large eddy simulations with a standard k-epsilon {R}eynolds-averaged {N}avier-{S}tokes model for the prediction of a fully developed turbulent flow over a matrix of cubes",
journal = "Journal of Wind Engineering and Industrial Aerodynamics",
volume = "91",
number = "11",
pages = "1301--1328",
month = NOV,
year = "2003",
abstract = "A fully developed turbulent flow over a matrix of cubes has been studied using the large eddy simulation (LES) and Reynolds-averaged Navier-Stokes (RANS) [more specifically, the standard k-epsilon model] approaches. The numerical method used in LES of an incompressible fluid flow was a second-order accurate, fully conservative discretization scheme. This scheme was used in conjunction with a dynamic semi-coarsening multigrid method applied on a staggered grid as proposed originally by Ham et al. (Proceedings of the Seventh Annual Conference of the Computational Fluid Dynamics Society of Canada, Halifax, Nova Scotia, Canada, 1999; J. Comput. Phys. 177 (2002) 117). The effects of the unresolved subgrid scales in LES are modeled using three different subgrid-scale models: namely, the standard Smagorinsky model; the dynamic model with time-averaging procedure (DMT); and, the localized dynamic model (LDM). To reduce the computational time, LES calculations were conducted on a Linux-based PC cluster using the message passing interface library. RANS calculations were performed using the STREAM code of Lien and Leschziner (Comp. Meth. Appl. Mech. Eng. 114 (1994) 123). The Reynolds number for the present flow simulations, based on the mean bulk velocity and the cube height, was 3800 which is in accordance with the experimental data of Meinders (Ph.D. Thesis, Faculty of Applied Sciences, Delft University of Technology, Delft, Netherlands, 1998). A comparison of predicted model results for mean flow and turbulence with the corresponding experimental data showed that both the LES and RANS approaches were able to predict the main characteristics of the mean flow in the array of cubes reasonably well. LES, particularly when used with LDM, was found to perform much better than RANS in terms of its predictions of the spanwise mean velocity and Reynolds stresses.
Flow structures in the proximity of a cube, such as separation at the sharp leading top and side edges of the cube, recirculation in front of the cube, and the arch-type vortex in the wake are captured by both the LES and RANS approaches. However, LES was found to give a better overall quantitative agreement with the experimental data than RANS."
}
@Article{JChee2003,
author = "J. Cheetham and F. Dehne and A. Rau-Chaplin and U. Stege and P. J. Taillon",
title = "Solving large {FPT} problems on coarse-grained parallel machines",
journal = "Journal of Computer and System Sciences",
volume = "67",
number = "4",
pages = "691--706",
month = DEC,
year = "2003",
abstract = "Fixed-parameter tractability (FPT) techniques have recently been successful in solving NP-complete problem instances of practical importance which were too large to be solved with previous methods. In this paper, we show how to enhance this approach through the addition of parallelism, thereby allowing even larger problem instances to be solved in practice. More precisely, we demonstrate the potential of parallelism when applied to the bounded-tree search phase of FPT algorithms. We apply our methodology to the k-VERTEX COVER problem which has important applications in, for example, the analysis of multiple sequence alignments for computational biochemistry. We have implemented our parallel FPT method for the k-VERTEX COVER problem using C and the MPI communication library, and tested it on a 32-node Beowulf cluster. This is the first experimental examination of parallel FPT techniques. As part of our experiments, we solved larger instances of k-VERTEX COVER than in any previously reported implementations. For example, our code can solve problem instances with k greater than or equal to 400 in less than 1.5 h."
}
@Article{FWolf2003,
author = "F. Wolf and B. Mohr",
title = "Automatic performance analysis of hybrid {MPI}/{OpenMP} applications",
journal = "Journal of Systems Architecture",
volume = "49",
number = "10--11",
pages = "421--439",
month = NOV,
year = "2003",
abstract = "The EXPERT performance-analysis environment provides a complete tracing-based solution for automatic performance analysis of MPI, OpenMP, or hybrid applications running on parallel computers with SMP nodes. EXPERT describes performance problems using a high level of abstraction in terms of execution patterns that result from an inefficient use of the underlying programming model(s). The set of predefined problems can be extended to meet application-specific needs. The analysis is carried out along three interconnected dimensions: class of performance behavior, call tree, and thread of execution. Each dimension is arranged in a hierarchy so that the user can investigate the behavior on varying levels of detail. All three dimensions are interactively accessible using a single integrated view."
}
% The double-month issue is built from the two standard month macros;
% a bare NOV-DEC would be read as one (undefined) macro name.
@Article{FChan2003,
author = "F. Chan and J. N. Cao and Y. D. Sun",
title = "High-level abstractions for message-passing parallel programming",
journal = "Parallel Computing",
volume = "29",
number = "11--12",
pages = "1589--1621",
month = NOV # "--" # DEC,
year = "2003",
abstract = "Large-scale scientific and engineering computation problems are usually complex and consequently the development of parallel programs for solving these problems is a difficult task. In this paper, we describe the graph-oriented programming (GOP) model and environment for building and evaluating parallel applications. The GOP model provides higher level abstractions for message-passing parallel programming and the software environment offers tools which can ease programmers for parallelizing, writing, and deploying scientific and engineering computing applications. We discuss the motivations and various issues in developing the model and the software environment, present the design of the system architecture and the components, and describe the evaluation of the environment implemented on top of MPI with a sample parallel scientific application program. With the support of the high-level abstractions provided by the proposed GOP environment, programming of parallel applications on various parallel architectures can be greatly simplified."
}
% The double-month issue is built from the two standard month macros;
% a bare NOV-DEC would be read as one (undefined) macro name.
@Article{MJMar2003,
author = "M. J. Martin and D. E. Singh and J. C. Mourino and F. F. Rivera and R. Doallo and J. D. Bruguera",
title = "High performance air pollution modeling for a power plant environment",
journal = "Parallel Computing",
volume = "29",
number = "11--12",
pages = "1763--1790",
month = NOV # "--" # DEC,
year = "2003",
abstract = "The aim of this work is to provide a high performance air quality simulation using the STEM-II (Sulphur Transport Eulerian Model 2) program, a large-scale pollution modeling application. First, we optimize the sequential program with the aim of increasing data locality. Then, we parallelized the program using OpenMP directives for shared memory systems, and the MPI library for distributed memory machines. Performance results are presented for a SGI O2000 multiprocessor, a Fujitsu AP3000 multicomputer and a Cluster of PCs. Experimental results show that the parallel versions of the code achieve important reductions in the CPU time needed by each simulation. This will allow us to obtain results with adequate speed and reliability for the industrial environment where it is intended to be applied."
}
@Article{BVRKu2003,
author = "B. V. R. Kumar and A. Quarteroni and L. Formaggia and D. Lamponi",
title = "On parallel computation of blood flow in human arterial network based on 1-{D} modelling",
journal = "Computing",
volume = "71",
number = "4",
pages = "321--351",
month = NOV,
year = "2003",
abstract = "In this study, parallel computation of blood flow in a 1-D model of human arterial network has been carried out employing a Taylor Galerkin Finite Element Method. Message passing interface libraries have been used on Origin 2000 SGI machine. A Greedy strategy for load-distribution has been devised and data-flow graphs necessary for parallelization have been generated. The performance of parallel implementation measured in terms of speedup and efficiency factors is found to be good. Further, the parallel code is used in simulating the propagation of pressure and velocity waveforms in our 1-D arterial model for two different inflow pressure pulses. Also, the influence of consideration of terminal resistance on pressure and velocity waveforms have been analyzed."
}
@Article{ABalo2003,
author = "A. Baloch and M. F. Webster",
title = "Distributed parallel computation for complex rotational flows of non-{N}ewtonian fluids",
journal = "International Journal for Numerical Methods in Fluids",
volume = "43",
number = "10--11",
pages = "1301--1328",
month = DEC,
year = "2003",
abstract = "Complex rotational flows of non-Newtonian fluids are simulated through finite element methods. The predictions have direct relevance to dough kneading, associated with the food industry. The context is taken as two-dimensional and one of stirring material within a cylindrical vessel. Three stirrer shapes are considered, placed in eccentric location with respect to the cylinder centre. The motion is driven by the rotation of the outer vessel wall. Variation with change in rheology and change in stirrer shapes are analysed, with respect to flow kinematics, stress fields, rate-of-work and power consumed. Computations are performed for Newtonian, shear-thinning and viscoelastic fluids, at various viscosity levels to gradually approximate more realistic dough-like response. For viscoelastic fluids, Phan-Thien/Tanner constitutive models are adopted. The numerical method employed is based on a finite element semi-implicit time-stepping Taylor-Galerkin/pressure-correction scheme, posed in a cylindrical polar co-ordinate system. Simulations are conducted via distributed parallel processing, performed on a networked cluster of workstations, employing message passing. Parallel performance timings are compared against those obtained working in sequential mode. Ideal linear speed-up with the number of processors is observed for viscoelastic flows under this coarse-grained implementation."
}
% The double-month issue is built from the two standard month macros;
% a bare NOV-DEC would be read as one (undefined) macro name.
@Article{YIwam2003,
author = "Y. Iwamoto and K. Suga and K. Ootsu and T. Yokota and T. Baba",
title = "Receiving message prediction method",
journal = "Parallel Computing",
volume = "29",
number = "11--12",
pages = "1509--1538",
month = NOV # "--" # DEC,
year = "2003",
abstract = "This paper proposes and evaluates the Receiving Message Prediction Method for high performance message passing. In this method, a node in the idle state predicts the next message reception, and speculatively executes the message reception and user processes. This method is independent of underlying computer architecture and message passing libraries. We propose the algorithms for the message prediction, and evaluate them from the viewpoint of the success ratio and speed-ups. We use the NAS parallel benchmark programs as typical parallel applications running on two different types of parallel platforms, i.e., a workstation cluster and a shared memory multiprocessor. The experimental results show that the method can be applied to various platforms. The method can also be implemented just by changing the software inside their message passing libraries without any support from the underlying system software or hardware. This mean that we do not require any change of application software that uses the libraries. The application of the method to the message passing interface libraries achieves a speed-up of 6.8\% for the NAS Parallel Benchmarks, and the static and dynamic selection of prediction methods based on profiling results improve the performance."
}
% The issue is dated by season, for which BibTeX has no predefined
% macro, so the month field holds a literal string.
@Article{JSmit2003,
author = "J. Smith and A. Gounaris and P. Watson and N. W. Paton and A. A. A. Fernandes and R. Sakellariou",
title = "Distributed query processing on the grid",
journal = "International Journal of High Performance Computing Applications",
volume = "17",
number = "4",
pages = "353--367",
month = "Winter",
year = "2003",
abstract = "Distributed query processing (DQP) has been widely used in data intensive applications where data of relevance to users are stored at multiple locations. This paper argues: (i) that DQP can be important in the Grid, as a means of providing high-level, declarative languages for integrating data access and analysis; and (ii) that the Grid provides resource management facilities that are useful to developers of DQP systems. As well as discussing and illustrating how DQP technologies can be deployed within the Grid, the paper describes Polar*, a prototype implementation of a DQP system running over Globus. Polar* can handle complex data by adopting the ODMG object model and its query language OQL, which supports the invocation of user-defined operations. The Globus components are accessed through the MPICH-G interface rather than in a lower level way. A case study from bioinformatics is used throughout the paper, to show the benefits of the approach."
}

% NOTE(review): the source record truncated the author list after a bare
% "Brown"; the initials below and the remaining authors (represented by
% "and others") should be confirmed against the published article.
% The empty number field was dropped (the record gives no issue number).
@Article{RAFor2004,
author = "R. A. Forster and L. J. Cox and R. F. Barrett and T. E. Booth and J. F. Briesmeister and F. B. Brown and others",
title = "{MCNP} ({TM}) {V}ersion 5",
journal = "Nuclear Instruments \& Methods in Physics Research Section B-Beam Interactions with Materials and Atoms",
volume = "213",
pages = "82--86",
month = JAN,
year = "2004",
abstract = "The Monte Carlo transport workhorse, MCNP [Los Alamos National Laboratory report LA-13709-M, 2000], is undergoing a massive renovation at Los Alamos National Laboratory (LANL) in support of the Eolus Project of the Advanced Simulation and Computing (ASCI) Program. MCNP 1 Version 5 (V5) (expected to be released to RSICC in Fall 2002) will consist of a major restructuring from FORTRAN-77 (with extensions) to ANSI-standard FORTRAN90 [American National Standard for Programming Language - Fortran-Extended, ANSI X3. 198-1992,1992] with support for all of the features available in the present release (MCNP-4C2/4C3). To most users, the look-and-feel of MCNP will not change much except for the improvements (improved graphics, easier installation, better online documentation). For example, even with the major format change, full support for incremental patching will still be provided. In addition to the language and style updates, MCNP V5 will have various new user features. These include improved photon physics, neutral particle radiography, enhancements and additions to variance reduction methods, new source options, improved parallelism support (PVM, MPI, OpenMP), and new nuclear and atomic data libraries."
}

@Article{GBron2003,
author = "G. Bronevetsky and D. Marques and K. Pingali and P. Stodghill",
title = "Automated application-level checkpointing of {MPI} programs",
journal = "ACM Sigplan Notices",
volume = "38",
number = "10",
pages = "84--94",
month = OCT,
year = "2003",
abstract = "We then present a suitable protocol, which is implemented by a co-ordination layer that sits between the application program and the MPI library. We show how this protocol can be used with a precompiler that instruments C/MPI programs to save application and MPI library state. An advantage of our approach is that it is independent of the MPI implementation. We present experimental results that argue that the overhead of using our system can be small."
}
@Article{AKarw2003,
author = "A. Karwande and X. Yuan and D. K. Lowenthal",
title = "C{C}-{MPI}: {A} compiled communication capable {MPI} prototype for ethernet switched clusters",
journal = "ACM Sigplan Notices",
volume = "38",
number = "10",
pages = "95--106",
month = OCT,
year = "2003",
abstract = "Compiled communication has recently been proposed to improve communication performance for clusters of workstations. The idea of compiled communication is to apply more aggressive optimizations to communications whose information is known at compile time. Existing MPI libraries do not support compiled communication. In this paper, we present an MPI prototype, CC-MPI, that supports compiled communication on Ethernet switched clusters. The unique feature of CC-MPI is that it allows the user to manage network resources such as multicast groups directly and to optimize communications based on the availability of the communication information. CC-MPI optimizes one-to-all, one-to-many, all-to-all, and many-to-many collective communication routines using the compiled communication technique. We describe the techniques used in CC-MPI and report its performance. The results show that communication performance of Ethernet switched clusters can be significantly improved through compiled communication."
}
@Article{SJDei2003,
author = "S. J. Deitz and B. L. Chamberlain and S. E. Choi and L. Snyder",
title = "The design and implementation of a parallel array operator for the arbitrary remapping of data",
journal = "ACM Sigplan Notices",
volume = "38",
number = "10",
pages = "154--165",
month = OCT,
year = "2003",
abstract = "Gather and scatter are data redistribution functions of long-standing importance to high performance computing. In this paper, we present a highly-general array operator with powerful gather and scatter capabilities unmatched by other array languages. We discuss an efficient parallel implementation, introducing three new optimizations-schedule compression, dead array reuse, and direct communication-that reduce the costs associated with the operator's wide applicability. In our implementation of this operator in ZPL, we demonstrate performance comparable to the hand-coded Fortran + MPI versions of the NAS FT and CC benchmarks."
}
@Article{SSaun2003,
author = "S. Saunders and L. Rauchwerger",
title = "A{RMI}: {A}n adaptive, platform independent communication library",
journal = "ACM Sigplan Notices",
volume = "38",
number = "10",
pages = "229--240",
month = OCT,
year = "2003",
abstract = "ARMI is a communication library that provides a framework for expressing fine-grain parallelism and mapping it to a particular machine using shared-memory and message passing library calls. The library is an advanced implementation of the RMI protocol and handles low-level details such as scheduling incoming communication and aggregating outgoing communication to coarsen parallelism when necessary. These details can be tuned for different platforms to allow user codes to achieve the highest performance possible without manual modification. ARMI is used by STAPL, our generic parallel library, to provide a portable, user transparent communication layer. We present the basic design as well as the mechanisms used in the current Pthreads/OpenMP, MPI implementations and/or a combination thereof. Performance comparisons between ARMI and explicit use of Pthreads or MPI are given on a variety of machines, including an HP V2200, SGI Origin 3800, IBM Regatta-HPC and IBM RS6000 SP cluster."
}
@Article{GRLue2004,
author = "G. R. Luecke and M. Kraeva and J. Yuan and S. Spanoyannis",
title = "Performance and scalability of {MPI} on {PC} clusters",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "1",
pages = "79--107",
month = JAN,
year = "2004",
abstract = "The purpose of this paper is to compare the communication performance and scalability of MPI communication routines on a Windows Cluster, a Linux Cluster, a Cray T3E-600, and an SGI Origin 2000. All tests in this paper were run using various numbers of processors and two message sizes. In spite of the fact that the Cray T3E-600 is about 7 years old, it performed best of all machines for most of the tests. The Linux Cluster with the Myrinet interconnect and Myricom's MPI performed and scaled quite well and, in most cases, performed better than the Origin 2000, and in some cases better than the T3E. The Windows Cluster using the Giganet Full Interconnect and MPI/Pro's MPI performed and scaled poorly for small messages compared with all of the other machines."
}
@Article{ZYina2004,
author = "Z. Yin and H. J. H. Clercx and D. C. Montgomery",
title = "An easily implemented task-based parallel scheme for the {F}ourier pseudospectral solver applied to 2{D} {N}avier-{S}tokes turbulence",
journal = "Computers \& Fluids",
volume = "33",
number = "4",
pages = "509--520",
month = MAY,
year = "2004",
abstract = "An efficient parallel scheme is proposed for performing direct numerical simulation (DNS) of two-dimensional Navier-Stokes turbulence at high Reynolds numbers. We illustrate the resulting numerical code by displaying relaxation to states close to those that have been predicted by statistical-mechanical methods which start from ideal (Euler) fluid mechanics. The validation of these predictions by DNS requires unusually long computation times on single-cpu workstations, and suggests the use of parallel computation. The performance of our MPI Fortran 90 code on the SGI Origin 3800 is reported, together with its comparison with another parallel method. A few computational results that illustrate tests of the statistical-mechanical predictions are presented."
}
% Same authors and title as the abstract-only notice LWLia2003, but with
% different journal/volume/pages; no abstract text is recorded for it.
@article{LWLia42,
  author   = {L. W. Li and Y. J. Wang and E. P. Li},
  title    = {M{PI}-based parallelized precorrected {FFT} algorithm for analyzing scattering by arbitrarily shaped three-dimensional objects},
  journal  = {Electromagnetic Waves},
  volume   = {42},
  pages    = {247--259},
  year     = {2003},
  abstract = {(none available)}
}
@Article{PThul2004,
author = "P. Thulasiraman and A. A. Khokhar and G. Heber and G. R. Gao",
title = "A fine-grain load-adaptive algorithm of the 2{D} discrete wavelet transform for multithreaded architectures",
journal = "Journal of Parallel and Distributed Computing",
volume = "64",
number = "1",
pages = "68--78",
month = JAN,
year = "2004",
abstract = "In this paper we develop a load-adaptive multithreaded algorithm to compute 2D Discrete Wavelet Transform (DWT) and its implementation on a fine-grain multithreading platform. In a 2D DWT computation, the problem sizes reduces at every decomposition level and the length of the emerging computation paths also vary. The parallel algorithm proposed in this paper, dynamically scales itself to the varying problem size. During any iteration, the ratio of the number of local threads to the number of remote threads issued by a processor can be adjusted to be greater than 1 by controlling the algorithm parameters. This approach provides an opportunity to interleave computation and communication without explicitly introducing idle cycles on waiting for the remote threads to finish. Experimental results are reported based on the implementations of the proposed algorithm on a 20 node emulated multithreaded platform, EARTH-MANNA, specifically designed for fine-grain multithreaded paradigms. We show that multithreading implementations of the proposed algorithm are at least 2 times faster than the MPI-based message passing implementations reported in the literature, assuming the same processor speed. We further show that the proposed algorithm and implementations scale linearly with respect to problem and machine sizes."
}
@Article{JDubi2004,
author = "J. Dubinski and J. Kim and C. Park and R. Humble",
title = "{GOTPM}: a parallel hybrid particle-mesh treecode",
journal = "New Astronomy",
volume = "9",
number = "2",
pages = "111--126",
month = FEB,
year = "2004",
abstract = "We describe a parallel, cosmological N-body code based on a hybrid scheme using the particle-mesh (PM) and Barnes-Hut (BH) oct-tree algorithm. We call the algorithm GOTPM for Grid-of-Oct-Trees-Particle-Mesh. The code is parallelized using the Message Passing Interface (MPI) library and is optimized to run on Beowulf clusters as well as symmetric multi-processors. The gravitational potential is determined on a mesh using a standard PM method with particle forces determined through interpolation. The softened PM force is corrected for short range interactions using a grid of localized BH trees throughout the entire simulation volume in a completely analogous way to P$^3$M methods. This method makes no assumptions about the local density for short range force corrections and so is consistent with the results of the P$^3$M method in the limit that the treecode opening angle parameter, $\theta \to 0$. The PM method is parallelized using one-dimensional slice domain decomposition. Particles are distributed in slices of equal width to allow mass assignment onto mesh points. The Fourier transforms in the PM method are done in parallel using the MPI implementation of the FFTW package. Parallelization for the tree force corrections is achieved again using one-dimensional slices but the width of each slice is allowed to vary according to the amount of computational work required by the particles within each slice to achieve load balance. The tree force corrections dominate the computational load and so imbalances in the PM density assignment step do not impact the overall load balance and performance significantly. The code performance scales well to 128 processors and is significantly better than competing methods. We present preliminary results from simulations run on different platforms containing up to N=1G particles to verify the code."
}
@Article{SGorl2004,
author = "S. Gorlatch",
title = "Send-receive considered harmful: {M}yths and realities of message passing",
journal = "ACM Transactions on Programming Languages and Systems",
volume = "26",
number = "1",
pages = "47--56",
month = JAN,
year = "2004",
abstract = {During the software crisis of the 1960s, Dijkstra's famous thesis ``goto considered harmful'' paved the way for structured programming. This short communication suggests that many current difficulties of parallel programming based on message passing are caused by poorly structured communication, which is a consequence of using low-level send-receive primitives. We argue that, like goto in sequential programs, send-receive should be avoided as far as possible and replaced by collective operations in the setting of message passing. We dispute some widely held opinions about the apparent superiority of pairwise communication over collective communication and present substantial theoretical and empirical evidence to the contrary in the context of MPI (Message Passing Interface).}
}
@Article{CWKes2004,
author = "C. W. Kessler",
title = "Managing distributed shared arrays in a bulk-synchronous parallel programming environment",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "2--3",
pages = "133--153",
month = feb # "--" # mar,
year = "2004",
abstract = "NestStep is a parallel programming language for the BSP (bulk-synchronous parallel) programming model. In this article we describe the concept of distributed shared arrays in NestStep and its implementation on top of MPI. In particular, we present a novel method for runtime scheduling of irregular, direct remote accesses to sections of distributed shared arrays. Our method, which is fully parallelized, uses conventional two-sided message passing and thus avoids the overhead of a standard implementation of direct remote memory access based on one-sided communication. The main prerequisite is that the given program is structured in a BSP-compliant way."
}
@Article{SBenk2004,
author = "S. Benkner and T. Brandes",
title = "Compiling data-parallel programs for clusters of {SMP}s",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "2--3",
pages = "111--132",
month = feb # "--" # mar,
year = "2004",
abstract = "Clusters of shared-memory multiprocessors (SMPs) have become the most promising parallel computing platforms for scientific computing. However, SMP clusters significantly increase the complexity of user application development when using the low-level application programming interfaces MPI and OpenMP, forcing users to deal with both distributed-memory and shared-memory parallelization details. In this paper we present extensions of High Performance Fortran (HPF) for SMP clusters which enable the compiler to adopt a hybrid parallelization strategy, efficiently combining distributed-memory with shared-memory parallelism. By means of a small set of new language features, the hierarchical structure of SMP clusters may be specified. This information is utilized by the compiler to derive inter-node data mappings for controlling distributed-memory parallelization across the nodes of a cluster and intra-node data mappings for extracting shared-memory parallelism within nodes. Additional mechanisms are proposed for specifying inter- and intra-node data mappings explicitly, for controlling specific shared-memory parallelization issues and for integrating OpenMP routines in HPF applications. The proposed features have been realized within the ADAPTOR and VFC compilers. The parallelization strategy for clusters of SMPs adopted by these compilers is discussed as well as a hybrid-parallel execution model based on a combination of MPI and OpenMP. Experimental results indicate the effectiveness of the proposed features."
}
@Article{TRaub2004,
author = {T. Rauber and R. Reilein and G. R{\"u}nger},
title = "Group-{SPMD} programming with orthogonal processor groups",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "2--3",
pages = "173--195",
month = feb # "--" # mar,
year = "2004",
abstract = "Many programs for message-passing machines can benefit from an implementation in a group-SPMD programming model due to the potential to reduce communication overhead and to increase scalability. In this paper, we consider group-SPMD programs exploiting different orthogonal processor partitions in one program. For each program this is a fixed set of predefined processor partitions given by the parallel hyperplanes of a two- or multi-dimensional virtual processor organization. We introduce a library built on top of MPI to support the programming with those orthogonal processor groups. The parallel programming model is appropriate for applications with a multi-dimensional task grid and task dependencies mainly aligned in the dimensions of the task grid. The library can be used to specify the appropriate processor partitions, which are then created by the library, and to define the mapping of tasks to the processor hyperplanes. Examples from numerical analysis illustrate the programming style and show that the runtime on distributed memory machines can be considerably reduced by using the library."
}
@Article{DJQui2004,
author = "D. J. Quinlan and M. Schordan and B. Miller and M. Kowarschik",
title = "Parallel object-oriented framework optimization",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "2--3",
pages = "293--302",
month = feb # "--" # mar,
year = "2004",
abstract = "Sophisticated parallel languages are difficult to develop; most parallel distributed memory scientific applications are developed using a serial language, expressing parallelism through third party libraries (e.g. MPI). As a result, frameworks and libraries are often used to encapsulate significant complexities. We define a novel approach to optimize the use of libraries within applications. The resulting tool, named ROSE, leverages the additional semantics provided by library-defined abstractions enabling library specific optimization of application codes. It is a common perception that performance is inversely proportional to the level of abstraction. Our work shows that this is not the case if the additional semantics can be leveraged. We show how ROSE can be used to leverage the semantics within the compile-time optimization."
}
@Article{PRao2004,
author = "P. Rao",
title = "A parallel hydrodynamic model for shallow water equations",
journal = "Applied Mathematics and Computation",
volume = "150",
number = "1",
pages = "291--302",
month = FEB,
year = "2004",
abstract = "A parallel implementation of a finite difference model for solving two-dimensional, time-dependent, open channel flows is presented. The algebraic equations resulting from the finite difference discretization of the two dimensional shallow water flow equations are solved by using explicit MacCormack scheme. The parallel code has been implemented on distributed-shared memory system, by using domain decomposition techniques. The message passing interface (MPI) protocols are incorporated for inter processor data communication. The effect of using two different geometry partitions is investigated. A comparison of the wallclock time of the code between these two partitions is made, and code performances with respect to different number of processors are presented."
}

@Article{PFLiu2003,
author = "P. F. Liu and K. Li",
title = "Performance analysis of a {BiCGSTAB} solver for multiple-marine-propeller simulation with several {MPI} libraries and platforms",
journal = "High Performance Scientific and Engineering Computing: Hardware/Software Support",
volume = "750",
pages = "63--78",
year = "2003"
}

@Article{AJian2004,
author = "A. Jiang and S. Y. Shi and G. Jin and D. W. Prather",
title = "Performance analysis of three dimensional high index contrast dielectric waveguides",
journal = "Optics Express",
volume = "12",
number = "4",
pages = "633--643",
month = FEB,
year = "2004",
abstract = "This paper presents the implementation of a parallelized Finite-Difference Time-Domain method, based on the Message Passing Interface (i.e. MPI), which is used to study the modal properties of three-dimensional (3D) dielectric waveguide structures. To this end, we also use the least-square method to obtain the wave vector, beta, along the axis of propagation. Lastly, bending losses in arbitrary-angle waveguides are also discussed."
}
@Article{GAlte2004,
author = "G. Altekar and S. Dwarkadas and J. P. Huelsenbeck and F. Ronquist",
title = "Parallel {M}etropolis coupled {M}arkov chain {M}onte {C}arlo for {B}ayesian phylogenetic inference",
journal = "Bioinformatics",
volume = "20",
number = "3",
pages = "407--415",
month = FEB,
year = "2004",
abstract = "Results: This paper presents a parallel algorithm for (MC)$^3$. The proposed parallel algorithm retains the ability to explore multiple peaks in the posterior distribution of trees while maintaining a fast execution time. The algorithm has been implemented using two popular parallel programming models: message passing and shared memory. Performance results indicate nearly linear speed improvement in both programming models for small and large datasets."
}
@Article{AQCui2004,
author = "A. Q. Cui and R. L. Street",
title = "Large-eddy simulation of coastal upwelling flow",
journal = "Environmental Fluid Mechanics",
volume = "4",
number = "2",
pages = "197--223",
month = JUN,
year = "2004",
abstract = "Large-eddy simulations were carried out to study laboratory-scale realizations of coastal upwelling in an annular rotating tank with a sloping bottom. A two-layer stratified fluid was set into rigid body motion with the tank and then driven by the relative rotation of a solid top lid. The simulation code developed in this work was a three-dimensional incompressible Navier-Stokes solver using the message passing interface. The simulation runs were performed on a distributed memory massively parallel computer, namely, the IBM SP2. The simulation results were able to reveal the evolution of the complex upwelling structures in detail. The results were used to compare with and to complement two relevant series of coastal upwelling experiments. A Rayleigh-Taylor type of instability took place in the top inversion layer due to the unstable stratification after establishment of the upwelling front. The primary upwelling front was unstable to azimuthal perturbations and developed large amplitude baroclinic waves. The frontal wave structure consists of cyclone/anticyclone pairs. Whether cyclonic eddies containing the lower-layer fluid pinch off from the front depends on the theta(*) value. The non-dimensional parameter theta(*) = g(1) h(0)/u(*)flambda(s), which was first introduced by Narimousa and Maxworthy, combines the effects of stratification, rotation and surface stress and can be used to characterize the upwelling flow field. Our studies show that the frontal instabilities are much more intense and the upwelling front itself displays strong unsteadiness and cyclonic eddies containing the lower-layer fluid pinch off from the front when theta(*) is significantly less than 5.8. For theta(*) = 5.8, the frontal instabilities are less intense and no pinched-off process is observed. To separate these regimes, a critical value of theta(*) of about 5.4 is consistent with Narimousa and Maxworthy's results."
}
@Article{BKalu2004,
author = "B. Kaludercic",
title = "Parallelisation of the {L}agrangian model in a mixed {E}ulerian-{L}agrangian {CFD} algorithm",
journal = "Journal of Parallel and Distributed Computing",
volume = "64",
number = "2",
pages = "277--284",
month = FEB,
year = "2004",
abstract = "This manuscript presents an algorithm implemented in a commercial computational fluid dynamics (CFD) code for parallelisation of the Lagrangian particle tracking model in a mixed Eulerian-Lagrangian CFD algorithm. The algorithm is based on the domain decomposition parallelisation strategy and asynchronous message passing protocol. The methodology is tested on two industrial CFD test cases and the parallelisation results are presented. Further, it is discussed how the parallel efficiency of the runs can be improved by adopting the domain decomposition scattering technique."
}
@Article{ACBur2004,
author = "A. C. Burt and I. B. Celik and R. S. Gemmen and A. V. Smirnov",
title = "A numerical study of cell-to-cell variations in a {SOFC} stack",
journal = "Journal of Power Sources",
volume = "126",
number = "1--2",
pages = "76--87",
month = FEB,
year = "2004",
abstract = "A numerical investigation of cell-to-cell voltage variation is performed by considering the impact of flow distribution and heat transfer on a SOFC stack. The stack model used is based on a one-dimensional co-flow cell model developed in prior work. The influence of radiative heat transfer between the PEN (positive electrode, electrolyte, negative electrode body) and the neighboring separator plates on the temperature distribution is also considered. Variations in cell voltage are attributed to asymmetries in stack geometry (boundary effects) and non-uniformity in flow rates, more particularly, flow thermal capacity. Simulations were done in a parallel computing environment with each cell computed in a separate (CPU) process. This natural decomposition of the fuel cell stack reduced the number of communicated variables thereby improving computational performance. The parallelization scheme implemented utilized a message passing interface (MPI) protocol where cell-to-cell communication is achieved via exchange of temperature and thermal fluxes between neighboring cells."
}
@Article{SCDon2004,
author = "S. C. Dong and G. E. Karniadakis",
title = "Dual-level parallelism for high-order {CFD} methods",
journal = "Parallel Computing",
volume = "30",
number = "1",
pages = "1--20",
month = JAN,
year = "2004",
abstract = "A hybrid two-level parallel paradigm with MPI/OpenMP is presented in the context of high-order methods and implemented in the spectral/hp element framework to take advantage of the hierarchical structures arising from deterministic and stochastic CFD problems. We take a coarse grain approach to OpenMP shared-memory parallelization and employ a workload-splitting scheme that reduces the OpenMP synchronizations to the minimum. The hybrid algorithm shows good scalability with respect to both the problem size and the number of processors for a fixed problem size. For the same number of processors, the hybrid model with 2 OpenMP threads per MPI process is observed to perform better than pure MPI and pure OpenMP on the SGI Origin 2000 and the Intel IA64 Cluster, while the pure MPI model performs the best on the IBM SP3 and on the Compaq Alpha Cluster. A key new result is that the use of threads facilitates effectively p-refinement, which is crucial to adaptive discretization using high-order methods."
}
@Article{AVGer2004,
author = "A. V. Gerbessiotis",
title = "Architecture independent parallel binomial tree option price valuations",
journal = "Parallel Computing",
volume = "30",
number = "2",
pages = "301--316",
month = FEB,
year = "2004",
abstract = "We introduce an architecture independent approach in describing how computations such as those involved in American or European-style option price valuations can be performed in parallel under the binomial tree model. We describe a latency-tolerant parallel algorithm for the multiplicative binomial tree option pricing model. The algorithm is described and analyzed in an architecture independent setting and performance characteristics are expressed in terms of problem size n, the time horizon, and the parameters p, L and g of the bulk-synchronous parallel model of computation. The algorithm achieves optimal theoretical speedup and is within a 1 + o(1) multiplicative factor of the corresponding sequential method. An experimental study of an implementation of the algorithm on a cluster of PC workstations is also undertaken to examine the latency-tolerance of our approach. The implementation with only a recompilation of the same source code works under two diverse parallel programming libraries namely, MPI and BSPlib, thus making it not only architecture but also communication library independent."
}
@Article{VBlan2004,
author = "V. Blanco and P. Gonzalez and J. C. Cabaleiro and D. B. Heras and T. F. Pena and Pombo and others",
title = "Performance prediction for parallel iterative solvers",
journal = "Journal of Supercomputing",
volume = "28",
number = "2",
pages = "177--191",
month = MAY,
year = "2004",
abstract = "In this paper, an exhaustive parallel library of sparse iterative methods and preconditioners in HPF and MPI was developed, and a model for predicting the performance of these codes is presented. This model can be used both by users and by library developers to optimize the efficiency of the codes, as well as to simplify their use. The information offered by this model combines theoretical features of the methods and preconditioners in addition to certain practical considerations and predictions about aspects of the performance of their execution in distributed memory multiprocessors."
}

@Article{SSVad2004,
author = "S. S. Vadhiyar and G. E. Fagg and J. J. Dongarra",
title = "Towards an accurate model for collective communications",
journal = "International Journal of High Performance Computing Applications",
volume = "18",
number = "1",
pages = "159--167",
month = "Spring",
year = "2004",
abstract = "The performance of the MPI's collective communications is critical in most MPI-based applications. A general algorithm for a given collective communication operation may not give good performance on all systems due to the differences in architectures, network parameters and the storage capacity of the underlying MPI implementation. Hence, collective communications have to be tuned for the system on which they will be executed. In order to determine the optimum parameters of collective communications on a given system in a time-efficient manner, the collective communications need to be modeled efficiently. In this paper, we discuss various techniques for modeling collective communications."
}

@Article{PAmic2004,
author = "P. Amico and L. Bosi and C. Cattuto and L. Gammaitoni and M. Punturo and Travasso and others",
title = "A computational test facility for distributed analysis of gravitational wave signals",
journal = "Classical and Quantum Gravity",
volume = "21",
number = "5",
pages = "S847--S851",
month = MAR,
year = "2004",
abstract = "In the gravitational wave detector Virgo, the in-time detection of a gravitational wave signal from a coalescing binary stellar system is an intensive computational task. A parallel computing scheme using the message passing interface (MPI) is described. Performance results on a small-scale cluster are reported."
}

@Article{FAcer2004,
author = "F. Acernese and F. Barone and R. De Rosa and A. Eleuteri and L. Milano and Pardi and others",
title = "A multi-standard farm prototype for gravitational wave signal analysis",
journal = "Classical and Quantum Gravity",
volume = "21",
number = "5",
pages = "S837--S842",
month = MAR,
year = "2004",
abstract = "We implemented in Napoli a new general purpose farm prototype for the development and testing of gravitational wave data analysis algorithms. Its main feature is that it allows the users to dynamically change its configuration according to the data analysis tests. In fact, the farm is fully remotely reconfigurable in-time as an MPI farm, a MOSIX farm or a GRID, configurations that may also coexist as independent subsets. Furthermore, the farm uses only standard hardware and software, guaranteeing easy upgrades and direct integration with other farms. In this paper we will describe this facility and further developments."
}

@Article{STomo2004,
author = "S. Tomov and R. Bennett and M. McGuigan and A. Peskin and G. Smith and J. Spiletic",
title = "Application of interactive parallel visualization for commodity-based clusters using visualization {API}s",
journal = "Computers \& Graphics",
volume = "28",
number = "2",
pages = "273--278",
month = APR,
year = "2004",
abstract = {We present an efficient and inexpensive to develop application for interactive high-performance parallel visualization. We extend popular APIs such as Open Inventor and VTK to support commodity-based cluster visualization. Our implementation follows a standard master/slave concept: the general idea is to have a ``Master'' node, which will intercept a sequential graphical user interface and broadcast it to the ``Slave'' nodes. The interactions between the nodes are implemented using MPI. The parallel remote rendering uses Chromium. This paper is mainly the report of our implementation experiences. We present in detail the proposed model and key aspects of its implementation. Also, we present performance measurements, we benchmark and quantitatively demonstrate the dependence of the visualization speed on the data size and the network bandwidth, and we identify the singularities and draw conclusions on Chromium's sort-first rendering architecture. The most original part of this work is the combined use of Open Inventor and Chromium.}
}
@Article{ARMRa2004,
author = "A. R. M. Rao and T. V. S. R. A. Rao and B. Dattaguru",
title = "Comparative efficiencies of three parallel algorithms for nonlinear implicit transient dynamic analysis",
journal = "Sadhana-Academy Proceedings in Engineering Sciences",
volume = "29",
number = "1",
pages = "57--81",
month = FEB,
year = "2004",
abstract = "The work reported in this paper is motivated by the need to develop portable parallel processing algorithms and codes which can run on a variety of hardware platforms without any modifications. The prime aim of the research work reported here is to test the portability of the parallel algorithms and also to study and understand the comparative efficiencies of three parallel algorithms developed for implicit time integration technique. The standard message passing interface (MPI) is used to develop parallel algorithms for computing nonlinear dynamic response of large structures employing implicit time-marching scheme. The parallel algorithms presented in this paper are developed under the broad framework of non-overlapped domain decomposition technique. Numerical studies indicate that the parallel algorithm devised employing the conventional form of Newmark time integration algorithm is faster than the predictor-corrector form. It is also accurate and highly adaptive to fine grain computations. The group implicit algorithm is found to be extremely superior in performance when compared to the other two parallel algorithms. This algorithm is better suited for large size problems on coarse grain environment as the resulting submeshes will obviously be large and thus permit larger time steps without losing accuracy."
}



@Article{VChau2004,
author = "V. Chaudhary and W. L. Hase and H. Jiang and L. Sun and D. Thaker",
title = "Experiments with parallelizing tribology simulations",
journal = "Journal of Supercomputing",
volume = "28",
number = "3",
pages = "323--343",
month = JUN,
year = "2004",
abstract = "Different parallelization methods vary in their system requirements, programming styles, efficiency of exploring parallelism, and the application characteristics they can handle. For different situations, they can exhibit totally different performance gains. This paper compares OpenMP, MPI, and Strings for parallelizing a complicated tribology problem. The problem size and computing infrastructure is changed to assess the impact of this on various parallelization methods. All of them exhibit good performance improvements and it exhibits the necessity and importance of applying parallelization in this field."
}
@Article{MMuri2004,
author = "M. Murillo and X. C. Cai",
title = "A fully implicit parallel algorithm for simulating the non-linear electrical activity of the heart",
journal = "Numerical Linear Algebra with Applications",
volume = "11",
number = "2--3",
pages = "261--277",
month = mar # "--" # apr,
year = "2004",
abstract = "In this paper, we study a fully implicit parallel Newton-Krylov-Schwarz method (NKS) for solving the bidomain equations describing the electrical excitation process of the heart. NKS has been used successfully for many non-linear problems, but this is the first attempt to use this method for the bidomain model which consists of a system of time dependent partial differential equations of mixed type. Our experiments on parallel computers show that the method is scalable and robust with respect to many of the parameters in the bidomain model. In the outer layer of the algorithm, we use a nonlinearly implicit backward Euler method to discretize the time derivative, and the resulting systems of large sparse non-linear equations are solved using an inexact Newton method. The Jacobian system required to solve in each Newton iteration is solved with a GMRES method preconditioned by a new component-wise restricted additive Schwarz preconditioner. The efficiency and robustness of the overall method depend heavily on what preconditioner we use. By comparing several preconditioners, we found our new restricted additive Schwarz method offers the best performance. Our parallel software is developed using the PETSc package of Argonne National Laboratory. Numerical results obtained on an IBM SP will be reported."
}
@Article{EPLia2004,
author = "E. P. Li and H. F. Jin and S. Wang and L. W. Li",
title = "Signal propagation in high speed differential transmission line using parallelized finite-difference time-domain method",
journal = "Journal of Electromagnetic Waves and Applications",
volume = "18",
number = "4",
pages = "437--454",
year = "2004",
abstract = "This paper presents the investigations in the electric performance of differential signalling transmission lines used for high speed integrated circuits (IC's) and boards by using the parallelized finite-difference time-domain (FDTD) method. The FDTD method is firstly parallelized with single-program multiple-data (SPMD) architecture using the MPI protocol and experimentally validated. The key electrical factors, crosstalk, impedance of high-speed differential transmission lines, are simulated and investigated for various configuration using the developed parallelized FDTD code. The discussions presented in this paper shall be used as a guideline for engineers to optimize high-speed circuit designs with differential signaling transmission lines for signal integrity (SI) and electromagnetic compatibility (EMC)."
}

@InProceedings{shir99:mpi-analysis,
author = {Dale Shires and Lori Pollock and Sara Sprenkle},
title = {Program Flow Graph Construction for Static Analysis of {MPI} Programs},
booktitle = {Proceedings of the conference on Parallel and Distributed Processing Techniques and Applications},
pages = {1847--1853},
year = 1999,
month = JUN,
abstract = {The Message Passing Interface (MPI) has been widely used to develop efficient and portable parallel programs for distributed memory multiprocessors and workstation/PC clusters. In this paper, we present an algorithm for building a program flow graph representation of an MPI program. As an extension of the control flow graph representation of sequential codes, this representation provides a basis for important program analyses useful in software testing, debugging, and code optimization.}
}

@TechReport{sie03:mpi-analysis,
author = {Stephen F. Siegel and George Avrunin},
title = {Analysis of {MPI} Programs},
institution = {Department of Computer Science, University of Massachusetts Amherst},
year = 2003,
number = {UM-CS-2003-036},
abstract = {We investigate the application of formal verification techniques to parallel programs that employ the Message Passing Interface (MPI). We develop a formal model of a subset of MPI, and then prove a number of theorems about that model that ameliorate or eliminate altogether the state explosion problem. As an example, we show that if one wishes to verify freedom from deadlock, it suffices to consider only synchronous executions.},
}

@InProceedings{Stellner96,
author = "G. Stellner",
title = "{CoCheck}: Checkpointing and Process Migration for {MPI}",
booktitle = "Proc. 10th Int. Parallel Processing Symp. (IPPS'96) CD-ROM",
publisher = "IEEE",
address = "Honolulu, HI",
month = apr,
year = "1996",
keywords = "Synchronization, Virtual Memory, and Runtime System",
}

@InProceedings{bon03:mpi-other,
  author    = {Dan Bonachea and Jason Duell},
  title     = {Problems with using {MPI} 1.1 and 2.0 as compilation targets for parallel language implementations},
  booktitle = {2nd Workshop on Hardware/Software Support for High Performance Scientific and Engineering Computing, SHPSEC-PACT03},
  note      = {Also to appear in the International Journal on High Performance Computing and Networking (IJHPCN)},
  year      = 2003
}

@Article{DStan2004,
author = "D. Stanescu and D. Ait-Ali-Yahia and W. G. Habashi and M. P. Robichaud",
title = "Spectral element method for linear fan tone noise radiation",
journal = "AIAA Journal",
volume = "42",
number = "4",
pages = "696--705",
month = APR,
year = "2004",
abstract = "A numerical method for prediction of acoustic spinning mode radiation from turbofan inlets is presented. Sound propagation is modeled by the linearized mass conservation equation for irrotational flows and solved in the frequency domain. The mean flow through the inlet is obtained as a solution of the full potential equation. Both the mean-flow and the acoustic problem are approximated by Galerkin projection in spectral element spaces of continuous piecewise polynomials defined on the same grid. The Gauss-Chebyshev-Lobatto points within the elements are generated via transfinite interpolation and CAP projection procedures embedded within the code. The linear algebraic systems obtained are then solved using either direct or sparse iterative solvers based on the message passing interface standard for interprocessor communication. The singularity appearing in the acoustic integrals on the symmetry axis is treated by the use of a collocation operator based on the Gauss-Chebyshev, instead of the Gauss-Chebyshev-Lobatto, points. To eliminate reflections from the radiation boundaries, a novel frequency-domain formulation of the matched-layer technique, wherein waves entering the layer are exponentially damped, is proposed. The overall computing procedure is first validated on a tone radiation problem from a semi-infinite cylinder and then applied to an experimental JT15D turbofan inlet setup."
}

@Article{DIron2004,
author = "D. Irony and G. Shklarski and S. Toledo",
title = "Parallel and fully recursive multifrontal sparse {Cholesky}",
journal = "Future Generation Computer Systems",
volume = "20",
number = "3",
pages = "425--440",
month = APR,
year = "2004",
abstract = "We describe the design, implementation, and performance of a new parallel sparse Cholesky factorization code. The code uses a multifrontal factorization strategy. Operations on small dense submatrices are performed using new dense matrix subroutines that are part of the code, although the code can also use the BLAS and LAPACK. The new code is recursive at both the sparse and the dense levels, it uses a novel recursive data layout for dense submatrices, and it is parallelized using Cilk, an extension of C specifically designed to parallelize recursive codes. We demonstrate that the new code performs well and scales well on SMPs. In particular, on up to 16 processors, the code outperforms two state-of-the-art message-passing codes. The scalability and high performance that the code achieves imply that recursive schedules, blocked data layouts, and dynamic scheduling are effective in the implementation of sparse factorization codes."
}

@Article{FGarc2004,
author = "F. Garcia-Carballeira and J. Carretero and A. Calderon and J. M. Perez and J. D. Garcia",
title = "An adaptive cache coherence protocol specification for parallel input/output systems",
journal = "IEEE Transactions on Parallel and Distributed Systems",
volume = "15",
number = "6",
pages = "533--545",
month = JUN,
year = "2004",
abstract = "Caching has been intensively used in memory and traditional file systems to improve system performance. However, the use of caching in parallel file systems and I/O libraries has been limited to I/O nodes to avoid cache coherence problems. In this paper, we specify an adaptive cache coherence protocol very suitable for parallel file systems and parallel I/O libraries. This model exploits the use of caching, both at processing and I/O nodes, providing performance increase mechanisms as aggressive prefetching and delayed-write techniques. The cache coherence problem is solved by using a dynamic scheme of cache coherence protocols with different sizes and shapes of granularity. The proposed model is very appropriate for parallel I/O interfaces, as MPI-IO. Performance results, obtained on an IBM SP2, are presented to demonstrate the advantages offered by the cache management methods proposed."
}

@Article{CBoer2004,
author = "C. Boeres and V. E. F. Rebello",
title = "{EasyGrid}: towards a framework for the automatic {Grid} enabling of legacy {MPI} applications",
journal = "Concurrency and Computation-Practice \& Experience",
volume = "16",
number = "5",
pages = "425--432",
month = APR,
year = "2004",
abstract = "One of the goals of the Grid is to aggregate collections of shared, heterogeneous, and distributed resources to provide computational 'power' to parallel applications. However, designing applications capable of exploiting this potential with ease remains a challenge. This paper outlines the EasyGrid methodology for the efficient and robust execution of (legacy) MPI programs across distributed computing clusters. The principal objective of this work is to identify the application-oriented middleware necessary for, as well as to develop a framework to automatically generate, system-aware applications capable of executing in dynamic, unstable, distributed environments such as computational Grids."
}

@Article{BButr2004,
author = "B. Butrylo and C. Vollaire and L. Nicolas and A. Nicolas",
title = "Numerical performance of the distributed vector finite-element time-domain algorithm",
journal = "IEEE Transactions on Magnetics",
volume = "40",
number = "2",
pages = "997--1000",
month = MAR,
year = "2004",
abstract = "This paper deals with a distributed time-domain modeling of electromagnetic phenomena with the finite-element method. The model is approximated by edge elements. The constitutive equations and method of parallelization of the algorithm are presented. The properties of the distributed finite-element time-domain algorithm are discussed. Some typical performance metrics are studied for the parallel versions of the software. The presented algorithm is executed on a heterogeneous and a homogeneous clusters of workstations. Two different distributed memory environments (MPI and PVM) are used to evaluate the efficiency of the algorithm."
}

@Article{THana2004,
author = "T. Hanawa and S. Ikuno and A. Kamitani",
title = "Application of parallelized multigrid method to solution of {MHD} equilibrium with {MPI}",
journal = "IEEE Transactions on Magnetics",
volume = "40",
number = "2",
pages = "1005--1008",
month = MAR,
year = "2004",
abstract = "The potential of applying the multigrid method (MGM) to magnetohydrodynamics (MHD) equilibrium analysis is investigated. The nonlinear eigenvalue problem often appears when the MHD equilibria are determined by solving the Grad-Shafranov equation numerically. After linearizing the equation, the problem is solved using an iterative procedure. Although the SOR method or the Gauss-Seidel method is often used for the solution of the linearized equation, it takes much CPU time to solve the problem. We introduced the use of MGM instead of the conventional method for solving the linear equation. The parallel processing by using message passing interface (MPI) on a PC cluster is adopted for implementation of the MGM to achieve higher performance."
}

@Article{KTagu2004,
author = "K. Taguchi and M. Uchiya and T. Kashiwa and K. Hirayama and H. Kuribayashi and S. Komatsu",
title = "{FDTD} large-scale parallel supercomputing and its application to the analysis of radiation characteristics of an antenna mounted on a vehicle",
journal = "International Journal of RF and Microwave Computer-Aided Engineering",
volume = "14",
number = "3",
pages = "253--261",
month = MAY,
year = "2004",
abstract = "Given the remarkable advances in supercomputers, large-scale electromagnetic-field analyses are becoming possible by FDTD parallel computation. In this study, a versatile FDTD parallel computation system is developed by using Fortran90 and the MPI library. The system uses dynamic memory allocation, which provides a more versatile system and more efficient use of memory. Using the system, we analyze, for the first time, the radiation characteristics of an antenna mounted on a realistic vehicle model."
}

@Article{YMizu2004,
author = "Y. Mizutani and F. Ino and K. Hagihara",
title = "Evaluation of performance prediction method for master/slave parallel programs",
journal = "IEICE Transactions on Information and Systems",
volume = "E87D",
number = "4",
pages = "967--975",
month = APR,
year = "2004",
abstract = "This paper describes the design and implementation of a testbed for predicting master/slave (M/S) programs written using Message Passing Interface (MPI) programs. The testbed, named M/S Emulator (MSE), aims at assisting developers in evaluating the performance of M/S programs and dynamic load-balancing strategies on clusters of PCs. In order to realize this, MSE predicts the communication time by using a realistic parallel computational model, an extension of the LogGPS model. This extended model improves the prediction accuracy on a large number of processors, because it captures the master's bottleneck: the overhead required for retrieving arrival messages from the slaves. Current MSE also employs a best effort emulation method for predicting the calculation time. In our experiments, MSE demonstrated an accurate prediction on clusters, especially on a larger number of nodes. Therefore, we believe that our extended model enables us to analyze the scalability of the M/S program performance."
}

@Article{JMOrd2004,
author = "J. M. Orduna and F. Silla and J. Duato",
title = "On the development of a communication-aware task mapping technique",
journal = "Journal of Systems Architecture",
volume = "50",
number = "4",
pages = "207--220",
month = MAR,
year = "2004",
abstract = "In this paper, we propose a communication-aware mapping technique that tries to match as well as possible the existing network resources to the communication requirements of the applications running on the system. Also, we evaluate the mapping technique using real MPI application traces with timestamps. Evaluation results show that the use of the proposed mapping technique better exploits the available network bandwidth, improving load balancing and increasing the throughput that can be delivered by the network. Therefore, the proposed technique can be used in the design of communication-aware scheduling strategies for those situations where the communication requirements lead the network bandwidth to become the system performance bottleneck."
}

@Article{YKSuh2004,
author = "Y. K. Suh and J. H. Park and S. K. Kim and Y. R. Son",
title = "Study on the periodic flows in a rectangular container under a background rotation",
journal = "KSME International Journal",
volume = "18",
number = "4",
pages = "671--680",
month = APR,
year = "2004",
abstract = "We present numerical and experimental results of the periodic flows inside a rectangular container under a background rotation. In numerical computation, a parallel-computation technique with MPI is implemented. Flow visualization and PIV measurement are also performed to obtain velocity fields at the free surface. Through a series of numerical and experimental works, we aim to clarify the fundamental reasons of discrepancy between the two-dimensional computation and the experimental measurement, which was detected in the previous study for the same flow model. Specifically, we check if the various assumptions prerequisite for the validity of the classical Ekman pumping law are satisfied for periodic flows under a background rotation."
}

@Article{MYHaa2004,
author = "M. Y. Ha and J. G. Kim",
title = "Numerical simulation of natural convection in annuli with internal fins",
journal = "KSME International Journal",
volume = "18",
number = "4",
pages = "718--730",
month = APR,
year = "2004",
abstract = "The solution for the natural convection in internally finned horizontal annuli is obtained by using a numerical simulation of time-dependent and two-dimensional governing equations. The fins existing in annuli influence the flow pattern, temperature distribution and heat transfer rate. The variations of the fin configuration suppress or accelerate the free convective effects compared to those of the smooth tubes. The effects of fin configuration, number of fins and ratio of annulus gap width to the inner cylinder radius on the fluid flow and heat transfer in annuli are demonstrated by the distribution of the velocity vector, isotherms and streamlines. The governing equations are solved efficiently by using a parallel implementation. The technique is adopted for reduction of the computation cost. The parallelization is performed with the domain decomposition technique and message passing between sub-domains on the basis of the MPI library. The results from parallel computation reveal in consistency with those of the sequential program. Moreover, the speed-up ratio shows linearity with the number of processor."
}

@Article{JEbed2004,
author = "J. Ebedes and A. Datta",
title = "Multiple sequence alignment in parallel on a workstation cluster",
journal = "Bioinformatics",
volume = "20",
number = "7",
pages = "1193--1195",
month = MAY,
year = "2004",
abstract = "Multiple sequence alignment is the NP-hard problem of aligning three or more DNA or amino acid sequences in an optimal way so as to match as many characters as possible from the set of sequences. The popular sequence alignment program ClustalW uses the classical method of approximating a sequence alignment, by first computing a distance matrix and then constructing a guide tree to show the evolutionary relationship of the sequences. We show that parallelizing the ClustalW algorithm can result in significant speedup. We used a cluster of workstations using C and message passing interface for our implementation. Experimental results show that speedup of over 5.5 on six processors is obtainable for most inputs."
}

@Article{FVThe2004,
author = "F. V. Theos and I. E. Lagaris and D. G. Papageorgiou",
title = "{PANMIN}: sequential and parallel global optimization procedures with a variety of options for the local search strategy",
journal = "Computer Physics Communications",
volume = "159",
number = "1",
pages = "63--69",
month = MAY,
year = "2004",
abstract = "Title of program: PANMIN Catalogue identifier: ADSU Program summary URL: http://cpc.cs.qub.ac.uk/summaries/ADSU Program obtainable from: CPC Program Library, Queen's University of Belfast, N. Ireland Computer for which the program is designed and others on which it has been tested: PANMIN is designed for UNIX machines. The parallel code runs on either shared memory architectures or on a distributed system. The code has been tested on a SUN Microsystems ENTERPRISE 450 with four CPUs, and on a 48-node cluster under Linux, with both the GNU g77 and the Portland group compilers. The parallel implementation is based on MPI and has been tested with LAM MPI and MPICH Installation. University of Ioannina, Greece Programming language used: Fortran-77 Memory required to execute with typical data: Approximately O(n(2)) words, where n is the number of variables No. of bits in a word: 64 No. of processors used: 1 or many Has the code been vectorised or parallelized?: Parallelized using MPI No. of bytes in distributed program, including test data, etc.: 147163 No. of lines in distributed program, including the test data, etc.: 14366 Distribution format: gzipped tar file."
}

@Article{JPein2004,
  author   = {J. Peinado and A. M. Vidal},
  title    = {A parallel {B}royden approach to the {T}oeplitz inverse eigenproblem},
  journal  = {Concurrency and Computation-Practice \& Experience},
  year     = 2004,
  month    = MAY,
  volume   = 16,
  number   = 6,
  pages    = {587--610},
  abstract = {In this work we show a portable sequential and a portable parallel algorithm for solving the inverse eigenproblem for real symmetric Toeplitz matrices. Both algorithms are based on Broyden's method for solving nonlinear systems. We reduced the computational cost for some problem sizes, and furthermore we managed to reduce spatial cost considerably, compared in both cases with parallel algorithms proposed by other authors and by us, although sometimes quasi-Newton methods (as Broyden) do not reach convergence in all the test cases. We have implemented the parallel algorithm using the parallel numerical linear algebra library SCALAPACK based on the MPI environment. Experimental results have been obtained using two different architectures: a shared memory multiprocessor, the SGI PowerChallenge, and a cluster of Pentium II PCs connected through a myrinet network. The algorithms obtained are scalable in all the cases. }
}

@Article{MCole2004,
author = "M. Cole",
title = "Bringing skeletons out of the closet: a pragmatic manifesto for skeletal parallel programming",
journal = "Parallel Computing",
volume = "30",
number = "3",
pages = "389--406",
month = MAR,
year = "2004",
abstract = "Skeleton and pattern based parallel programming promise significant benefits but remain absent from mainstream practice. We consider why this situation has arisen and propose a number of design principles which may help to redress it. We sketch the eSkel library, which represents a concrete attempt to apply these principles. eSkel is based on C and MPI, thereby embedding its skeletons in a conceptually familiar framework. We present an application of eSkel and analyse it as a response to our manifesto."
}

% Key carries a "b" suffix because another entry in this file already uses the
% key BButr2004 (the IEEE Transactions on Magnetics paper); BibTeX rejects
% repeated keys. The empty month field was dropped rather than guessed.
@Article{BButr2004b,
author = "B. Butrylo and F. Musy and L. Nicolas and R. Perrussel and R. Scorretti and C. Vollaire",
title = "A survey of parallel solvers for the finite element method in computational electromagnetics",
journal = "COMPEL-The International Journal for Computation and Mathematics in Electrical and Electronic Engineering",
volume = "23",
number = "2",
pages = "531--546",
year = "2004",
abstract = "This paper presents new trends in parallel methods used to solve finite element matrix systems: standard iterative and direct solving methods first, and then domain decomposition methods. For example, the current status and properties of two prevailing programming environments (PVM and MPI) are finally given and compared when implemented together with a finite element time domain formulation."
}
@Article{SCaho2004,
author = "S. Cahon and N. Melab and E. G. Talbi",
title = "{ParadisEO}: {A} framework for the reusable design of parallel and distributed metaheuristics",
journal = "Journal of Heuristics",
volume = "10",
number = "3",
pages = "357--380",
month = MAY,
year = "2004",
abstract = "In this paper, we present the ParadisEO white-box object-oriented framework dedicated to the reusable design of parallel and distributed metaheuristics (PDM). ParadisEO provides a broad range of features including evolutionary algorithms (EA), local searches (LS), the most common parallel and distributed models and hybridization mechanisms, etc. This high content and utility encourages its use at European level. ParadisEO is based on a clear conceptual separation of the solution methods from the problems they are intended to solve. This separation confers to the user a maximum code and design reuse. Furthermore, the fine-grained nature of the classes provided by the framework allow a higher flexibility compared to other frameworks. ParadisEO is of the rare frameworks that provide the most common parallel and distributed models. Their implementation is portable on distributed-memory machines as well as on shared-memory multiprocessors, as it uses standard libraries such as MPI, PVM and PThreads. The models can be exploited in a transparent way, one has just to instantiate their associated provided classes. Their experimentation on the radio network design real-world application demonstrate their efficiency."
}

@Article{YJWan2004,
  author   = {Y. J. Wang and X. C. Nie and L. W. Li and E. P. Li},
  title    = {A parallel analysis of the scattering from inhomogeneous dielectric bodies by the volume integral equation and the precorrected-{FFT} algorithm},
  journal  = {Microwave and Optical Technology Letters},
  year     = 2004,
  month    = JUL,
  volume   = 42,
  number   = 1,
  pages    = {77--79},
  abstract = {In this paper, a parallel implementation of the precorrected fast Fourier transform (FFT) algorithm is presented to efficiently solve the volume-integral equation for scattering from inhomogeneous dielectric objects. Several examples are given to demonstrate the efficiency and correctness of the message-passing interface (MPI)-based parallelization algorithm.}
}

@Article{UTrem2004,
author = "U. Tremel and F. Deister and O. Hassan and N. P. Weatherill",
title = "Automatic unstructured surface mesh generation for complex configurations",
journal = "International Journal for Numerical Methods in Fluids",
volume = "45",
number = "4",
pages = "341--364",
month = JUN,
year = "2004",
abstract = "In this paper a new object-oriented (OO) approach is presented for automatic parallel advancing front based surface mesh generation and adaptive remeshing for complex configurations. Based on the ST++-system the advantages of the OO design and implementation compared to the traditional structural approach are described. Algorithmic enhancements to the advancing front method are explained, enabling a robust NURBS based triangulation process directly on B-rep CAD data. The message passing (MPI) parallelization strategy together with the achievable performance improvements are demonstrated. With the outlined parallel geometry analysis/rasterization a powerful method is described to derive automatically a well suited mesh size specification without any user-interaction from scratch. The application of this method to a complex 'real world' example finishes this paper."
}
@Article{JXLiu2004,
author = "J. X. Liu and J. S. Wu and D. K. Panda",
title = "High performance {RDMA}-based {MPI} implementation over {InfiniBand}",
journal = "International Journal of Parallel Programming",
volume = "32",
number = "3",
pages = "167--198",
month = MAY,
year = "2004",
abstract = "Although InfiniBand Architecture is relatively new in the high performance computing area, it offers many features which help us to improve the performance of communication subsystems. One of these features is Remote Direct Memory Access (RDMA) operations. In this paper, we propose a new design of MPI over InfiniBand which brings the benefit of RDMA to not only large messages, but also small and control messages. We also achieve better scalability by exploiting application communication pattern and combining send/receive operations with RDMA operations. Our RDMA-based MPI implementation achieves a latency of 6.8 $\mu$sec for small messages and a peak bandwidth of 871 million bytes/sec. Performance evaluation shows that for small messages, our RDMA-based design can reduce the latency by 24\%, increase the bandwidth by over 104\%, and reduce the host overhead by up to 22\% compared with the original design. For large data transfers, we improve performance by reducing the time for transferring control messages. We have also shown that our new design is beneficial to MPI collective communication and NAS Parallel Benchmarks."
}
% NOTE(review): in this abstract "MPI" abbreviates message-passing iterative
% decoding, not the Message Passing Interface; confirm that this entry belongs
% in a list of papers using MPI.
@Article{ISaso2004,
author = "I. Sason and R. Urbanke",
title = "Complexity versus performance of capacity-achieving irregular repeat-accumulate codes on the binary erasure channel",
journal = "IEEE Transactions on Information Theory",
volume = "50",
number = "6",
pages = "1247--1256",
month = JUN,
year = "2004",
abstract = "We derive upper and lower bounds on the encoding and decoding complexity of two capacity-achieving ensembles of irregular repeat-accumulate (IRA1 and IRA2) codes on the binary erasure channel (BEC). These bounds are expressed in terms of the gap between the channel capacity and the rate of a typical code from this ensemble for which reliable communications is achievable under message-passing iterative (MPI) decoding. The complexity of the ensemble of IRA1 codes grows like the negative logarithm of the gap to capacity. On the other hand, the complexity of the ensemble of IRA2 codes with any choice of the degree distribution grows at least like the inverse square root of the gap to capacity, and at most like the inverse of the gap to capacity."
}
@Article{ATezu2004,
author = "A. Tezuka and J. Matsumoto and K. Matsubara",
title = "Development of common software platform on parallel computations for discretized numerical schemes and its application to finite element fluid dynamics",
journal = "International Journal of Computational Fluid Dynamics",
volume = "18",
number = "4",
pages = "347--354",
month = MAY,
year = "2004",
abstract = "It is a time consuming and very skillful task for researchers or developers on computational mechanics to modify a program for a single processor to the one for parallel computation. This is a serious bottleneck for parallel computation, even though general-purpose parallel computational library such as MPI is applied in his modification. We developed a parallel matrix solver platform, called 'Parallel Computing Platform/PCP', based on domain decomposition scheme for various numerical schemes such as finite element method (FEM), finite difference method (FDM) and finite volume method (FVM) to accelerate a smooth shift to parallel computational world. Some parallel software such as PETSc, Aztec, GEOFEM and ADVENTURE had been developed, however, these are for professionals in parallel computations and not valid for our purpose. In our platform, what a user should do is just to call the platform at the stage of stiffness matrix calculation. GMRES and Bi-CGSTAB with some pre-conditioners are used as a basic matrix solver. The option of Lagrange-multiplier is also attached. For the partitioning, a fast graph generator for arbitrary elements and the interface with MeTis are equipped. Our platform is valid for a variety of hardware, including single processor based WS, by exchanging Makefile.in. The effectiveness of our platform is evaluated with several examples in finite element fluid dynamics."
}
% The month is given as literal text: this is a seasonal ("Summer") issue, and
% BibTeX predefines only the twelve month macros jan..dec.
@Article{DJKer2004,
author = "D. J. Kerbyson and A. Hoisie and S. Pakin and F. Petrini and H. J. Wasserman",
title = "A performance evaluation of an {Alpha} {EV7} processing node",
journal = "International Journal of High Performance Computing Applications",
volume = "18",
number = "2",
pages = "199--209",
month = "Summer",
year = "2004",
abstract = "In this paper we detail the performance of a new AlphaServer node containing 16 Alpha EV7 CPUs. The EV7 processor is based on the EV68 processor core that is used in terascale systems at Los Alamos National Laboratory and the Pittsburgh Supercomputing Center. The EV68 processor core is supplemented with six-way router circuitry that forms connections from the processor internals to four neighboring CPUs in a two-dimensional torus, to a I/O controller and to local memory. The performance evaluation presented in this paper considers memory hierarchy, intra-node MPI communication, and also the performance of a number of complete applications. The measurements are compared with those taken on existing AlphaServer machines. It is clear from our analysis that the superior application performance of the EV7 relative to a similar speed EV68 is attributable to its excellent main memory bandwidth - over 4 GB/s."
}
@Article{SSrin2004,
author = "S. Srinivasan and R. S. Miller and E. Marotta",
title = "Parallel computation of the {B}oltzmann transport equation for microscale heat transfer in multilayered thin films",
journal = "Numerical Heat Transfer Part B-Fundamentals",
volume = "46",
nu