% ----------------------------------------------------------------------
% Special Issue on I/O in Parallel Applications
% The International Journal of High Performance Computing Applications,
% volume 12, numbers 3 and 4 (1998).
%
% The month field of these entries holds the issue season as plain text
% rather than a month macro: Fall entries carry number 3 and Winter
% entries carry number 4.
% ----------------------------------------------------------------------

@Article{thakur:applications,
  author   = {Rajeev Thakur and Ewing Lusk and William Gropp},
  title    = {{I/O} in Parallel Applications: The Weakest Link},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Winter},
  volume   = {12},
  number   = {4},
  pages    = {389--395},
  URL      = {http://www.mcs.anl.gov/~thakur/papers/ijsa-article.ps.gz},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {Parallel computers are increasingly being used to run
              large-scale applications that also have huge I/O
              requirements. However, many applications obtain poor I/O
              performance on modern parallel machines. This special issue
              of IJSA contains papers that describe the I/O requirements
              and the techniques used to perform I/O in real parallel
              applications. We first explain how the I/O application
              program interface (API) plays a critical role in enabling
              such applications to achieve high I/O performance. We
              describe how the commonly used Unix I/O interface is
              inappropriate for parallel I/O and how an explicitly
              parallel API with support for collective I/O can help the
              underlying I/O hardware and software perform I/O
              efficiently. We then describe MPI-IO, a recently defined,
              standard, portable API specifically designed for
              high-performance parallel I/O. We conclude with an overview
              of the papers in this special issue.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{oldfield:seismic,
  author   = {Ron A. Oldfield and David E. Womble and Curtis C. Ober},
  title    = {Efficient Parallel {I/O} in Seismic Imaging},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Fall},
  volume   = {12},
  number   = {3},
  pages    = {333--344},
  URL      = {http://www.cs.dartmouth.edu/~raoldfi/ijsa97},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {While high performance computers tend to be measured by
              their processor and communications speeds, the bottleneck
              for many large-scale applications is the I/O performance
              rather than the computational or communication performance.
              One such application is the processing of 3D seismic data.
              Seismic data sets, consisting of recorded pressure waves,
              can be very large, sometimes more than a terabyte in size.
              Even if the computations can be performed in-core, the time
              required to read the initial seismic data and velocity
              model and write images is substantial. This paper will
              discuss our approach in handling the massive I/O
              requirements of seismic processing and show the performance
              of our imaging code (Salvo) on the Intel Paragon.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{lockey:characterization,
  author   = {P. Lockey and R. Proctor and I. D. James},
  title    = {Characterization of {I/O} Requirements in a Massively Parallel Shelf Sea Model},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Fall},
  volume   = {12},
  number   = {3},
  pages    = {320--332},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {It is now recognized that a high level of I/O performance
              is crucial in making effective use of parallel machines for
              many scientific application codes. This paper considers the
              I/O requirements in one particular scientific application
              area; 3D modelling of continental shelf sea regions. We
              identify some of the scientific aims which drive the model
              development, and the consequent impact on the I/O needs. As
              a case study we take a parallel production code running a
              simulation of the North Sea on a Cray~T3D platform and
              investigate the I/O performance in dealing with the
              dominant I/O component; dumping of results data to disk. In
              order to place the performance issues in a more general
              framework we construct a simple theoretical model of I/O
              requirements, and use this to probe the impact of available
              I/O performance on current and proposed scientific
              objectives.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{kandaswamy:hartree-fock,
  author   = {Meenakshi Kandaswamy and Mahmut Kandemir and Alok Choudhary and David Bernholdt},
  title    = {An Experimental Study to Analyze and Optimize {Hartree-Fock} Application's {I/O} with {PASSION}},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Winter},
  volume   = {12},
  number   = {4},
  pages    = {411--439},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {Many scientific applications tend to perform high volume
              data storage, data retrieval and data processing, which
              demands high performance from the I/O subsystem. The focus
              and contribution of this work is to study the I/O behavior
              of the Hartree-Fock method using PASSION. HF's I/O phases
              can contribute up to 62.34\% of the total execution time.
              We reduce the execution time and I/O time up to 54\% and
              6\% respectively of that of the original case through
              PASSION and its optimizations. Additionally, we categorize
              the factors that affect the I/O performance of HF into key
              application-related parameters and key system-related
              parameters. Based on extensive empirical results and within
              our experimental space, we order the parameters according
              to their impact on HF's I/O performance as follows:
              efficient interface, prefetching, buffering, number of I/O
              nodes, striping factor and striping unit. We conclude that
              application-related factors have a more significant effect
              on HF's I/O performance than the system-related factors
              within our experimental space.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{simitci:patterns,
  author   = {Huseyin Simitci and Daniel Reed},
  title    = {A Comparison of Logical and Physical Parallel {I/O} Patterns},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Fall},
  volume   = {12},
  number   = {3},
  pages    = {364--380},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {Although there are several extant studies of parallel
              scientific application request patterns, there is little
              experimental data on the correlation of physical
              input/output patterns with application input/output
              stimuli. To understand these correlations, we have
              instrumented the SCSI device drivers of the Intel Paragon
              OSF/1 operating system to record key physical input/output
              activities and have correlated this data with the
              input/output patterns of scientific applications captured
              via the Pablo analysis toolkit. Our analysis shows that
              disk hardware features profoundly affect the distribution
              of request delays and that current parallel file systems
              respond to parallel application input/output patterns in
              non-scalable ways.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{mackay:groundwater,
  author   = {David Mackay and G. Mahinthakumar and Ed D'Azevedo},
  title    = {A Study of {I/O} in a Parallel Finite Element Groundwater Transport Code},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Fall},
  volume   = {12},
  number   = {3},
  pages    = {307--319},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {A parallel finite element groundwater transport code is
              used to compare three different strategies for performing
              parallel I/O: (1) have a single processor collect data and
              perform sequential I/O in large blocks, (2) use variations
              of vendor specific I/O extensions (3) use the EDONIO I/O
              library. Each processor performs many writes of one to four
              kilobytes to reorganize local data in a global shared file.
              Our findings suggest having a single processor collect data
              and perform large block-contiguous operations may be quite
              efficient and portable for up to 32 processor
              configurations. This approach does not scale well for a
              larger number of processors since the single processor
              becomes a bottleneck for gathering data. The effective
              application I/O rate observed, which includes times for
              opening and closing files, is only a fraction of the peak
              device read/write rates. Some form of data redistribution
              and buffering in remote memory as performed in EDONIO may
              yield significant improvements for non-contiguous data I/O
              access patterns and short requests. Implementors of
              parallel I/O systems may consider some form of buffering as
              performed in EDONIO to speed up such I/O requirements.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{nieplocha:chemio,
  author   = {Jarek Nieplocha and Ian Foster and Rick Kendall},
  title    = {{ChemIO}: High-Performance Parallel {I/O} for Computational Chemistry Applications},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Fall},
  volume   = {12},
  number   = {3},
  pages    = {345--363},
  earlier  = {foster:chemio},
  keyword  = {parallel I/O application, pario-bib},
  abstract = {Recent developments in I/O systems on scalable parallel
              computers have sparked renewed interest in out-of-core
              methods for computational chemistry. These methods can
              improve execution time significantly relative to "direct"
              methods, which perform many redundant computations.
              However, the widespread use of such out-of-core methods
              requires efficient and portable implementations of often
              complex I/O patterns. The ChemIO project has addressed this
              problem by defining an I/O interface that captures the I/O
              patterns found in important computational chemistry
              applications and by providing high-performance
              implementations of this interface on multiple platforms.
              This development not only broadens the user community for
              parallel I/O techniques but also provides new insights into
              the functionality required in general-purpose scalable I/O
              libraries and the techniques required to achieve high
              performance I/O on scalable parallel computers.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}

@Article{davis:rle,
  author   = {G. Davis and L. Lau and R. Young and F. Duncalfe and L. Brebber},
  title    = {Parallel Run-Length Encoding Compression---Reducing {I/O} in Dynamic Environmental Simulations},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {1998},
  month    = {Winter},
  volume   = {12},
  number   = {4},
  pages    = {396--410},
  keyword  = {parallel I/O application, compression, pario-bib},
  abstract = {Dynamic simulations based on time-varying inputs are
              extremely I/O intensive. This is shown by industrial
              applications generating environmental projections based on
              seasonal-to-interannual climate forecasts which have a
              compute to data-access ratio of O(n) leading to significant
              performance degradation. Exploitation of compression
              techniques such as Run-Length-Encoding (RLE) significantly
              reduces the I/O bottleneck and storage requirements.
              Unfortunately, traditional RLE algorithms do not perform
              well in a parallel-vector platform such as the Cray
              architecture. This paper describes the design and
              implementation of a new RLE algorithm based on data
              chunking and packing that exploits the Cray gather-scatter
              vector hardware and multiple processors. This innovative
              approach reduces I/O and file storage requirements on
              average by an order of magnitude. Data intensive
              applications such as the integration of environmental and
              global climate models now become practical in a realistic
              time frame.},
  comment  = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.}
}