papers.bib

@misc{june14mpi,
  author = {Balaji, Pavan and Thakur, Rajeev and Bland, Wesley and Raffenetti, Ken and Zhao, Xin},
  title = {Parallel Programming with {MPI}},
  month = jun,
  year = 2014,
  note = {{Argonne National Lab full day tutorial on MPI}},
  url = {http://www.mcs.anl.gov/~balaji/tutorials/argonne14_mpi.php}
}
@misc{lbl_mar_14,
  author = {Bland, Wesley},
  title = {Fault Tolerant Runtime Research @ {ANL}},
  month = mar,
  year = 2014,
  note = {{Lawrence Berkeley Laboratory Visit}},
  url = {http://www.mcs.anl.gov/~wbland/slides/2014-03-04_LBL_Visit.pptx}
}
@misc{llnl_feb_14,
  author = {Bland, Wesley},
  title = {Proposed Fault Tolerance for {MPI}-4},
  month = feb,
  year = 2014,
  note = {{Lawrence Livermore Laboratory Visit}},
  url = {http://www.mcs.anl.gov/~wbland/slides/2014-02-10_LLNL_MPI4FT_Presentation.pdf}
}
@inproceedings{ppopp14,
  author = {Yang, Chaoran and Bland, Wesley and Mellor-Crummey, John and Balaji, Pavan},
  title = {Portable, {MPI}-interoperable {Coarray Fortran}},
  booktitle = {Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series = {PPoPP '14},
  year = {2014},
  isbn = {978-1-4503-2656-8},
  location = {Orlando, Florida, USA},
  pages = {81--92},
  numpages = {12},
  url = {http://doi.acm.org/10.1145/2555243.2555270},
  doi = {10.1145/2555243.2555270},
  acmid = {2555270},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {MPI, PGAS, coarray fortran, interoperability}
}
@article{Bland01082013,
  author = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
  title = {Post-failure recovery of {MPI} communication capability: Design and rationale},
  volume = {27},
  number = {3},
  pages = {244--254},
  year = {2013},
  doi = {10.1177/1094342013488238},
  url = {http://hpc.sagepub.com/content/27/3/244.abstract},
  eprint = {http://hpc.sagepub.com/content/27/3/244.full.pdf+html},
  journal = {International Journal of High Performance Computing Applications}
}
@misc{jlpc13,
  author = {Bland, Wesley},
  title = {Fault Tolerant Runtime Research @ {ANL}},
  month = nov,
  year = 2013,
  note = {{10th Joint Laboratory for Petascale Computing Workshop}},
  url = {http://www.mcs.anl.gov/~wbland/slides/jlpc13.pdf}
}
@misc{resilience12,
  author = {Bland, Wesley},
  title = {User Level Failure Mitigation in {MPI}},
  month = aug,
  year = 2012,
  note = {{Resilience Workshop co-located with Euro-Par}},
  url = {http://www.mcs.anl.gov/~wbland/slides/Resilience12.pdf}
}
@article{CPE:CPE3100,
  author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and
        Herault, Thomas and Bosilca, George and Dongarra, Jack J.},
  title = {Extending the scope of the {Checkpoint-on-Failure} protocol for
        forward recovery in standard {MPI}},
  journal = {Concurrency and Computation: Practice and Experience},
  issn = {1532-0634},
  doi = {10.1002/cpe.3100},
  keywords = {fault tolerance, message passing interface, ABFT,
        Checkpoint-on-Failure},
  month = jun,
  year = {2013}
}
@misc{june13mpi,
  author = {Balaji, Pavan and Thakur, Rajeev and Lusk, Rusty and Bland, Wesley},
  title = {Parallel Programming with {MPI}},
  month = jun,
  year = 2013,
  note = {{Argonne National Lab full day tutorial on MPI}},
  url = {http://www.mcs.anl.gov/~balaji/tutorials/argonne13_mpi.php}
}
@article{Bland:2013cj,
  year = {2013},
  issn = {0010-485X},
  journal = {Computing},
  volume = {95},
  number = {12},
  doi = {10.1007/s00607-013-0331-3},
  title = {An evaluation of {User-Level Failure Mitigation} support in {MPI}},
  url = {http://dx.doi.org/10.1007/s00607-013-0331-3},
  publisher = {Springer Vienna},
  keywords = {MPI; Fault tolerance; User-level fault mitigation; 68M14;
        68M15},
  author = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and
        Hursey, Joshua and Bosilca, George and Dongarra, Jack J.},
  pages = {1171--1184}
}
% NOTE(review): `pdf` and `http` are not standard BibTeX fields; they are kept
% unchanged in case a site generator or custom style reads them (confirm).
% `url` repeats the `http` value so standard styles can print the link.
@phdthesis{blandthesis,
  author = {Bland, Wesley},
  title = {Toward Message Passing Failure Management},
  school = {University of Tennessee, Knoxville},
  year = 2013,
  url = {http://trace.tennessee.edu/utk_graddiss/1695/},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2013_Toward_Message_Passing_Failure_Management.pdf},
  http = {http://trace.tennessee.edu/utk_graddiss/1695/}
}
@incollection{Bland:Euro-Par:Resilience:2012,
  year = {2013},
  booktitle = {Euro-Par 2012: Parallel Processing Workshops},
  volume = {7640},
  series = {Lecture Notes in Computer Science},
  editor = {Caragiannis, Ioannis and Alexander, Michael and Badia, Rosa Maria
        and Cannataro, Mario and Costan, Alexandru and Danelutto, Marco and
        Desprez, Frederic and Krammer, Bettina and Sahuquillo, Julio and
        Scott, Stephen L. and Weidendorfer, Josef},
  title = {User Level Failure Mitigation in {MPI}},
  publisher = {Springer Berlin Heidelberg},
  author = {Bland, Wesley},
  pages = {499--504},
  pdf = {http://www.mcs.anl.gov/~wbland/slides/Resilience12.pdf},
  doi = {10.1007/978-3-642-36949-0_57}
}
@incollection{Bland:EuroMPI:2012,
  year = {2012},
  month = sep,
  booktitle = {Recent Advances in the Message Passing Interface},
  volume = {7490},
  series = {Lecture Notes in Computer Science},
  editor = {Traff, Jesper Larsson and Benkner, Siegfried and
        Dongarra, Jack J.},
  title = {An Evaluation of User-Level Failure Mitigation Support in {MPI}},
  publisher = {Springer Berlin Heidelberg},
  author = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and
        Hursey, Joshua and Bosilca, George and Dongarra, Jack J.},
  pages = {193--203},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2012_An_Evaluation_of_User-Level_Failure_Mitigation_Support_in_MPI.pdf},
  doi = {10.1007/978-3-642-33518-1_24}
}
@incollection{Bland:Euro-Par:2012,
  year = {2012},
  month = aug,
  booktitle = {Euro-Par 2012 Parallel Processing},
  volume = {7484},
  series = {Lecture Notes in Computer Science},
  editor = {Kaklamanis, Christos and Papatheodorou, Theodore and
        Spirakis, Paul G.},
  title = {A Checkpoint-on-Failure Protocol for Algorithm-Based Recovery
        in Standard {MPI}},
  publisher = {Springer Berlin Heidelberg},
  author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and
        Herault, Thomas and Bosilca, George and Dongarra, Jack},
  pages = {477--488},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2012_A_checkpoint-on-failure_protocol_for_algorithm-based_recovery_in_standard_MPI.pdf},
  doi = {10.1007/978-3-642-32820-6_48}
}
@inproceedings{Bland:2012:EAR:2310096.2310161,
  author = {Bland, Wesley},
  title = {Enabling Application Resilience with and Without the {MPI}
        Standard},
  booktitle = {Proceedings of the 2012 12th IEEE/ACM International
        Symposium on Cluster, Cloud and Grid Computing ({CCGrid} 2012)},
  series = {CCGRID '12},
  year = {2012},
  month = may,
  pages = {746--751},
  numpages = {6},
  acmid = {2310161},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
  keywords = {Fault Tolerance, Message Passing Interface,
        Distributed Runtime},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2012_Enabling_Application_Resilience_with_and_without_the_MPI_Standard.pdf},
  doi = {10.1109/CCGrid.2012.25}
}
@techreport{bland2012proposal,
  title = {A proposal for {User-Level Failure Mitigation} in the {MPI-3}
        Standard},
  author = {Bland, Wesley and Bosilca, George and Bouteiller, Aurelien and
        Herault, Thomas and Dongarra, Jack},
  year = {2012},
  institution = {Department of Electrical Engineering and
        Computer Science, University of Tennessee},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2012_A_proposal_for_User-Level_Failure_Mitigation_in_the_MPI-3_standard.pdf}
}
@inproceedings{Naughton:2009:FIF:1552526.1552530,
  author = {Naughton, Thomas and Bland, Wesley and Vallee, Geoffroy and
        Engelmann, Christian and Scott, Stephen L.},
  title = {Fault Injection Framework for System Resilience Evaluation:
        Fake Faults for Finding Future Failures},
  booktitle = {Proceedings of the 2009 Workshop on Resiliency in High
        Performance Computing},
  series = {Resilience '09},
  year = {2009},
  location = {Garching, Germany},
  pages = {23--28},
  numpages = {6},
  acmid = {1552530},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {fault injection, resilience},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Naughton_2009_Fault_injection_framework_for_system_resilience_evaluation_fake_faults_for_finding_future_failures.pdf},
  doi = {10.1145/1552526.1552530}
}
@incollection{Vallee:Virtualization:2008,
  year = {2008},
  booktitle = {Systems and Virtualization Management. Standards and New
        Technologies},
  volume = {18},
  series = {Communications in Computer and Information Science},
  editor = {Boursas, Latifa and Carlson, Mark and Hommel, Wolfgang and
        Sibilla, Michelle and Wold, Kes},
  title = {Virtual System Environments},
  publisher = {Springer Berlin Heidelberg},
  author = {Vallee, Geoffroy and Naughton, Thomas and Ong, Hong and
        Tikotekar, Anand and Engelmann, Christian and Bland, Wesley and
        Aderholdt, Ferrol and Scott, Stephen L.},
  pages = {72--83},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Vallee_2008_Virtual_System_Environments.pdf},
  doi = {10.1007/978-3-540-88708-9_7}
}
@inproceedings{4215574,
  author = {Bland, Wesley and Naughton, Thomas and Vallee, Geoffroy and
        Scott, Stephen L.},
  booktitle = {21st International Symposium on High Performance Computing
        Systems and Applications ({HPCS} 2007)},
  title = {Design and Implementation of a Menu Based {OSCAR} Command Line
        Interface},
  year = {2007},
  pages = {25},
  keywords = {graphical user interfaces;public domain software;workstation
        clusters;HPC clusters;graphical user interface;menu based OSCAR
        command line interface;open source cluster application
        resources;Application software;Automatic
        testing;Automation;Bandwidth;Computer science;Graphical user
        interfaces;High performance
        computing;Laboratories;Mathematics;Utility programs},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Bland_2007_Design_and_Implementation_of_a_Menu_Based_OSCAR_Command_Line_Interface.pdf},
  doi = {10.1109/HPCS.2007.14}
}
@inproceedings{4215575,
  author = {Vallee, Geoffroy and Naughton, Thomas and Bland, Wesley and
        Scott, Stephen L.},
  booktitle = {21st International Symposium on High Performance Computing
        Systems and Applications ({HPCS} 2007)},
  title = {Automatic Testing Tool for {OSCAR} Using System-level
        Virtualization},
  year = {2007},
  pages = {26},
  keywords = {program testing;software quality;software tools;Linux;OSCAR
        command line interface;automatic testing tool;software
        development;software quality;system level virtualization;virtual
        cluster;Automatic testing;Computer
        architecture;Hardware;Linux;Operating systems;Software
        testing;System testing;Utility programs;Virtual machining;Voice
        mail},
  pdf = {http://www.mcs.anl.gov/~wbland/pdf/Vallee_2007_Automatic_Testing_Tool_for_OSCAR_Using_System-level_Virtualization.pdf},
  doi = {10.1109/HPCS.2007.9}
}