from packaging import version
import pytest
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import matplotlib
import darshan
from darshan.log_utils import get_log_path
from darshan.experimental.plots.plot_io_cost import (
get_by_avg_series,
get_io_cost_df,
plot_io_cost,
combine_hdf5_modules,
)
@pytest.mark.parametrize(
    "logname, expected_df",
    [
        (
            "ior_hdf5_example.darshan",
            pd.DataFrame(
                np.array([
                    [0.0196126699, 0.1342029571533203, 0.0074423551, 0.0],
                    [0.0196372866, 0.13425052165985107, 0.0475, 0.0],
                    [0.016869, 0.086689, 0.097160, 0.0],
                    [0.0, 2.5570392608642578e-05, 0.0, 0.0],
                ]),
                ["POSIX", "MPIIO", "HDF5", "STDIO"],
                ["Read", "Write", "Meta", "Wait"],
            ),
        ),
        (
            "sample-badost.darshan",
            pd.DataFrame(
                np.array([
                    [0.0, 33.48587587394286, 0.5547398688504472, 0.0],
                    [0.011203573201783001, 4.632166e-07, 0.135187, 0.0],
                ]),
                ["POSIX", "STDIO"],
                ["Read", "Write", "Meta", "Wait"],
            ),
        ),
        (
            "shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan",
            pd.DataFrame(
                np.array([
                    [0.000378787518, 0.002514898777, 0.000068306923, 0.0],
                    [0.000397562981, 0.002540826797, 0.001559376717, 0.0],
                    [0.000402510166, 0.002579867840, 0.001994967461, 0.0],
                    [0.000000000000, 0.000120997429, 0.000000000000, 0.0],
                ]),
                ["POSIX", "MPIIO", "PNETCDF", "STDIO"],
                ["Read", "Write", "Meta", "Wait"],
            ),
        ),
    ],
)
def test_get_io_cost_df(logname, expected_df):
    """Regression test for ``plot_io_cost.get_io_cost_df()``.

    Checks that the per-module IO cost dataframe produced from a
    known log file matches a hard-coded expected dataframe
    (one row per module, columns Read/Write/Meta/Wait).
    """
    with darshan.DarshanReport(get_log_path(logname)) as report:
        actual_df = get_io_cost_df(report=report)
    assert_frame_equal(actual_df, expected_df)
@pytest.mark.parametrize(
    "logname, expected_ylims", [
        (
            "ior_hdf5_example.darshan",
            [0.0, 1.0],
        ),
        (
            "sample-badost.darshan",
            [0.0, 780.0],
        ),
        (
            "dxt.darshan",
            [0.0, 1469.0],
        ),
        (
            "noposix.darshan",
            [0.0, 39213.0],
        ),
        (
            "noposixopens.darshan",
            [0.0, 1111.0],
        ),
    ],
)
def test_plot_io_cost_ylims(logname, expected_ylims):
    """Test the y limits for both axes of the IO cost stacked bar graph."""
    with darshan.DarshanReport(get_log_path(logname)) as report:
        fig = plot_io_cost(report=report)
    for i, ax in enumerate(fig.axes):
        # there are only 2 axes, the first being the "raw" data
        # and the second being the normalized data (percent)
        actual_ylims = ax.get_ylim()
        if i == 0:
            assert_allclose(actual_ylims, expected_ylims)
        else:
            # normalized data is always the same
            assert_allclose(actual_ylims, [0.0, 100.0])
@pytest.mark.parametrize(
    "logname, expected_yticks", [
        (
            "ior_hdf5_example.darshan",
            [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        ),
        (
            "sample-badost.darshan",
            [0, 156, 312, 468, 624, 780],
        ),
    ],
)
def test_plot_io_cost_y_ticks_and_labels(logname, expected_yticks):
    """Check that y-axis tick marks are at the appropriate locations
    and that the tick labels are as expected, for both axes of the
    IO cost figure.
    """
    # create the expected y-axis tick labels from the y ticks
    expected_yticklabels = [str(i) for i in expected_yticks]
    logpath = get_log_path(logname)
    with darshan.DarshanReport(logpath) as report:
        fig = plot_io_cost(report=report)
    for i, ax in enumerate(fig.axes):
        # there are only 2 axes, the first being the "raw" data
        # and the second being the normalized data (percent)
        actual_yticks = ax.get_yticks()
        yticklabels = ax.get_yticklabels()
        actual_yticklabels = [tl.get_text() for tl in yticklabels]
        if i == 0:
            assert_allclose(actual_yticks, expected_yticks)
            assert_array_equal(actual_yticklabels, expected_yticklabels)
        else:
            # normalized data always has the same 5 tick labels
            assert_array_equal(actual_yticks, [0, 20, 40, 60, 80, 100])
            assert_array_equal(
                actual_yticklabels,
                ["0%", "20%", "40%", "60%", "80%", "100%"],
            )
@pytest.mark.parametrize("mod_key, input_df, expected_series", [
    (
        # generate a dataframe that has easy-to-calculate average values
        # to check if the averages are being calculated appropriately
        "POSIX",
        pd.DataFrame(
            data=[
                [0, 1, 10, 3],
                [12, 5, 20, 3]
            ],
            columns=[
                "POSIX_F_READ_TIME", "POSIX_F_WRITE_TIME",
                "POSIX_F_META_TIME", "TEST"
            ],
        ),
        pd.Series(
            data=[1.2, .6, 3.0, 0.0],
            index=["Read", "Write", "Meta", "Wait"],
        ),
    ),
    (
        # generate a dataframe similar to a shared-record-enabled log
        # where there is a single entry that needs to be divided through
        # by `nprocs`
        "STDIO",
        pd.DataFrame(
            data=[
                [30000, 3000, 300, 10],
            ],
            columns=[
                "STDIO_F_READ_TIME", "STDIO_F_WRITE_TIME",
                "STDIO_F_META_TIME", "TEST"
            ],
        ),
        pd.Series(
            data=[3000.0, 300.0, 30.0, 0.0],
            index=["Read", "Write", "Meta", "Wait"],
        ),
    ),
    (
        # combine 2 previous cases to check if
        # calculations are being done appropriately
        "MPIIO",
        pd.DataFrame(
            data=[
                [0, 1, 10, 3],
                [12, 5, 20, 3],
                [30000, 3000, 300, 10],
            ],
            columns=[
                "MPIIO_F_READ_TIME", "MPIIO_F_WRITE_TIME",
                "MPIIO_F_META_TIME", "TEST"
            ],
        ),
        pd.Series(
            data=[3001.2, 300.6, 33.0, 0.0],
            index=["Read", "Write", "Meta", "Wait"],
        ),
    )
])
def test_get_by_avg_series(mod_key, input_df, expected_series):
    """Unit test for ``plot_io_cost.get_by_avg_series``.

    Feeds synthetic per-record time dataframes through the averaging
    helper and compares against hand-computed expected series.
    """
    actual_series = get_by_avg_series(df=input_df, mod_key=mod_key, nprocs=10)
    assert_series_equal(actual_series, expected_series)
@pytest.mark.parametrize(
    "filename, expected_df",
    [
        (
            "nonmpi_dxt_anonymized.darshan",
            pd.DataFrame(
                np.array([
                    [0.281718, 0.504260, 0.170138, 0.0],
                    [0.232386, 0.165982, 0.072751, 0.0],
                ]),
                ["POSIX", "STDIO"],
                ["Read", "Write", "Meta", "Wait"],
            ),
        ),
    ])
def test_issue_590(filename, expected_df):
    """Regression test for issue #590.

    See: https://github.com/darshan-hpc/darshan/issues/590
    """
    log_path = get_log_path(filename)
    with darshan.DarshanReport(log_path) as report:
        actual_df = get_io_cost_df(report=report)
    assert_frame_equal(actual_df, expected_df)
@pytest.mark.parametrize(
    "input_df, expected_df",
    [
        # if input dataframe does not contain HDF5 module
        # it should remain unchanged
        (
            pd.DataFrame([[10.0, 20.0, 30.0]], index=["POSIX"]),
            pd.DataFrame([[10.0, 20.0, 30.0]], index=["POSIX"]),
        ),
        # for cases where only "H5F" data is present, it should
        # effectively get renamed to "HDF5"
        (
            pd.DataFrame([[10.0, 20.0, 30.0], [0.1, 0.2, 0.3]], index=["POSIX", "H5F"]),
            pd.DataFrame([[10.0, 20.0, 30.0], [0.1, 0.2, 0.3]], index=["POSIX", "HDF5"]),
        ),
        # for cases with both HDF5 modules, the resultant HDF5 entry
        # should be the sum of the `H5F` and `H5D` rows
        (
            pd.DataFrame(
                [
                    [10.0, 20.0, 30.0],
                    [0.1, 0.2, 0.3],
                    [0.9, 0.8, 0.7],
                ],
                index=["POSIX", "H5F", "H5D"],
            ),
            pd.DataFrame([[10.0, 20.0, 30.0], [1.0, 1.0, 1.0]], index=["POSIX", "HDF5"]),
        ),
    ])
def test_combine_hdf5_modules(input_df, expected_df):
    """Unit test for ``plot_io_cost.combine_hdf5_modules()``.

    Verifies that H5F/H5D rows are collapsed into a single HDF5 row
    (renamed when only one is present, summed when both are), and that
    dataframes without HDF5 modules pass through unchanged.
    """
    # add the proper column names to the input and expected dataframes
    for df in (input_df, expected_df):
        df.columns = ["Read", "Write", "Meta"]
    actual_df = combine_hdf5_modules(input_df)
    # check actual and expected dataframes are identical
    assert_frame_equal(actual_df, expected_df)
@pytest.mark.parametrize(
    "logname, expected_xticks, expected_xlabels", [
        (
            "shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan",
            range(4),
            ["POSIX", "MPIIO", "PNETCDF", "STDIO"]
        ),
        (
            "imbalanced-io.darshan",
            range(3),
            ["POSIX", "MPIIO", "STDIO"]
        ),
    ],
)
def test_plot_io_cost_x_ticks_and_labels(logname,
                                         expected_xticks,
                                         expected_xlabels):
    """Check that x-axis tick marks are at the appropriate locations
    and that the labels (one per module) are as expected, including
    their rotation (regression test for gh-881).
    """
    logpath = get_log_path(logname)
    with darshan.DarshanReport(logpath) as report:
        fig = plot_io_cost(report=report)
    for i, ax in enumerate(fig.axes):
        if i > 0 and version.parse(matplotlib.__version__) < version.parse("3.6.0"):
            # the second (invisible/twinned) axis is effectively
            # empty in older matplotlib versions
            continue
        # there are only 2 axes, the first being the "raw" data
        # and the second being the normalized data (percent)
        actual_xticks = ax.get_xticks()
        xticklabels = ax.get_xticklabels()
        actual_xticklabels = [tl.get_text() for tl in xticklabels]
        assert_allclose(actual_xticks, expected_xticks)
        assert_array_equal(actual_xticklabels, expected_xlabels)
        # regression test for gh-881
        expected_rotations = 90
        x_rotations = [tl.get_rotation() for tl in xticklabels]
        assert_allclose(x_rotations, expected_rotations)