# miscellaneous tests for the CFFI backend
# that are not specific to any particular
# module
import re
import pytest
import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import darshan
import darshan.backend.cffi_backend as backend
from darshan.backend.cffi_backend import ffi, libdutil, _structdefs
from darshan.log_utils import get_log_path
def test_get_lib_version():
    # check for a reasonable version string
    # returned by get_lib_version()
    actual_version = backend.get_lib_version()
    # must be a string
    assert isinstance(actual_version, str)
    # two periods in semantic version num
    assert actual_version.count('.') == 2
    # stricter regular expression match on
    # the semantic version number
    prog = re.compile(r"^\d+\.\d+\.\d+(-.+)?$")
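    # a couple of illustrative sanity checks on the pattern itself; the example
    # strings below are assumptions for demonstration, not version numbers tied
    # to any particular darshan-util release
    assert prog.fullmatch("3.4.4") is not None
    assert prog.fullmatch("not-a-version") is None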
    match = prog.fullmatch(actual_version)
    assert match is not None
    assert match.group(0) == actual_version
@pytest.mark.parametrize(
    "log_path",
    [
        # this log is the only one that returns a dataframe
        # with 'int' file hashes
        # NOTE: this case fails even before the fix
        # because the fix enforces the data type uint64
        "sample.darshan",
        # the following two logs return dataframes with
        # 'float' file hashes
        "sample-goodost.darshan",
        "sample-dxt-simple.darshan",
    ],
)
def test_file_hash_type(log_path):
    # regression test for issue #438
    # see: https://github.com/darshan-hpc/darshan/issues/438
    # check that a single record generated by `log_get_generic_record`
    # has the correct data type for the file hash/id
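    # background sketch (an assumption for context, not quoted from the issue):
    # Darshan record ids are 64-bit hashes, so coercing them to a signed int64
    # or to float64 can silently change large id values, e.g. np.uint64(2**63)
    # is not representable as np.int64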
    log_path = get_log_path(log_path)
    log = backend.log_open(log_path)
    rec = backend.log_get_generic_record(log=log, mod_name="POSIX", dtype="pandas")
    # verify the records returned have the correct
    # data type for the ids/hashes
    assert rec["counters"]["id"].dtype == np.uint64
    assert rec["fcounters"]["id"].dtype == np.uint64
    # additionally check that the dataframes
    # generated are of the correct types
    with darshan.DarshanReport(log_path, read_all=True) as report:
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_counters = report.records["POSIX"][0]["counters"]
        rec_fcounters = report.records["POSIX"][0]["fcounters"]
        # verify the records returned have the correct
        # data type for the ids/hashes
        assert rec_counters["id"].dtype == np.uint64
        assert rec_fcounters["id"].dtype == np.uint64
@pytest.mark.parametrize("dtype", ["numpy", "dict", "pandas"])
def test_log_get_generic_record(dtype):
    # regression test for issue #440
    # see: https://github.com/darshan-hpc/darshan/issues/440
    # collect the expected counter/fcounter column names
    expected_counter_names = backend.counter_names("POSIX")
    expected_fcounter_names = backend.fcounter_names("POSIX")
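    # note: counter_names()/fcounter_names() are assumed to return the names in
    # the same order as the underlying counter arrays, which is what makes the
    # name comparisons at the end of this test meaningful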
    # assign the expected counter/fcounter values
    expected_counter_vals = np.array(
        [
            2049, -1, -1, 0, 16402, 16404, 0, 0, 0, 0, -1, -1, 0, 0, 0,
            2199023259968, 0, 2199023261831, 0, 0, 0, 16384, 0, 0, 8,
            16401, 1048576, 0, 134217728, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 4, 14, 0, 0, 0, 0, 0, 0, 16384, 0, 274743689216,
            274743691264, 0, 0, 10240, 4096, 0, 0, 134217728, 272, 544,
            328, 16384, 8, 2, 2, 597, 1073741824, 1312, 1073741824,
        ]
    )
    expected_fcounter_vals = np.array(
        [
            3.9191410541534424, 0.0, 3.940063953399658, 3.927093982696533,
            3.936579942703247, 0.0, 115.0781660079956, 115.77035808563232,
            0.0, 100397.60042190552, 11.300841808319092, 0.0,
            17.940945863723755, 20.436099529266357, 85.47495031356812,
            0.0, 0.0,
        ]
    )
    # generate a record from sample log
    log = backend.log_open(get_log_path("sample.darshan"))
    rec = backend.log_get_generic_record(log=log, mod_name="POSIX", dtype=dtype)
    # each record should have the following keys
    record_keys = ["id", "rank", "counters", "fcounters"]
    assert list(rec.keys()) == record_keys
    # check the file hash/id
    assert rec["id"] == 6301063301082038805
    # check the rank
    assert rec["rank"] == -1
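    # for context: a rank of -1 is the Darshan convention for a shared record
    # aggregated across all ranks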
    if dtype == "numpy":
        # check that the returned arrays have the correct lengths
        assert rec["counters"].size == 69
        assert rec["fcounters"].size == 17
        # collect the actual counter/fcounter values
        actual_counter_vals = rec["counters"]
        actual_fcounter_vals = rec["fcounters"]
    elif dtype == "dict":
        # check that the returned dictionaries have the correct lengths
        assert len(rec["counters"]) == 69
        assert len(rec["fcounters"]) == 17
        # collect the actual counter/fcounter key names
        actual_counter_names = list(rec["counters"].keys())
        actual_fcounter_names = list(rec["fcounters"].keys())
        # collect the actual counter/fcounter values
        actual_counter_vals = np.array(list(rec["counters"].values()))
        actual_fcounter_vals = np.array(list(rec["fcounters"].values()))
    elif dtype == "pandas":
        # make sure the added column keys are in the dataframes
        for key in ["id", "rank"]:
            assert key in rec["counters"].columns
            assert key in rec["fcounters"].columns
        # double check the id/rank values
        assert rec["counters"]["id"].values == 6301063301082038805
        assert rec["counters"]["rank"].values == -1
        # make sure the dataframes are the expected shapes
        # the shapes are 2 larger than the arrays since the id/rank
        # columns are added to the dataframes
        assert rec["counters"].shape == (1, 71)
        assert rec["fcounters"].shape == (1, 19)
        # collect the actual counter/fcounter key names
        # don't include the id/rank columns
        actual_counter_names = list(rec["counters"].columns)[2:]
        actual_fcounter_names = list(rec["fcounters"].columns)[2:]
        # collect the actual counter/fcounter values
        # don't include the id/rank columns
        actual_counter_vals = rec["counters"].values[0][2:]
        actual_fcounter_vals = rec["fcounters"].values[0][2:]
    # check the actual counter/fcounter values agree
    # with the expected counter/fcounter values
    assert_array_equal(actual_counter_vals, expected_counter_vals)
    assert_allclose(actual_fcounter_vals, expected_fcounter_vals)
    if dtype != "numpy":
        # make sure the returned key/column names agree
        assert actual_counter_names == expected_counter_names
        assert actual_fcounter_names == expected_fcounter_names
@pytest.mark.parametrize("log_name", [
    "imbalanced-io.darshan",
    "e3sm_io_heatmap_only.darshan",
])
@pytest.mark.parametrize("module, index", [
    ("POSIX", 0),
    ("POSIX", 3),
    ("POSIX", 5),
    ("MPI-IO", 0),
    ("MPI-IO", 2),
    # fewer records are available for STDIO testing
    # with these logs
    ("STDIO", 0),
])
def test_df_to_rec(log_name, index, module):
    # test for packing a dataframe into a C-style record
    # this is perhaps nothing more than a "round-trip" test
    log_path = get_log_path(log_name)
    with darshan.DarshanReport(log_path, read_all=True) as report:
        report.mod_read_all_records(module, dtype="pandas")
        rec_dict = report.records[module][0]
        # id and rank are not formally included in the reconstituted
        # (f)counters "buffer" so truncate a bit on comparison
        expected_fcounters = rec_dict["fcounters"].iloc[index, 2:]
        expected_counters = rec_dict["counters"].iloc[index, 2:].astype(np.int64)
        expected_id = rec_dict["counters"].iloc[index, 0].astype(np.uint64)
        expected_rank = rec_dict["counters"].iloc[index, 1]
        # retrieve the "re-packed"/actual record data:
        rbuf = backend._df_to_rec(rec_dict, module, index)
        rec_buf = ffi.from_buffer(_structdefs[module].replace("**", "*"), rbuf)
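        # note: _structdefs[module] is assumed to hold a pointer-to-pointer type
        # string (e.g. "struct darshan_posix_file **" for POSIX), so dropping one
        # "*" in the cast above yields the single-record pointer type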
        actual_fcounters = np.frombuffer(ffi.buffer(rec_buf[0].fcounters))
        actual_counters = np.frombuffer(ffi.buffer(rec_buf[0].counters), dtype=np.int64)
        actual_id = rec_buf[0].base_rec.id
        actual_rank = rec_buf[0].base_rec.rank
        assert_allclose(actual_fcounters, expected_fcounters)
        assert_allclose(actual_counters, expected_counters)
        assert actual_id == expected_id
        assert actual_rank == expected_rank
@pytest.mark.parametrize("python_filter, expected_counts", [
    # whether to do an initial filtering
    # of the DataFrame in Python before
    # packing it back into C records
    (True, [18, 12, 2, 1]),
    (False, [1026, 12, 2, 1]),  # see gh-867
])
def test_reverse_record_array(python_filter, expected_counts):
    # pack pandas DataFrame objects back into
    # a contiguous buffer of several records
    # and then use the darshan-util C lib accumulator
    # on that record array, and compare the results
    # with those discussed in gh-867 from the Perl report
    log_path = get_log_path("imbalanced-io.darshan")
    with darshan.DarshanReport(log_path, read_all=True) as report:
        nprocs = report.metadata['job']['nprocs']
        modules = report.modules
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_dict = report.records["POSIX"][0]
        counters_df = rec_dict["counters"]
        fcounters_df = rec_dict["fcounters"]
        if python_filter:
            # gh-867 and the Perl report filtered out files that were
            # only stat'd rather than opened, so demo the same filtering
            # here at the Python layer, then feed the result back to the
            # C accumulator machinery
            fcounters_df = fcounters_df[counters_df["POSIX_OPENS"] > 0]
            counters_df = counters_df[counters_df["POSIX_OPENS"] > 0]
            rec_dict["counters"] = counters_df
            rec_dict["fcounters"] = fcounters_df
        num_recs = rec_dict["fcounters"].shape[0]
        record_array = backend._df_to_rec(rec_dict, "POSIX")
        # need to deal with the low-level C stuff to set up
        # accumulator infrastructure to receive the repacked
        # records
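        # the calls below follow the darshan-util accumulator lifecycle:
        # create -> inject records -> emit derived metrics -> destroy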
        darshan_accumulator = ffi.new("darshan_accumulator *")
        r = libdutil.darshan_accumulator_create(modules["POSIX"]['idx'],
                                                nprocs,
                                                darshan_accumulator)
        assert r == 0
        r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], record_array, num_recs)
        assert r_i == 0
        derived_metrics = ffi.new("struct darshan_derived_metrics *")
        summation_record = ffi.new(_structdefs["POSIX"].replace("**", "*"))
        r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
                                              derived_metrics,
                                              summation_record)
        assert r == 0
        r = libdutil.darshan_accumulator_destroy(darshan_accumulator[0])
        assert r == 0
        # the indices into category_counters are pretty opaque; we should
        # eventually move everything to Python (that would also avoid the
        # low-level setup above after filtering)
        # 0 = total
        # 1 = RO
        # 2 = WO
        # 3 = R/W
        actual_total_files = derived_metrics.category_counters[0].count
        actual_ro_files = derived_metrics.category_counters[1].count
        actual_wo_files = derived_metrics.category_counters[2].count
        actual_rw_files = derived_metrics.category_counters[3].count
        assert_array_equal([actual_total_files,
                            actual_ro_files,
                            actual_wo_files,
                            actual_rw_files],
                           expected_counts)