# miscellaneous tests for the CFFI backend
# that are not specific to any particular
# module
import re
import pytest
import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import darshan
import darshan.backend.cffi_backend as backend
from darshan.backend.cffi_backend import ffi, libdutil, _structdefs
from darshan.log_utils import get_log_path
def test_get_lib_version():
    # check for a reasonable version string
    # returned by get_lib_version()
    actual_version = backend.get_lib_version()
    # must be a string
    assert isinstance(actual_version, str)
    # two periods in semantic version num
    assert actual_version.count('.') == 2
    # stricter regular expression match on
    # the semantic version number
    prog = re.compile(r"^\d+\.\d+\.\d+(-.+)?$")
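    # a couple of illustrative sanity checks on the pattern itself; the example
    # strings below are assumptions for demonstration, not version numbers tied
    # to any particular darshan-util release
    assert prog.fullmatch("3.4.4") is not None
    assert prog.fullmatch("not-a-version") is None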
    match = prog.fullmatch(actual_version)
    assert match is not None
    assert match.group(0) == actual_version
@pytest.mark.parametrize(
    "log_path",
    [
        # this log is the only one that returns a dataframe
        # with 'int' file hashes
        # NOTE: this case fails even before the fix
        # because the fix enforces the data type uint64
        "sample.darshan",
        # the following two logs return dataframes with
        # 'float' file hashes
        "sample-goodost.darshan",
        "sample-dxt-simple.darshan",
    ],
)
def test_file_hash_type(log_path):
    # regression test for issue #438
    # see: https://github.com/darshan-hpc/darshan/issues/438
    # check that a single record generated by `log_get_generic_record`
    # has the correct data type for the file hash/id
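    # background sketch (an assumption for context, not quoted from the issue):
    # Darshan record ids are 64-bit hashes, so coercing them to a signed int64
    # or to float64 can silently change large id values, e.g. np.uint64(2**63)
    # is not representable as np.int64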
    log_path = get_log_path(log_path)
    log = backend.log_open(log_path)
    rec = backend.log_get_generic_record(log=log, mod_name="POSIX", dtype="pandas")
    # verify the records returned have the correct
    # data type for the ids/hashes
    assert rec["counters"]["id"].dtype == np.uint64
    assert rec["fcounters"]["id"].dtype == np.uint64
    # additionally check that the dataframes
    # generated are of the correct types
    with darshan.DarshanReport(log_path, read_all=True) as report:
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_counters = report.records["POSIX"][0]["counters"]
        rec_fcounters = report.records["POSIX"][0]["fcounters"]
        # verify the records returned have the correct
        # data type for the ids/hashes
        assert rec_counters["id"].dtype == np.uint64
        assert rec_fcounters["id"].dtype == np.uint64
@pytest.mark.parametrize("dtype", ["numpy", "dict", "pandas"])
def test_log_get_generic_record(dtype):
    # regression test for issue #440
    # see: https://github.com/darshan-hpc/darshan/issues/440
    # collect the expected counter/fcounter column names
    expected_counter_names = backend.counter_names("POSIX")
    expected_fcounter_names = backend.fcounter_names("POSIX")
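    # note: counter_names()/fcounter_names() are assumed to return the names in
    # the same order as the underlying counter arrays, which is what makes the
    # name comparisons at the end of this test meaningful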
    # assign the expected counter/fcounter values
    expected_counter_vals = np.array(
        [
            2049, -1, -1, 0, 16402, 16404, 0, 0, 0, 0, -1, -1, 0, 0, 0,
            2199023259968, 0, 2199023261831, 0, 0, 0, 16384, 0, 0, 8,
            16401, 1048576, 0, 134217728, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 4, 14, 0, 0, 0, 0, 0, 0, 16384, 0, 274743689216,
            274743691264, 0, 0, 10240, 4096, 0, 0, 134217728, 272, 544,
            328, 16384, 8, 2, 2, 597, 1073741824, 1312, 1073741824,
        ]
    )
    expected_fcounter_vals = np.array(
        [
            3.9191410541534424, 0.0, 3.940063953399658, 3.927093982696533,
            3.936579942703247, 0.0, 115.0781660079956, 115.77035808563232,
            0.0, 100397.60042190552, 11.300841808319092, 0.0,
            17.940945863723755, 20.436099529266357, 85.47495031356812,
            0.0, 0.0,
        ]
    )
    # generate a record from sample log
    log = backend.log_open(get_log_path("sample.darshan"))
    rec = backend.log_get_generic_record(log=log, mod_name="POSIX", dtype=dtype)
    # each record should have the following keys
    record_keys = ["id", "rank", "counters", "fcounters"]
    assert list(rec.keys()) == record_keys
    # check the file hash/id
    assert rec["id"] == 6301063301082038805
    # check the rank
    assert rec["rank"] == -1
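    # for context: a rank of -1 is the Darshan convention for a shared record
    # aggregated across all ranks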
    if dtype == "numpy":
        # check that the returned arrays have the correct lengths
        assert rec["counters"].size == 69
        assert rec["fcounters"].size == 17
        # collect the actual counter/fcounter values
        actual_counter_vals = rec["counters"]
        actual_fcounter_vals = rec["fcounters"]
    elif dtype == "dict":
        # check that the returned dictionaries have the correct lengths
        assert len(rec["counters"]) == 69
        assert len(rec["fcounters"]) == 17
        # collect the actual counter/fcounter key names
        actual_counter_names = list(rec["counters"].keys())
        actual_fcounter_names = list(rec["fcounters"].keys())
        # collect the actual counter/fcounter values
        actual_counter_vals = np.array(list(rec["counters"].values()))
        actual_fcounter_vals = np.array(list(rec["fcounters"].values()))
    elif dtype == "pandas":
        # make sure the added column keys are in the dataframes
        for key in ["id", "rank"]:
            assert key in rec["counters"].columns
            assert key in rec["fcounters"].columns
        # double check the id/rank values
        assert rec["counters"]["id"].values == 6301063301082038805
        assert rec["counters"]["rank"].values == -1
        # make sure the dataframes are the expected shapes
        # the shapes are 2 larger than the arrays since the id/rank
        # columns are added to the dataframes
        assert rec["counters"].shape == (1, 71)
        assert rec["fcounters"].shape == (1, 19)
        # collect the actual counter/fcounter key names
        # don't include the id/rank columns
        actual_counter_names = list(rec["counters"].columns)[2:]
        actual_fcounter_names = list(rec["fcounters"].columns)[2:]
        # collect the actual counter/fcounter values
        # don't include the id/rank columns
        actual_counter_vals = rec["counters"].values[0][2:]
        actual_fcounter_vals = rec["fcounters"].values[0][2:]
    # check the actual counter/fcounter values agree
    # with the expected counter/fcounter values
    assert_array_equal(actual_counter_vals, expected_counter_vals)
    assert_allclose(actual_fcounter_vals, expected_fcounter_vals)
    if dtype != "numpy":
        # make sure the returned key/column names agree
        assert actual_counter_names == expected_counter_names
        assert actual_fcounter_names == expected_fcounter_names
@pytest.mark.parametrize("log_name", [
    "imbalanced-io.darshan",
    "e3sm_io_heatmap_only.darshan",
])
@pytest.mark.parametrize("module, index", [
    ("POSIX", 0),
    ("POSIX", 3),
    ("POSIX", 5),
    ("MPI-IO", 0),
    ("MPI-IO", 2),
    # fewer records are available for STDIO testing
    # with these logs
    ("STDIO", 0),
])
def test_df_to_rec(log_name, index, module):
    # test for packing a dataframe into a C-style record
    # this is perhaps nothing more than a "round-trip" test
    log_path = get_log_path(log_name)
    with darshan.DarshanReport(log_path, read_all=True) as report:
        report.mod_read_all_records(module, dtype="pandas")
        rec_dict = report.records[module][0]
        # id and rank are not formally included in the reconstituted
        # (f)counters "buffer" so truncate a bit on comparison
        expected_fcounters = rec_dict["fcounters"].iloc[index, 2:]
        expected_counters = rec_dict["counters"].iloc[index, 2:].astype(np.int64)
        expected_id = rec_dict["counters"].iloc[index, 0].astype(np.uint64)
        expected_rank = rec_dict["counters"].iloc[index, 1]
        # retrieve the "re-packed"/actual record data:
        rbuf = backend._df_to_rec(rec_dict, module, index)
        rec_buf = ffi.from_buffer(_structdefs[module].replace("**", "*"), rbuf)
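        # note: _structdefs[module] is assumed to hold a pointer-to-pointer type
        # string (e.g. "struct darshan_posix_file **" for POSIX), so dropping one
        # "*" in the cast above yields the single-record pointer type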
        actual_fcounters = np.frombuffer(ffi.buffer(rec_buf[0].fcounters))
        actual_counters = np.frombuffer(ffi.buffer(rec_buf[0].counters), dtype=np.int64)
        actual_id = rec_buf[0].base_rec.id
        actual_rank = rec_buf[0].base_rec.rank
        assert_allclose(actual_fcounters, expected_fcounters)
        assert_allclose(actual_counters, expected_counters)
        assert actual_id == expected_id
        assert actual_rank == expected_rank
@pytest.mark.parametrize("python_filter, expected_counts", [
    # whether to do an initial filtering
    # of the DataFrame in Python before
    # packing it back into C records
    (True, [18, 12, 2, 1]),
    (False, [1026, 12, 2, 1]),  # see gh-867
])
def test_reverse_record_array(python_filter, expected_counts):
    # pack pandas DataFrame objects back into
    # a contiguous buffer of several records
    # and then use the darshan-util C lib accumulator
    # on that record array, and compare the results
    # with those discussed in gh-867 from the Perl report
    log_path = get_log_path("imbalanced-io.darshan")
    with darshan.DarshanReport(log_path, read_all=True) as report:
        nprocs = report.metadata['job']['nprocs']
        modules = report.modules
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_dict = report.records["POSIX"][0]
        counters_df = rec_dict["counters"]
        fcounters_df = rec_dict["fcounters"]
        if python_filter:
            # gh-867 and the Perl report filtered out files that were
            # only stat'd rather than opened, so demo the same filtering
            # here at the Python layer, then feed the result back to the
            # C accumulator machinery
            fcounters_df = fcounters_df[counters_df["POSIX_OPENS"] > 0]
            counters_df = counters_df[counters_df["POSIX_OPENS"] > 0]
            rec_dict["counters"] = counters_df
            rec_dict["fcounters"] = fcounters_df
        num_recs = rec_dict["fcounters"].shape[0]
        record_array = backend._df_to_rec(rec_dict, "POSIX")
        # need to deal with the low-level C stuff to set up
        # accumulator infrastructure to receive the repacked
        # records
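        # the calls below follow the darshan-util accumulator lifecycle:
        # create -> inject records -> emit derived metrics -> destroy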
        darshan_accumulator = ffi.new("darshan_accumulator *")
        r = libdutil.darshan_accumulator_create(modules["POSIX"]['idx'],
                                                nprocs,
                                                darshan_accumulator)
        assert r == 0
        r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], record_array, num_recs)
        assert r_i == 0
        derived_metrics = ffi.new("struct darshan_derived_metrics *")
        summation_record = ffi.new(_structdefs["POSIX"].replace("**", "*"))
        r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
                                              derived_metrics,
                                              summation_record)
        assert r == 0
        r = libdutil.darshan_accumulator_destroy(darshan_accumulator[0])
        assert r == 0
        # the indices into category_counters are pretty opaque; we should
        # eventually move everything to Python (that would also avoid the
        # low-level setup above after filtering)
        # 0 = total
        # 1 = RO
        # 2 = WO
        # 3 = R/W
        actual_total_files = derived_metrics.category_counters[0].count
        actual_ro_files = derived_metrics.category_counters[1].count
        actual_wo_files = derived_metrics.category_counters[2].count
        actual_rw_files = derived_metrics.category_counters[3].count
        assert_array_equal([actual_total_files,
                            actual_ro_files,
                            actual_wo_files,
                            actual_rw_files],
                           expected_counts)