Source code for darshan.experimental.plots.plot_common_access_table

from typing import Any, List

import pandas as pd

import darshan


def remove_nonzero_rows(df: Any) -> Any:
    """
    Drops every row whose values are all zero.

    Despite what the name suggests, the rows that are *kept* are the
    ones holding at least one non-zero value.

    Parameters
    ----------
    df: a ``pd.DataFrame``.

    Returns
    -------
    A ``pd.DataFrame`` holding the subset of input rows that contain
    at least 1 non-zero value. The original index labels of the
    surviving rows are retained.

    """
    # element-wise "is not zero" flags, reduced across the columns of each row
    is_nonzero = df != 0
    rows_to_keep = is_nonzero.any(axis=1)
    return df.loc[rows_to_keep]
def combine_access_sizes(df: Any) -> Any:
    """
    Merges rows that share the same "Access Size" value, summing the
    remaining columns within each group.

    Parameters
    ----------
    df: a ``pd.DataFrame`` with a column named "Access Size".

    Returns
    -------
    A ``pd.DataFrame`` with one row per distinct "Access Size" value.
    "Access Size" is returned as a regular column (the group index is
    reset), and the other columns hold the per-group sums.

    """
    per_size_totals = df.groupby("Access Size").sum()
    # move the group keys from the index back into an ordinary column
    return per_size_totals.reset_index()
def get_most_common_access_sizes(df: Any, n_rows: int = 4) -> Any:
    """
    Selects the `n_rows` rows with the largest "Count" values.

    Parameters
    ----------
    df: a ``pd.DataFrame`` with a column named "Count".

    n_rows: number of rows to keep.

    Returns
    -------
    A ``pd.DataFrame`` sorted by "Count" in descending order,
    re-indexed from zero and truncated to the first `n_rows` rows.

    """
    ranked = df.sort_values(by="Count", ascending=False, ignore_index=True)
    return ranked.head(n=n_rows)
def collapse_access_cols(df: Any, col_name: str) -> Any:
    """
    Collapses all columns into a single column named `col_name`.

    The columns are stacked one after another, in column order, so the
    result has ``len(df) * len(df.columns)`` rows and a fresh zero-based
    index.

    Parameters
    ----------
    df: a ``pd.DataFrame``.

    col_name: name of new column to store collapsed data.

    Returns
    -------
    A ``pd.DataFrame`` containing all data collapsed into
    column `col_name`.

    """
    # ``melt`` labels its "which column did this come from" column after
    # ``df.columns.name`` when that is set, and only falls back to
    # "variable" otherwise. Pin the label so the drop below cannot raise
    # ``KeyError`` on frames whose column index is named.
    label_col = "variable"
    melted = df.melt(var_name=label_col, value_name=col_name)
    return melted.drop(label_col, axis=1)
def get_access_count_df(mod_df: Any, mod: str) -> Any:
    """
    Builds a two-column dataframe of access sizes and their counts.

    For each of the two counter families, the columns named
    ``{mod}_ACCESS1_<suffix>`` through ``{mod}_ACCESS4_<suffix>`` are
    selected from `mod_df` and stacked into a single column; the
    resulting "Access Size" and "Count" columns are then placed
    side by side.

    Parameters
    ----------
    mod_df: "counters" dataframe for the input
    module `mod` from a ``darshan.DarshanReport``.

    mod: the module to obtain the common accesses
    table for (i.e "POSIX", "MPI-IO", "H5D"). The value is used
    verbatim as the counter-name prefix; note that
    ``plot_common_access_table`` passes "MPIIO" rather than "MPI-IO".

    Returns
    -------
    A ``pd.DataFrame`` containing all access size data and
    their respective counts.

    """
    # counter-name suffix -> column label in the output table
    suffix_to_label = {"ACCESS": "Access Size", "COUNT": "Count"}
    collapsed_frames = [
        collapse_access_cols(
            # keep only the columns related to the access sizes
            df=mod_df.filter([f"{mod}_ACCESS{i}_{suffix}" for i in range(1, 5)]),
            col_name=label,
        )
        for suffix, label in suffix_to_label.items()
    ]
    return pd.concat(collapsed_frames, axis=1)
class DarshanReportTable:
    """
    Holds a table figure both as a dataframe and as rendered html.

    Parameters
    ----------
    df: a ``pd.DataFrame``.

    kwargs: keyword arguments passed to ``pd.DataFrame.to_html()``.

    """

    def __init__(self, df: Any, **kwargs: Any) -> None:
        # the dataframe is stored as given (no copy is made)
        self.df = df
        # the html is rendered once, at construction time
        self.html = df.to_html(**kwargs)
def plot_common_access_table(report: darshan.DarshanReport, mod: str, n_rows: int = 4) -> DarshanReportTable:
    """
    Builds a table of the most common access sizes and their counts.

    Parameters
    ----------
    report: a ``darshan.DarshanReport``.

    mod: the module to obtain the common access size
    table for (i.e "POSIX", "MPI-IO", "H5D").

    n_rows: number of rows to keep.

    Returns
    -------
    common_access_table: a ``DarshanReportTable`` containing the `n_rows`
    most common access sizes and their counts for the specified module.
    The table is sorted in descending order based on the access size count
    and can be retrieved as either a ``pd.DataFrame`` or html table via
    the `df` or `html` attributes, respectively.

    """
    # records are looked up under the module name exactly as given
    counters = report.records[mod].to_df(attach=None)["counters"]

    # "MPI-IO" is passed on as the prefix "MPIIO" when building counter names
    counter_prefix = "MPIIO" if mod == "MPI-IO" else mod

    access_counts = get_access_count_df(mod_df=counters, mod=counter_prefix)
    nonzero_counts = remove_nonzero_rows(df=access_counts)
    merged_counts = combine_access_sizes(df=nonzero_counts)
    most_common = get_most_common_access_sizes(df=merged_counts, n_rows=n_rows)

    # remove index labels and remove border
    return DarshanReportTable(df=most_common, index=False, border=0)