"""
Class for Logfiles
"""
import datetime
import pandas as pd
class Logfile(object):
    """Parse a simulation logfile into a pandas DataFrame and derive run statistics.

    After construction the parsed log is available as ``self.log_df``: a
    DataFrame indexed by the timestamp of each log entry, with the string
    columns ``"Run Number"``, ``"Exp Date"``, ``"Job ID"`` and ``"State"``.

    Parameters
    ----------
    log
        For ``esm_style=True``: an iterable of log lines (strings), each of
        the form ``"<date> : <message>"``.
        For ``esm_style=False``: anything ``pandas.read_table`` accepts
        (a path or a file-like object).
    esm_style : bool, optional
        Selects which of the two parsers is used. Defaults to ``True``.
    """

    def __init__(self, log, esm_style=True):
        self.log = log
        if esm_style:
            self.log_df = self._generate_dataframe_from_esm_logfile()
        else:
            self.log_df = self._generate_dataframe_from_mpimet_logfile()
        # The raw input is only needed while parsing; drop the reference.
        del self.log

    def _generate_dataframe_from_esm_logfile(self):
        """Build the log DataFrame from an iterable of ``"<date> : <message>"`` lines.

        The first entry is always discarded (it is expected to be the
        "Start of Experiment" line), and a trailing "Experiment over" entry
        is discarded if present. Every remaining message must split on
        whitespace into exactly five tokens:
        run number, experiment date, job id, a separator and the state.

        Returns
        -------
        pandas.DataFrame
            Indexed by the parsed timestamps, with columns
            ``"Run Number"``, ``"Exp Date"``, ``"Job ID"``, ``"State"``.
        """
        # Split on the first " : " only, so a message that itself contains
        # " : " does not produce extra fields. Blank lines carry no entry.
        rows = [line.split(" : ", 1) for line in self.log if line.strip()]
        df = pd.DataFrame(rows, columns=["Date", "Message"])
        tokens = df["Message"].str.split(expand=True)

        # We drop the first row since it says "Start of Experiment"
        log_df = pd.concat([df["Date"].iloc[1:], tokens.iloc[1:]], axis=1)

        # Check whether the last entry says "Experiment over" by looking at
        # its tokens directly (independent of pandas display settings).
        last_tokens = tokens.iloc[-1].dropna()
        if "Experimentover" in "".join(str(token) for token in last_tokens):
            log_df = log_df.iloc[:-1]

        log_df.columns = [
            "Date",
            "Run Number",
            "Exp Date",
            "Job ID",
            "Separator",
            "State",
        ]
        # The separator token carries no information.
        log_df = log_df.drop(columns="Separator").set_index("Date")
        log_df.index = pd.to_datetime(log_df.index)
        return log_df

    def _generate_dataframe_from_mpimet_logfile(self):
        """Build the log DataFrame by reading ``self.log`` with ``pandas.read_table``.

        Lines are split into date, message and state on ``" : "`` or ``" -"``;
        the first line of the input is skipped. Every message must split on
        whitespace into exactly three tokens: run number, experiment date
        and job id.

        Returns
        -------
        pandas.DataFrame
            Indexed by the parsed timestamps, with columns
            ``"State"``, ``"Run Number"``, ``"Exp Date"``, ``"Job ID"``.
        """
        # ``infer_datetime_format`` was dropped: it has no effect without
        # ``parse_dates`` and is deprecated in current pandas releases.
        log_df = pd.read_table(
            self.log,
            sep=r" : | -",
            skiprows=1,
            names=["Date", "Message", "State"],
            engine="python",
            index_col=0,
        )
        middle_column = log_df["Message"].astype(str).str.split(expand=True)
        middle_column.columns = ["Run Number", "Exp Date", "Job ID"]
        log_df = pd.concat([log_df.drop(columns="Message"), middle_column], axis=1)
        # FIXME: This needs a context manager to try different locales
        log_df.index = pd.to_datetime(log_df.index)
        return log_df

    @classmethod
    def from_file(cls, fin, esm_style=True):
        """Create a ``Logfile`` from the file at path ``fin``.

        Parameters
        ----------
        fin
            Path of the logfile.
        esm_style : bool, optional
            If ``True`` (default) the file is read into a list of lines and
            parsed with the ESM-style parser. If ``False`` the path itself is
            handed to the constructor, because that parser reads its input
            through ``pandas.read_table``.
        """
        if not esm_style:
            return cls(fin, esm_style=False)
        with open(fin) as f:
            log = f.readlines()
        return cls(log)

    def compute_throughput(self):
        """Compute wall-clock and queueing times per run.

        For every run number, the wall time is the span between the last
        "start" entry and the last "done" entry of that run. The queue time
        is the span between the end of the previous run (run number minus
        one) and the start of this run; it is zero for run 1 and for any run
        whose predecessor is not present in the log.

        Returns
        -------
        tuple
            ``(averages, throughput, diffs)`` where ``averages`` is a
            one-column DataFrame (``"Simulation Average"``) with the mean
            wall and queue time, ``throughput`` is the number of runs per
            day implied by the mean wall time, and ``diffs`` is a DataFrame
            indexed by integer ``"Run Number"`` with the columns
            ``"Wall Time"`` and ``"Queue Time"``.
        """
        state = self.log_df.State
        # na=False: entries without a state are neither a start nor an end.
        starts = self.log_df[state.str.contains("start", na=False)]
        ends = self.log_df[state.str.contains("done", na=False)]
        # Drop the duplicated starts (keep the last attempt of each run).
        # Reassign rather than mutate the filtered frame in place.
        starts = starts.drop_duplicates(subset="Run Number", keep="last")
        merged = pd.concat([starts, ends])

        # Key the groups by integer run number so the predecessor lookup does
        # not depend on how the number is written in the log.
        groups = {int(name): group for name, group in merged.groupby("Run Number")}

        run_diffs = {"Run Number": [], "Wall Time": [], "Queue Time": []}
        for run in sorted(groups):
            group = groups[run]
            previous_group = groups.get(run - 1) if run > 1 else None
            if previous_group is None:
                queue_time = pd.Timedelta(0)
            else:
                queue_time = group.index[0] - previous_group.index[-1]
            run_diffs["Run Number"].append(run)
            run_diffs["Wall Time"].append(group.index[-1] - group.index[0])
            run_diffs["Queue Time"].append(queue_time)

        diffs = pd.DataFrame(run_diffs).set_index("Run Number")
        averages = diffs.mean()
        throughput = pd.Timedelta(days=1) / averages["Wall Time"]
        return pd.DataFrame({"Simulation Average": averages}), throughput, diffs

    def run_stats(self):
        """Summarise the last ten runs.

        Returns
        -------
        pandas.DataFrame
            One unnamed column, indexed by the statistic name: mean wall
            time, mean queueing time, optimal throughput (runs per day from
            wall time alone), actual throughput (optimal throughput scaled by
            the efficiency) and run efficiency in percent
            (wall time / (wall time + queue time)).
        """
        _, _, diffs = self.compute_throughput()
        last_ten_diffs = diffs.tail(10)
        mean_walltime = last_ten_diffs["Wall Time"].mean()
        mean_queue_time = last_ten_diffs["Queue Time"].mean()
        throughput = pd.Timedelta(days=1) / mean_walltime
        efficiency = mean_walltime / (mean_queue_time + mean_walltime)
        return pd.DataFrame.from_dict(
            {
                "Mean Walltime": mean_walltime,
                "Mean Queuing Time": mean_queue_time,
                "Optimal Throughput": throughput,
                "Actual Throughput (Last 10 Runs)": throughput * efficiency,
                "Run Efficiency (Last 10 Runs)": efficiency * 100,
            },
            orient="index",
        )