anajob
@kunathj, created February 7, 2022 19:57
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Anajob\n",
"\n",
"Data produced with:\n",
"\n",
"```sh\n",
"source /cvmfs/ilc.desy.de/sw/x86_64_gcc82_centos7/v02-02-03/init_ilcsoft.sh\n",
"anajob rv02-02.sv02-02.mILD_l5_o1_v02.E250-SetA.I500002.P2f_z_eehiq.eL.pR.n000.d_dstm_15783_0.slcio > anajob.txt\n",
"```\n",
"\n",
"With `anajob`, we can get a file-level, event-level and collection-level overview."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import matplotlib.colors as mcolors\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"with open(\"anajob.txt\") as f:\n",
" raw_txt = f.read()"
]
},
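{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick peek at the first lines of the dump shows the header structure that the parser below relies on. This is only a sanity check; it assumes `anajob.txt` sits next to this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at the start of the raw anajob dump (file header and first run block).\n",
"print(\"\\n\".join(raw_txt.split(\"\\n\")[:15]))"
]
},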
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def remove_prefix(full_str, prefix):\n",
" assert full_str.startswith(prefix)\n",
" return full_str[len(prefix):]\n",
"\n",
"\n",
"def remove_suffix(full_str, suffix):\n",
" assert full_str.endswith(suffix)\n",
" return full_str[:-len(suffix)]\n",
"\n",
"\n",
"\n",
"class Anajob:\n",
" _header_tag = \"anajob: will open and read from files: \\n\\n\"\n",
" _header_end_tag = \"\\n\\n will reopen and read from files: \\n\"\n",
" _run_tag = \"\\n Run : \"\n",
" _event_tag = \"///////////////////////////////////\\nEVENT: \"\n",
"\n",
" def __init__(self, raw_string, max_events=-1):\n",
" str_events = raw_string.split(self._event_tag)\n",
"\n",
" str_header = str_events.pop(0)\n",
" self.run_df = self._process_header(str_header)\n",
" self.n_events_used = self.n_events if max_events == -1 else int(max_events)\n",
" print(self.__repr__())\n",
"\n",
" self.event_header = {\"EVENT\": [], \"RUN\": [], \"DETECTOR\": []}\n",
" str_events[-1] = remove_suffix(str_events[-1],\n",
" f\"{self.n_events} events read from files: \\n {self.file_name}\\n\"\n",
" )\n",
" assert len(str_events) >= self.n_events_used\n",
" self.df = pd.concat(map(self._make_event, str_events[:self.n_events_used]))\n",
" self.event_header = pd.DataFrame(self.event_header).set_index(\"EVENT\")\n",
"\n",
" assert self.n_runs == len(self.event_header[\"RUN\"].unique())\n",
" assert self.n_events_used == len(self.event_header)\n",
" assert self.n_events_used == len(self.df[\"EVENT\"].unique())\n",
"\n",
"\n",
" def __repr__(self):\n",
" return \"\\n\".join([\n",
" f\"{__class__.__name__} with {self.n_events_used}/{self.n_events} \"\n",
" f\"events from file {self.file_name}\"\n",
" ])\n",
"\n",
"\n",
" def _make_event(self, str_event):\n",
" event_lines = str_event.split(\"\\n\")\n",
" i_event = int(event_lines.pop(0))\n",
" self.event_header[\"EVENT\"].append(i_event)\n",
" self.event_header[\"RUN\"].append(int(remove_prefix(event_lines.pop(0), \"RUN: \")))\n",
" self.event_header[\"DETECTOR\"].append(remove_prefix(event_lines.pop(0),\"DETECTOR: \"))\n",
" event_lines.pop(0) # COLLECTIONS: (see below)\n",
" event_lines.pop(0) # ///////////////////////////////////\n",
" event_lines.pop(0) #\n",
" event_lines.pop(0) # -----------------------------------\n",
" event_lines.pop(1) # ===================================\n",
" while event_lines[-1].strip(\" \") == \"\":\n",
" event_lines.pop()\n",
" event_lines.pop() # -----------------------------------\n",
" table_str = \"\\n\".join(event_lines).split(\" \")\n",
" table_str = filter(None, table_str)\n",
" table_str = map(lambda x: x.strip(\" \"), table_str)\n",
" table_str = \"\\t\".join(table_str)\n",
" table_str = table_str.replace(\"\\t\\n\", \"\\n\")\n",
" col_df = pd.read_csv(io.StringIO(table_str), sep=\"\\t\")\n",
" col_df[\"EVENT\"] = i_event\n",
" if not i_event % 1000: print(f\"{i_event:>6}/{self.n_events_used} events\\r\")\n",
" return col_df\n",
"\n",
"\n",
" def _process_header(self, str_header):\n",
" run_infos = str_header.split(self._run_tag)\n",
" header = run_infos.pop(0)\n",
" assert header.startswith(self._header_tag)\n",
" files = header[len(self._header_tag):].split(\"\\n\")\n",
" assert files.pop() == \"\"\n",
" assert len(files) == 1\n",
" for str_file in files:\n",
" self.file_name, str_file = str_file.strip().split(maxsplit=1)\n",
" str_file = str_file.strip()\n",
" assert str_file.startswith(\"[ \") and str_file.endswith(\" ]\")\n",
" self.n_runs, self.n_events = str_file[2:-2].split(\", \")\n",
" self.n_runs = int(remove_prefix(self.n_runs, \"number of runs: \"))\n",
" self.n_events = int(remove_prefix(self.n_events, \"number of events: \"))\n",
" run_infos[-1] = remove_suffix(run_infos[-1],\n",
" self._header_end_tag + \" \" + self.file_name + \"\\n\"\n",
" )\n",
" run_series = {}\n",
" for i, run_info in enumerate(run_infos):\n",
" run_series[i] = self._make_run(run_info)\n",
" return pd.concat(run_series, axis=1).transpose()\n",
"\n",
" def _make_run(self, str_run):\n",
" run_dict = {}\n",
" lines = str_run.split(\"\\n\")\n",
" assert lines.pop().strip().strip(\"-\") == \"\"\n",
" i_run, detector_tag = lines.pop(0).split(\" - \")\n",
" run_dict[\"RUN\"] = int(i_run)\n",
" run_dict[\"DETECTOR\"] = detector_tag.rstrip(\": \")\n",
" for line in lines:\n",
" key, val = line.split(\": \", maxsplit=1)\n",
" key = remove_prefix(key, \" parameter \")\n",
" key = remove_suffix(key, \" [string]\")\n",
" run_dict[key] = val.rstrip(\", \")\n",
" return pd.Series(run_dict)\n",
"\n",
"\n",
"aj = Anajob(raw_txt, max_events=-1)"
]
},
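{
"cell_type": "markdown",
"metadata": {},
"source": [
"Parsing every event can take a while for large files. The `max_events` argument caps how many event blocks are parsed; a minimal sketch for a quick first look (the name `aj_small` is just illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse only the first 5 event blocks for a fast first inspection.\n",
"aj_small = Anajob(raw_txt, max_events=5)\n",
"aj_small.df.head()"
]
},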
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_df = aj.run_df\n",
"run_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for k, v in run_df.transpose()[run_df.nunique().values == 1][0].items():\n",
" print(k, v)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_df.transpose()[run_df.nunique().values != 1].transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for v in run_df.transpose()[run_df.nunique().values != 1].transpose().outputFile.values:\n",
" print(v)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Event information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aj.df.groupby(\"COLLECTION NAME\")[\"NUMBER OF ELEMENTS\"].mean()"
]
},
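{
"cell_type": "markdown",
"metadata": {},
"source": [
"The per-collection means above can be complemented by a per-event view: summing over all collections gives the total number of stored elements in each event. A small sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Total number of stored elements per event, summed over all collections.\n",
"aj.df.groupby(\"EVENT\")[\"NUMBER OF ELEMENTS\"].sum().describe()"
]
},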
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aj.df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(aj.event_header.nunique())\n",
"aj.event_header"
]
},
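{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a cross-check of the header parsing, the events can be counted per run; the counts should add up to `aj.n_events_used`. A quick sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Events per run; the total should equal aj.n_events_used.\n",
"aj.event_header[\"RUN\"].value_counts().sort_index()"
]
},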
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_entries_per_collection(df):\n",
" per_collection = df.groupby(\"COLLECTION NAME\")[\"NUMBER OF ELEMENTS\"]\n",
" bins1 = np.arange(-0.5, 51, 1)\n",
" bins2 = np.arange(-0.5, 16)\n",
" bins3 = np.arange(-0.5, per_collection.max().max() + 0.5, 20)\n",
" fig, ax = plt.subplots(figsize=(12, 9))\n",
" axins1 = ax.inset_axes([0.30, 0.60, 0.69, 0.39], transform=ax.transAxes)\n",
" axins2 = ax.inset_axes([0.30, 0.17, 0.69, 0.39], transform=ax.transAxes)\n",
" kw = dict(histtype=\"step\", linewidth=2, density=True)\n",
" for i, collection in enumerate(per_collection.sum().sort_values(ascending=False).index):\n",
" x = df[df[\"COLLECTION NAME\"] == collection][\"NUMBER OF ELEMENTS\"]\n",
" kw[\"color\"] = list(mcolors.TABLEAU_COLORS)[i % 10]\n",
" kw[\"linestyle\"] = [\"-\", \":\", \"--\", \"-.\"][i // 10]\n",
" x.hist(bins=bins1, ax=ax, label=f\"{collection} ({x.mean():.1f})\", **kw)\n",
" x.hist(bins=bins2, ax=axins1, cumulative=-1, **kw)\n",
" x.hist(bins=bins3, ax=axins2, **kw)\n",
" axins1.patch.set_alpha(0.8)\n",
" axins2.patch.set_alpha(0.8)\n",
" ax.set_ylabel(\"pdf\")\n",
" axins1.set_ylabel(\"cdf\")\n",
" axins2.set_ylabel(\"pdf\")\n",
" axins2.set_yscale(\"log\")\n",
" ax.legend(title=\"COLLECTION NAME\", bbox_to_anchor=(1.0, 1.0))\n",
" fig.tight_layout()\n",
" return fig\n",
"\n",
"fig = plot_entries_per_collection(aj.df)\n",
"fig.savefig(\"collection_entries.png\", dpi=300, facefolor=\"white\")"
]
}
],
"metadata": {
"interpreter": {
"hash": "da01a39d7be8ddd16a36886b47c621589cc5adc6987b4295cc334135f1184bcd"
},
"kernelspec": {
"display_name": "Python 3.8.3 ('bdt')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@kunathj commented on Feb 7, 2022, attaching the rendered collection_entries plot.