Created
February 7, 2022 19:57
-
-
Save kunathj/a414bf9c82b99edcdc961b46737c77e7 to your computer and use it in GitHub Desktop.
anajob
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Anajob\n", | |
"\n", | |
"Data produced with:\n", | |
"\n", | |
"```sh\n", | |
"source /cvmfs/ilc.desy.de/sw/x86_64_gcc82_centos7/v02-02-03/init_ilcsoft.sh\n", | |
"anajob rv02-02.sv02-02.mILD_l5_o1_v02.E250-SetA.I500002.P2f_z_eehiq.eL.pR.n000.d_dstm_15783_0.slcio > anajob.txt\n", | |
"```\n", | |
"\n", | |
"With `anajob`, we can get a file-level, event-level and collection-level overview." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import io\n", | |
"import matplotlib.colors as mcolors\n", | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
# Slurp the captured anajob output (produced by the shell command shown
# in the markdown cell above) into one string for parsing.
with open("anajob.txt") as f:
    raw_txt = f.read()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def remove_prefix(full_str, prefix):
    """Strictly chop `prefix` off the front of `full_str`.

    Unlike str.removeprefix (3.9+), the prefix MUST be present;
    otherwise an AssertionError is raised.
    """
    n = len(prefix)
    assert full_str[:n] == prefix
    return full_str[n:]
"\n", | |
"\n", | |
def remove_suffix(full_str, suffix):
    """Strictly chop `suffix` off the end of `full_str`.

    Unlike str.removesuffix (3.9+), the suffix MUST be present;
    otherwise an AssertionError is raised.
    """
    assert full_str.endswith(suffix)
    # Bug fix: for an empty suffix, full_str[:-0] evaluates to "" —
    # return the string unchanged in that case.
    return full_str[:-len(suffix)] if suffix else full_str
"\n", | |
"\n", | |
"\n", | |
class Anajob:
    """Parse the text output of the iLCSoft `anajob` tool into DataFrames.

    After construction:
      - run_df:       one row per run, built from the header's run sections.
      - event_header: per-event EVENT/RUN/DETECTOR info, indexed by EVENT.
      - df:           one row per (event, collection) from the event tables.
    """

    # Literal text markers that anajob prints between the sections of its
    # output; the raw string is split on these.
    _header_tag = "anajob: will open and read from files: \n\n"
    _header_end_tag = "\n\n will reopen and read from files: \n"
    _run_tag = "\n Run : "
    _event_tag = "///////////////////////////////////\nEVENT: "

    def __init__(self, raw_string, max_events=-1):
        # Everything before the first event banner is the file/run header.
        str_events = raw_string.split(self._event_tag)

        str_header = str_events.pop(0)
        self.run_df = self._process_header(str_header)
        # max_events == -1 means "use all events announced in the header".
        self.n_events_used = self.n_events if max_events == -1 else int(max_events)
        print(self.__repr__())

        self.event_header = {"EVENT": [], "RUN": [], "DETECTOR": []}
        # The last event chunk also carries anajob's closing summary; strip it.
        str_events[-1] = remove_suffix(str_events[-1],
            f"{self.n_events} events read from files: \n {self.file_name}\n"
        )
        assert len(str_events) >= self.n_events_used
        # _make_event also fills self.event_header as a side effect.
        self.df = pd.concat(map(self._make_event, str_events[:self.n_events_used]))
        self.event_header = pd.DataFrame(self.event_header).set_index("EVENT")

        # Sanity checks on the parsed tables.
        # NOTE(review): the first assert compares against the header's total
        # run count; it could fail when max_events cuts off events belonging
        # to later runs — confirm intended usage.
        assert self.n_runs == len(self.event_header["RUN"].unique())
        assert self.n_events_used == len(self.event_header)
        assert self.n_events_used == len(self.df["EVENT"].unique())


    def __repr__(self):
        return "\n".join([
            f"{__class__.__name__} with {self.n_events_used}/{self.n_events} "
            f"events from file {self.file_name}"
        ])


    def _make_event(self, str_event):
        """Parse one event chunk into a DataFrame of its collection table.

        Also appends this event's EVENT/RUN/DETECTOR values to
        self.event_header (side effect).
        """
        event_lines = str_event.split("\n")
        # The chunk starts with: event number, "RUN: ...", "DETECTOR: ...".
        i_event = int(event_lines.pop(0))
        self.event_header["EVENT"].append(i_event)
        self.event_header["RUN"].append(int(remove_prefix(event_lines.pop(0), "RUN: ")))
        self.event_header["DETECTOR"].append(remove_prefix(event_lines.pop(0),"DETECTOR: "))
        # Drop the decorative lines around the collection table; the popped
        # line's content is noted behind each call.
        event_lines.pop(0) # COLLECTIONS: (see below)
        event_lines.pop(0) # ///////////////////////////////////
        event_lines.pop(0) #
        event_lines.pop(0) # -----------------------------------
        event_lines.pop(1) # ===================================
        # Remove trailing blank lines, then the closing rule under the table.
        while event_lines[-1].strip(" ") == "":
            event_lines.pop()
        event_lines.pop() # -----------------------------------
        # Collapse the whitespace-aligned table into tab-separated text so
        # pandas can read it as a CSV.
        table_str = "\n".join(event_lines).split(" ")
        table_str = filter(None, table_str)
        table_str = map(lambda x: x.strip(" "), table_str)
        table_str = "\t".join(table_str)
        table_str = table_str.replace("\t\n", "\n")
        col_df = pd.read_csv(io.StringIO(table_str), sep="\t")
        col_df["EVENT"] = i_event
        # Progress report every 1000 events.
        # NOTE(review): print() appends a newline after the "\r", so this does
        # not overwrite the line in place; end="\r" was probably intended.
        if not i_event % 1000: print(f"{i_event:>6}/{self.n_events_used} events\r")
        return col_df


    def _process_header(self, str_header):
        """Parse the header block; sets file_name, n_runs and n_events.

        Returns a DataFrame with one row per run section.
        """
        run_infos = str_header.split(self._run_tag)
        header = run_infos.pop(0)
        assert header.startswith(self._header_tag)
        files = header[len(self._header_tag):].split("\n")
        assert files.pop() == ""
        # Only single-file anajob output is supported.
        assert len(files) == 1
        for str_file in files:
            # Line format: "<name> [ number of runs: R, number of events: E ]"
            self.file_name, str_file = str_file.strip().split(maxsplit=1)
            str_file = str_file.strip()
            assert str_file.startswith("[ ") and str_file.endswith(" ]")
            self.n_runs, self.n_events = str_file[2:-2].split(", ")
            self.n_runs = int(remove_prefix(self.n_runs, "number of runs: "))
            self.n_events = int(remove_prefix(self.n_events, "number of events: "))
        # Strip the "will reopen ..." trailer from the last run section.
        run_infos[-1] = remove_suffix(run_infos[-1],
            self._header_end_tag + " " + self.file_name + "\n"
        )
        run_series = {}
        for i, run_info in enumerate(run_infos):
            run_series[i] = self._make_run(run_info)
        return pd.concat(run_series, axis=1).transpose()

    def _make_run(self, str_run):
        """Parse one run section into a Series of its parameters."""
        run_dict = {}
        lines = str_run.split("\n")
        # The last line is a dashed rule; it must reduce to "" once dashes
        # and surrounding whitespace are stripped.
        assert lines.pop().strip().strip("-") == ""
        # First line: "<run number> - <detector tag>".
        i_run, detector_tag = lines.pop(0).split(" - ")
        run_dict["RUN"] = int(i_run)
        run_dict["DETECTOR"] = detector_tag.rstrip(": ")
        for line in lines:
            # Remaining lines look like " parameter <key> [string]: <value>, ".
            key, val = line.split(": ", maxsplit=1)
            key = remove_prefix(key, " parameter ")
            key = remove_suffix(key, " [string]")
            run_dict[key] = val.rstrip(", ")
        return pd.Series(run_dict)


# Parse the whole file (max_events=-1 -> all events).
aj = Anajob(raw_txt, max_events=-1)
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Run information" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Run-level header information parsed from the anajob output.
run_df = aj.run_df
run_df
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Print the run parameters that are identical for all runs
# (values taken from the first row, index 0).
for k, v in run_df.transpose()[run_df.nunique().values == 1][0].items():
    print(k, v)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"run_df.transpose()[run_df.nunique().values != 1].transpose()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# List the per-run output file names from the varying parameters.
for v in run_df.transpose()[run_df.nunique().values != 1].transpose().outputFile.values:
    print(v)
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Event information" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"aj.df.groupby(\"COLLECTION NAME\")[\"NUMBER OF ELEMENTS\"].mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"aj.df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Cardinality of each event-header column, then the full per-event table.
print(aj.event_header.nunique())
aj.event_header
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def plot_entries_per_collection(df):
    """Overview histograms of the number of elements per collection.

    Main axes: per-collection pdf of element counts in [0, 50].
    Upper inset: reversed cumulative distribution for small counts.
    Lower inset: log-scale pdf over the full count range (bin width 20).

    Args:
        df: DataFrame with at least the columns "COLLECTION NAME" and
            "NUMBER OF ELEMENTS" (one row per event/collection).

    Returns:
        The matplotlib Figure.
    """
    per_collection = df.groupby("COLLECTION NAME")["NUMBER OF ELEMENTS"]
    bins1 = np.arange(-0.5, 51, 1)
    bins2 = np.arange(-0.5, 16)
    bins3 = np.arange(-0.5, per_collection.max().max() + 0.5, 20)
    fig, ax = plt.subplots(figsize=(12, 9))
    axins1 = ax.inset_axes([0.30, 0.60, 0.69, 0.39], transform=ax.transAxes)
    axins2 = ax.inset_axes([0.30, 0.17, 0.69, 0.39], transform=ax.transAxes)
    kw = dict(histtype="step", linewidth=2, density=True)
    # Collections sorted by total entry count so the most populous ones get
    # the first colors and top legend entries.
    for i, collection in enumerate(per_collection.sum().sort_values(ascending=False).index):
        x = df[df["COLLECTION NAME"] == collection]["NUMBER OF ELEMENTS"]
        # Cycle through the 10 tableau colors; change linestyle every 10
        # collections. NOTE(review): more than 40 collections would exhaust
        # the linestyle list and raise an IndexError.
        kw["color"] = list(mcolors.TABLEAU_COLORS)[i % 10]
        kw["linestyle"] = ["-", ":", "--", "-."][i // 10]
        x.hist(bins=bins1, ax=ax, label=f"{collection} ({x.mean():.1f})", **kw)
        x.hist(bins=bins2, ax=axins1, cumulative=-1, **kw)
        x.hist(bins=bins3, ax=axins2, **kw)
    axins1.patch.set_alpha(0.8)
    axins2.patch.set_alpha(0.8)
    ax.set_ylabel("pdf")
    axins1.set_ylabel("cdf")
    axins2.set_ylabel("pdf")
    axins2.set_yscale("log")
    ax.legend(title="COLLECTION NAME", bbox_to_anchor=(1.0, 1.0))
    fig.tight_layout()
    return fig

fig = plot_entries_per_collection(aj.df)
# Bug fix: the keyword was misspelled "facefolor", which savefig rejects at
# run time; "facecolor" is the intended background color of the saved figure.
fig.savefig("collection_entries.png", dpi=300, facecolor="white")
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"interpreter": { | |
"hash": "da01a39d7be8ddd16a36886b47c621589cc5adc6987b4295cc334135f1184bcd" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3.8.3 ('bdt')", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.3" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Author
kunathj
commented
Feb 7, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment