Last active
July 27, 2020 20:01
-
-
Save lelandbatey/58330f13a02e7b5a0af179d5dee9262b to your computer and use it in GitHub Desktop.
A tool for viewing streams of timestamps as histograms. Uses Matplotlib and Pandas.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
A tool for viewing streams of timestamps as histograms. | |
''' | |
import matplotlib.pyplot as plt | |
from datetime import datetime | |
import matplotlib.dates as mdates | |
import pandas as pd | |
import itertools | |
import argparse | |
import sys | |
# Taken from here: https://stackoverflow.com/a/53995225 | |
def line_format(label):
    """Build the tick text for a timestamp-like label.

    Returns the first three characters of ``label.month_name()``. When that
    abbreviation is ``Jan`` the label's year is appended on a second line.
    """
    abbreviation = label.month_name()[:3]
    if abbreviation != 'Jan':
        return abbreviation
    return f'{abbreviation}\n{label.year}'
def dataframify_datetimes(dt_sequences):
    """Align several timestamp sequences onto one shared, sorted index.

    Args:
        dt_sequences: Iterable of iterables of hashable, mutually orderable
            values (datetimes in this tool). One-shot iterators are accepted
            at both levels.

    Returns:
        A ``(all_ts, count_lists)`` tuple. ``all_ts`` is the sorted list of
        distinct values found across every sequence. ``count_lists`` holds one
        list per input sequence, parallel to ``all_ts``, giving how many times
        that sequence contains each value (0 when the value is absent).
    """
    from collections import Counter

    # The input is walked twice (once to build the shared index, once to
    # count), so materialise it first; otherwise generators would already be
    # exhausted by the time the counting pass runs.
    sequences = [list(seq) for seq in dt_sequences]
    all_ts = sorted(set(itertools.chain.from_iterable(sequences)))

    count_lists = []
    for seq in sequences:
        # Counter returns 0 for values this sequence never contained, and
        # iterating all_ts keeps every column in index order.
        counts = Counter(seq)
        count_lists.append([counts[ts] for ts in all_ts])
    return all_ts, count_lists
def read_ts_file(f, parsefunc):
    """Parse every non-blank line of an open text stream.

    The whole stream is read at once and split on line-feed characters. Lines
    that are empty or whitespace-only are skipped; each remaining line is
    handed to ``parsefunc`` unmodified. Returns the parsed values as a list,
    in input order.
    """
    parsed = []
    for line in f.read().split('\n'):
        if not line.strip():
            continue
        parsed.append(parsefunc(line))
    return parsed
def make_fallback_funcs(funcs):
    """Combine several parsers into one that returns the first success.

    Args:
        funcs: Iterable of one-argument callables, tried in the given order.

    Returns:
        A function ``try_each(ts)`` that calls each parser with ``ts`` and
        returns the first result obtained without an exception.

    Raises:
        ValueError: (from ``try_each``) when every parser raised. The error
            raised by the last parser is attached as ``__cause__`` so the
            underlying reason is not lost.
    """
    # Snapshot so a one-shot iterable of parsers is not exhausted by the
    # first call to try_each.
    funcs = list(funcs)

    def try_each(ts):
        last_error = None
        for f in funcs:
            try:
                return f(ts)
            except Exception as err:
                # Deliberately broad: any failure means "this parser does not
                # apply", so move on to the next one.
                last_error = err
        raise ValueError(f"No parse functions could successfully parse value {ts}") from last_error
    return try_each
def main():
    """Command-line entry point: parse timestamps, bin them and plot counts.

    Reads one timestamp per line from each file named on the command line, or
    from stdin when no file is given. Occurrences are counted per distinct
    timestamp for each input, summed into ``--bin-size`` buckets, and drawn as
    a line plot that is either shown interactively or saved to ``--image``.
    """
    parser = argparse.ArgumentParser(description="View streams of epoch millisecond timestamps as histograms")
    parser.add_argument(
        'files',
        metavar="FILES",
        help="files with epoch millisecond timestamps. If omitted, reads from stdin.",
        # "*" yields a list of zero or more paths. The previous "?" produced a
        # single string, so the loop over files iterated over its characters.
        nargs="*",
        default=None
    )
    parser.add_argument(
        '--bin-size',
        help=
        "How to divide the data for viewing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases",
        default="1D"
    )
    parser.add_argument(
        '--image', help="Saves the generated graph as an image instead of showing an interactive window.", default=None
    )
    parser.add_argument(
        '--format',
        help="The datetime format to use when parsing each line of "
        "the input file. May be either a strptime() compatible format such as '%%Y-%%m-%%d %%H:%%M:%%S.%%f' or the "
        "string EPOCHMS which causes each line to be parsed as an integer representing the "
        "number of milliseconds since the UNIX epoch at UTC. Default is EPOCHMS.",
        action='append',
        # With action='append', argparse appends to the default list, so a
        # non-empty default always stays in front of the user's values. The
        # EPOCHMS fallback is therefore added explicitly after parsing.
        default=None
    )
    args = parser.parse_args()

    def make_parsets(fmt):
        """Return a one-argument parser for the given format name/string."""
        if fmt == "EPOCHMS":
            def parse(x):
                # fromtimestamp() without tz gives a naive local-time datetime.
                return datetime.fromtimestamp(int(x.strip()) / 1000)
        else:
            def parse(x):
                return datetime.strptime(x.strip(), fmt)
        return parse

    # User-supplied formats are tried first, in the order given. EPOCHMS stays
    # available as the last fallback so input that parsed before still parses,
    # but an all-digit strptime format (e.g. %Y%m%d) is no longer swallowed by
    # the integer parser.
    formats = list(args.format or [])
    if "EPOCHMS" not in formats:
        formats.append("EPOCHMS")
    parsefunc = make_fallback_funcs([make_parsets(fmt) for fmt in formats])

    if args.files:
        datetime_sequences = []
        for fn in args.files:
            with open(fn) as f:
                datetime_sequences.append(read_ts_file(f, parsefunc))
    else:
        datetime_sequences = [read_ts_file(sys.stdin, parsefunc)]

    index, columns = dataframify_datetimes(datetime_sequences)
    if not index:
        # An empty frame has no datetime index and cannot be resampled.
        sys.exit("No timestamps found in input.")

    # zip(*columns) transposes per-input count lists into one row per timestamp.
    df = pd.DataFrame(zip(*columns), index=index)
    df = df.resample(args.bin_size).sum()
    ax = df.plot(kind='line')
    plt.tight_layout()
    if args.image:
        fig = ax.get_figure()
        fig.savefig(args.image, dpi=2000)
    else:
        plt.show()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment