Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
dist_extract.py
#!/usr/bin/env python3
import os
import glob
import argparse
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from btrdb.utils.timez import *
DATA = "data/*.csv.gz"


def load_data(path=DATA, limit=None, deltas=False):
    """
    Lazily load the CSV files matching a glob pattern as pandas DataFrames.

    Parameters
    ----------
    path : str
        Glob pattern selecting the gzip-compressed CSV files to read.
    limit : int or None
        Maximum number of files to read; ``None`` reads every match.
    deltas : bool
        If True, yield ``df.diff()`` (row-to-row differences) instead of
        the frame as it was read.

    Yields
    ------
    tuple of (str, pandas.DataFrame)
        The file's base name with a trailing ".csv.gz" removed, and the
        loaded (or differenced) frame.
    """
    suffix = ".csv.gz"

    # glob returns matches in arbitrary order; sort them so that ``limit``
    # selects the same files on every run.
    for idx, fpath in enumerate(sorted(glob.glob(path))):
        if limit is not None and idx >= limit:
            break

        # Remove the extension as a whole suffix. str.rstrip() would treat
        # ".csv.gz" as a set of characters and also eat trailing letters of
        # the name itself (e.g. "devices.csv.gz" -> "device").
        name = os.path.basename(fpath)
        if name.endswith(suffix):
            name = name[:-len(suffix)]

        df = pd.read_csv(fpath, compression="gzip")
        yield name, (df.diff() if deltas else df)
def counts(args):
    """
    Write the per-column value counts of every dataset to a CSV file.

    For each dataset yielded by ``load_data`` the result of ``df.count()``
    is paired with the dataset name, giving one row per column with the
    fields "count" and "device". All rows are written to ``args.outpath``
    without a header row.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``limit`` (max number of datasets, or None) and
        ``outpath`` (destination file).

    Raises
    ------
    ValueError
        If no dataset was loaded, so there is nothing to write.
    """
    frames = []
    for fname, df in load_data(limit=args.limit):
        count = df.count()
        names = pd.Series([fname] * len(count), index=count.index)
        count = pd.concat([count, names], axis=1)
        count.columns = ["count", "device"]
        frames.append(count)

    # Fail with a clear message instead of an AttributeError on None.
    if not frames:
        raise ValueError("no datasets were loaded, nothing to count")

    # Concatenate once at the end; re-concatenating the accumulated frame
    # inside the loop copies it on every iteration.
    pd.concat(frames, axis=0).to_csv(args.outpath, header=False)
def dist(args):
    """
    Write summary statistics of the row-to-row deltas of every dataset.

    Each dataset is loaded with ``deltas=True``; its ``describe()`` table
    is transposed so that every row describes one column, and the rows of
    all datasets are stacked and written to ``args.outpath`` with a header.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``limit`` (max number of datasets, or None) and
        ``outpath`` (destination file).

    Raises
    ------
    ValueError
        If no dataset was loaded, so there is nothing to write.
    """
    # The dataset name is not part of the output, hence the throwaway name.
    summaries = [
        df.describe().transpose()
        for _, df in load_data(limit=args.limit, deltas=True)
    ]

    # Fail with a clear message instead of an AttributeError on None.
    if not summaries:
        raise ValueError("no datasets were loaded, nothing to describe")

    # Concatenate once rather than growing a frame inside the loop.
    pd.concat(summaries, axis=0).to_csv(args.outpath, header=True)
def deltas(args):
    """
    Write the row-to-row deltas of every column of every dataset to a file.

    Each dataset is loaded with ``deltas=True``. Every column is paired
    with a column holding the dataset name, and the resulting two-column
    frames are stacked vertically and written to ``args.outpath`` with a
    header.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``limit`` (max number of datasets, or None) and
        ``outpath`` (destination file).

    Raises
    ------
    ValueError
        If no dataset (or no column) was loaded, so there is nothing to
        write.
    """
    pieces = []
    for fname, df in load_data(limit=args.limit, deltas=True):
        for col in df.columns:
            s = df[col]
            # The name series is left unnamed, as before, so pandas picks
            # the label of the second column.
            names = pd.Series([fname] * len(s), index=s.index)
            pieces.append(pd.concat([s, names], axis=1))

    # Fail with a clear message instead of an AttributeError on None.
    if not pieces:
        raise ValueError("no datasets were loaded, nothing to write")

    # Concatenate once at the end; re-concatenating the accumulated frame
    # for every column of every file copies all previous rows each time.
    pd.concat(pieces, axis=0).to_csv(args.outpath, header=True)
def main(args):
    """
    Fallback handler that always raises to ask for a sub command.

    It is installed as the parser's default ``func``, so it is what runs
    when no sub command has replaced that default. ``args`` is unused.
    """
    message = "please specify a command"
    raise Exception(message)
if __name__ == "__main__":
    # Global arguments: registered on the top-level parser rather than on
    # the individual sub parsers.
    global_args = {
        ("-l", "--limit"): {
            "type": int, "metavar": "N", "default": None,
            "help": "limit the number of datasets read"
        },
    }

    # Sub commands: the "func" entry is the handler for the command, every
    # other entry maps argparse flags to their add_argument() keywords.
    cmds = {
        "count": {
            "func": counts,
            ("-o", "--outpath"): {
                "type": str, "metavar": "FILE", "default": "counts.csv.gz",
                "help": "location to write the counts output file",
            },
        },
        "dist": {
            "func": dist,
            ("-o", "--outpath"): {
                "type": str, "metavar": "FILE", "default": "delta_dist.csv.gz",
                "help": "location to write the delta distribution output file",
            },
        },
        "deltas": {
            "func": deltas,
            ("-o", "--outpath"): {
                "type": str, "metavar": "FILE", "default": "deltas.csv.gz",
                "help": "location to write the deltas output file",
            },
        },
    }

    parser = argparse.ArgumentParser(description="wrangle timestamps for jitter analysis")
    # Without a sub command the default handler reports the missing command.
    parser.set_defaults(func=main)
    subparsers = parser.add_subparsers(help="wrangling commands")

    for pargs, kwargs in global_args.items():
        if isinstance(pargs, str):
            pargs = (pargs,)
        parser.add_argument(*pargs, **kwargs)

    for cmd, cargs in cmds.items():
        sp = subparsers.add_parser(cmd)
        sp.set_defaults(func=cargs["func"])
        for pargs, kwargs in cargs.items():
            # "func" is the handler, not an argument spec; skip it instead
            # of popping it so the cmds table is left unmodified.
            if pargs == "func":
                continue
            if isinstance(pargs, str):
                pargs = (pargs,)
            sp.add_argument(*pargs, **kwargs)

    args = parser.parse_args()

    # Top-level boundary: report any failure as a usage error and exit.
    try:
        args.func(args)
    except Exception as e:
        parser.error(str(e))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment