Merge multiple usage CSV files to produce per-app MAU timelines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# usage: swusage.py [-h] [-k KPI] [-o FILE] [-v] [-c COMPANY] [-gt NUMBER] | |
# | |
# Create per-app CSV timeseries for a single KPI from Dashboard usage reports. | |
# | |
# optional arguments: | |
# -h, --help show this help message and exit | |
# -k KPI Name of column/kpi in usage report to be put in timeseries, this | |
# defaults to month_cmu | |
# -o FILE Name of output CSV file, this defaults to timeseries.csv | |
# -v Run in verbose mode | |
# -c COMPANY Only process entries for this company id | |
# -gt NUMBER Only keep entries that are greater than this number value, e.g. | |
# -k month_cmu -gt 0 will only keep calendar monthly active user | |
# entries if there is more than 0 MAUs | |
import pandas as pd | |
import numpy as np | |
import argparse as args | |
import re | |
# def bloop(arg): | |
# print("app is " + str(int(arg['id']))) | |
# arg = arg.iloc[1:] | |
# meep = arg.apply(np.mean) | |
# print("meep is (" + str(meep) + ")") | |
# ---------------------------------------------------------------------------
# Command-line interface.
#
# NOTE: the module was imported as ``import argparse as args``; the name
# ``args`` is then deliberately rebound to the parsed namespace below, which
# is what the rest of the script reads.
p = args.ArgumentParser(description='Create per-app CSV timeseries for a single KPI from Dashboard usage reports.')
p.add_argument('-k', metavar="KPI", type=str, help="Name of column/kpi in usage report to be put in timeseries, this defaults to month_cmu", default="month_cmu")
p.add_argument('-o', metavar="FILE", type=str, help="Name of output CSV file, this defaults to timeseries.csv", default="timeseries.csv")
p.add_argument('-v', action='store_true', help="Run in verbose mode")
p.add_argument('-c', metavar="COMPANY", type=int, help="Only process entries for this company id")
p.add_argument('-s', metavar="STATE", type=str, help="Only process entries for apps of this state (Live, Dead, Retired, Sandbox)", default='Live')
p.add_argument('-gt', metavar="NUMBER", type=int, help="Only keep entries that are greater than this number value, e.g. -k month_cmu -gt 0 will only keep calendar monthly active user entries if there is more than 0 MAUs", default=0)

# Anything not recognised as an option is treated as an input filename.
args, filenames = p.parse_known_args()

# 0 is a valid threshold (the default); only negative values are rejected,
# so say "negative" rather than the misleading "must be positive".
if args.gt < 0:
    print("-gt argument must not be a negative value")
    quit()
# ---------------------------------------------------------------------------
# Read each monthly usage report and accumulate one KPI column per report,
# joined on the app id.
fulldf = pd.DataFrame()
column_names = []      # YYYY-MM column names, one per report successfully read
csvs_processed = []    # files already read, so duplicates on the command line are skipped

for csv in filenames:
    # The report's date is taken from the YYYY-MM fragment of the filename
    # and becomes the KPI column's name in the output sheet.
    colname = re.search(r'(\d+-\d+)', csv)
    if colname is None:
        print("Ignoring " + csv + " - it does not have YYYY-MM date form in the name")
        continue
    date_col = colname.group(1)
    if csv in csvs_processed:
        if args.v:
            print(csv + " has already been processed, skipping")
        continue
    if args.v:
        print("Reading csv " + csv)
    try:
        fdf = pd.read_csv(csv, usecols=['id', args.k, 'company_id', 'name', 'company_name', 'type', 'tier'])
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: a required column
        # (including the requested KPI) is absent from the report.
        print("Cannot open " + csv)
        continue
    csvs_processed.append(csv)
    # Record the column name only after the file has actually been read, so a
    # skipped or unreadable file does not leave a phantom (or duplicate)
    # column behind for the final reindex.
    column_names.append(date_col)
    if args.v:
        print("Renaming " + args.k + " column to " + date_col)
    fdf.rename(columns={args.k: date_col}, inplace=True)
    if args.v:
        print("Filtering out apps that are not in state " + args.s)
    # Keep only apps in the requested state, dropping internal/churned tiers
    # and company 450 (presumably an internal account -- TODO confirm).
    current_df = fdf[(fdf['type'] == args.s) & (fdf['tier'] != 'Swrve') &
                     (fdf['tier'] != 'Swrve Admin') & (fdf['company_id'] != 450) &
                     (fdf['tier'] != 'Churned')
                     ]
    if args.c:
        if args.v:
            print("Filtering on company " + str(args.c))
        # Build the mask from current_df (not fdf) so the boolean index is
        # aligned with the frame being filtered.
        current_df = current_df[current_df['company_id'] == args.c]
        if current_df.empty:
            print("Company id " + str(args.c) + " is not present in " + csv)
            quit()
    if fulldf.empty:
        fulldf = current_df
    else:
        # Later reports contribute only their id + date column; join on app id.
        fulldf = fulldf.join(current_df[['id', date_col]].set_index('id'), on='id')
# ---------------------------------------------------------------------------
# Emit the merged timeseries: NaNs become 0 for cleanliness, and the date
# columns follow the fixed identity columns in ascending (alphabetical ==
# chronological for YYYY-MM) order.
#
# NOTE(review): the -gt threshold is parsed and validated above but is never
# applied here -- the row-dropping step for low-value apps is still to be
# written.
if fulldf.empty:
    print("No output produced -- did you add filenames to the command?")
else:
    fulldf = fulldf.fillna(0)
    column_names.sort()
    identity_columns = ['id', 'name', 'company_id', 'company_name', 'tier', 'type']
    fulldf = fulldf.reindex(columns=identity_columns + column_names)
    if args.v:
        print("Writing output to " + args.o)
    fulldf.to_csv(args.o, encoding='utf-8', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Got the joins working at last — what a torment. The next step is to exclude all organizations whose mean DAU is less than N; this will require computing a spare column in the final version of the sheet using np.mean, filtering rows on that value, and then removing the column for clean presentation.