Merge multiple usage CSV files to produce per-app MAU timelines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# usage: swusage.py [-h] [-k KPI] [-o FILE] [-v] [-c COMPANY] [-gt NUMBER] | |
# | |
# Create per-app CSV timeseries for a single KPI from Dashboard usage reports. | |
# | |
# optional arguments: | |
# -h, --help show this help message and exit | |
# -k KPI Name of column/kpi in usage report to be put in timeseries, this | |
# defaults to month_cmu | |
# -o FILE Name of output CSV file, this defaults to timeseries.csv | |
# -v Run in verbose mode | |
# -c COMPANY Only process entries for this company id | |
# -gt NUMBER Only keep entries that are greater than this number value, e.g. | |
# -k month_cmu -gt 0 will only keep calendar monthly active user | |
# entries if there is more than 0 MAUs | |
import pandas as pd | |
import numpy as np | |
import argparse as args | |
import re | |
# def bloop(arg): | |
# print("app is " + str(int(arg['id']))) | |
# arg = arg.iloc[1:] | |
# meep = arg.apply(np.mean) | |
# print("meep is (" + str(meep) + ")") | |
# ---------------------------------------------------------------------------
# Command-line interface.
#
# NOTE: the module was imported as ``import argparse as args``; the name
# ``args`` is then deliberately rebound to the parsed namespace below, which
# is what the rest of the script reads.
p = args.ArgumentParser(description='Create per-app CSV timeseries for a single KPI from Dashboard usage reports.')
p.add_argument('-k', metavar="KPI", type=str, help="Name of column/kpi in usage report to be put in timeseries, this defaults to month_cmu", default="month_cmu")
p.add_argument('-o', metavar="FILE", type=str, help="Name of output CSV file, this defaults to timeseries.csv", default="timeseries.csv")
p.add_argument('-v', action='store_true', help="Run in verbose mode")
p.add_argument('-c', metavar="COMPANY", type=int, help="Only process entries for this company id")
p.add_argument('-s', metavar="STATE", type=str, help="Only process entries for apps of this state (Live, Dead, Retired, Sandbox)", default='Live')
p.add_argument('-gt', metavar="NUMBER", type=int, help="Only keep entries that are greater than this number value, e.g. -k month_cmu -gt 0 will only keep calendar monthly active user entries if there is more than 0 MAUs", default=0)

# Anything not recognised as an option is treated as an input filename.
args, filenames = p.parse_known_args()

# 0 is a valid threshold (the default); only negative values are rejected,
# so say "negative" rather than the misleading "must be positive".
if args.gt < 0:
    print("-gt argument must not be a negative value")
    quit()
# ---------------------------------------------------------------------------
# Read each monthly usage report and accumulate one KPI column per report,
# joined on the app id.
fulldf = pd.DataFrame()
column_names = []      # YYYY-MM column names, one per report successfully read
csvs_processed = []    # files already read, so duplicates on the command line are skipped

for csv in filenames:
    # The report's date is taken from the YYYY-MM fragment of the filename
    # and becomes the KPI column's name in the output sheet.
    colname = re.search(r'(\d+-\d+)', csv)
    if colname is None:
        print("Ignoring " + csv + " - it does not have YYYY-MM date form in the name")
        continue
    date_col = colname.group(1)
    if csv in csvs_processed:
        if args.v:
            print(csv + " has already been processed, skipping")
        continue
    if args.v:
        print("Reading csv " + csv)
    try:
        fdf = pd.read_csv(csv, usecols=['id', args.k, 'company_id', 'name', 'company_name', 'type', 'tier'])
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: a required column
        # (including the requested KPI) is absent from the report.
        print("Cannot open " + csv)
        continue
    csvs_processed.append(csv)
    # Record the column name only after the file has actually been read, so a
    # skipped or unreadable file does not leave a phantom (or duplicate)
    # column behind for the final reindex.
    column_names.append(date_col)
    if args.v:
        print("Renaming " + args.k + " column to " + date_col)
    fdf.rename(columns={args.k: date_col}, inplace=True)
    if args.v:
        print("Filtering out apps that are not in state " + args.s)
    # Keep only apps in the requested state, dropping internal/churned tiers
    # and company 450 (presumably an internal account -- TODO confirm).
    current_df = fdf[(fdf['type'] == args.s) & (fdf['tier'] != 'Swrve') &
                     (fdf['tier'] != 'Swrve Admin') & (fdf['company_id'] != 450) &
                     (fdf['tier'] != 'Churned')
                     ]
    if args.c:
        if args.v:
            print("Filtering on company " + str(args.c))
        # Build the mask from current_df (not fdf) so the boolean index is
        # aligned with the frame being filtered.
        current_df = current_df[current_df['company_id'] == args.c]
        if current_df.empty:
            print("Company id " + str(args.c) + " is not present in " + csv)
            quit()
    if fulldf.empty:
        fulldf = current_df
    else:
        # Later reports contribute only their id + date column; join on app id.
        fulldf = fulldf.join(current_df[['id', date_col]].set_index('id'), on='id')
# ---------------------------------------------------------------------------
# Emit the merged timeseries: NaNs become 0 for cleanliness, and the date
# columns follow the fixed identity columns in ascending (alphabetical ==
# chronological for YYYY-MM) order.
#
# NOTE(review): the -gt threshold is parsed and validated above but is never
# applied here -- the row-dropping step for low-value apps is still to be
# written.
if fulldf.empty:
    print("No output produced -- did you add filenames to the command?")
else:
    fulldf = fulldf.fillna(0)
    column_names.sort()
    identity_columns = ['id', 'name', 'company_id', 'company_name', 'tier', 'type']
    fulldf = fulldf.reindex(columns=identity_columns + column_names)
    if args.v:
        print("Writing output to " + args.o)
    fulldf.to_csv(args.o, encoding='utf-8', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Got the joins working at last — what a torment. The next step is to exclude all organizations whose mean DAU is less than N; this will require computing a spare column in the final version of the sheet using np.mean, filtering rows on that value, and then removing the column for clean presentation.