Last active
August 2, 2018 02:40
-
-
Save ms8r/964b4478e0e882671976b407fff5ce27 to your computer and use it in GitHub Desktop.
Script to extract sales data from Audible monthly report XLS files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
archive | |
*.xls | |
*.ods | |
*.tsv | |
*~ | |
*.pyc | |
*.ipynb | |
.ipynb_checkpoints |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Data extractor for Audible monthly sales report files. | |
Requires name of XLS file with report data as input, writes tab separated data | |
(incl. report year and month as additional columns) without subtotals, empty | |
lines, etc. to stdout. | |
""" | |
import sys | |
from datetime import datetime as dt | |
import argparse | |
import logging | |
import pandas as pd | |
# column headers; columns with empty strings will be ommitted from import | |
COLS = [ | |
'', | |
'ID', | |
'Title', | |
'Author', | |
'Market', | |
'Royalty_Share', | |
'ALC_Qty', | |
'ALC_Sales', | |
'ALC_Royalty', | |
'AL_Qty', | |
'AL_Sales', | |
'AL_Royalty', | |
'ALOP_Qty', | |
'ALOP_Sales', | |
'ALOP_Royalty', | |
] | |
# columns that will later be moved to index: | |
index_cols = COLS[1:6] | |
# row and col index for reporting month in xls | |
REP_MONTH_CELL = (0, 9) | |
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', | |
level=logging.INFO) | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(description=__doc__) | |
parser.add_argument('xls', help="""input xls file""") | |
parser.add_argument('--header', action='store_true', help="""if specified | |
header row with column labels will be printed""") | |
args = parser.parse_args() | |
usecols = [i for i, c in enumerate(COLS) if c != ''] | |
names = [c for c in COLS if c != ''] | |
# extract reporting period | |
row, col = REP_MONTH_CELL | |
rep_month = pd.read_excel(args.xls, 'Sales Details', | |
header=None).iloc[row, col] | |
logging.info('found reporting period "%s"', rep_month) | |
rep_date = dt.strptime(rep_month, '%b-%Y') | |
# read data only | |
df = pd.read_excel(args.xls, 'Sales Details', header=None, | |
skiprows=4, skipfooter=3, | |
usecols=usecols, names=names) | |
logging.info('read %d lines from %s', df.shape[0], args.xls) | |
# get rid of (sub-)totals and empty lines | |
df = df[~df['ID'].str.contains('Total', na=True)] | |
logging.info('retained %d data records', df.shape[0]) | |
# prepend reporting year and month | |
df['Year'] = rep_date.year | |
df['Month'] = rep_date.month | |
df.set_index(['Year', 'Month'] + index_cols, drop=True, inplace=True, | |
verify_integrity=True) | |
df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in | |
df.columns]) | |
header_printed = False | |
for category in df.columns.levels[0]: | |
out = df[category] | |
# next assignment raises false positive on pandas view vs copy warning | |
pd.options.mode.chained_assignment = None | |
out['Category'] = category | |
pd.options.mode.chained_assignment = 'warn' | |
out.set_index('Category', append=True, inplace=True) | |
out = out.reorder_levels(['Year', 'Month', 'Category'] + index_cols) | |
if args.header and not header_printed: | |
header = True | |
header_printed = True | |
else: | |
header = False | |
out.to_csv(sys.stdout, sep='\t', header=header) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment