Skip to content

Instantly share code, notes, and snippets.

@ms8r
Last active August 2, 2018 02:40
Show Gist options
  • Save ms8r/964b4478e0e882671976b407fff5ce27 to your computer and use it in GitHub Desktop.
Save ms8r/964b4478e0e882671976b407fff5ce27 to your computer and use it in GitHub Desktop.
Script to extract sales data from Audible monthly report XLS files
archive
*.xls
*.ods
*.tsv
*~
*.pyc
*.ipynb
.ipynb_checkpoints
#!/usr/bin/env python
"""
Data extractor for Audible monthly sales report files.
Requires name of XLS file with report data as input, writes tab separated data
(incl. report year and month as additional columns) without subtotals, empty
lines, etc. to stdout.
"""
import sys
from datetime import datetime as dt
import argparse
import logging
import pandas as pd
# column headers; columns with empty strings will be ommitted from import
COLS = [
'',
'ID',
'Title',
'Author',
'Market',
'Royalty_Share',
'ALC_Qty',
'ALC_Sales',
'ALC_Royalty',
'AL_Qty',
'AL_Sales',
'AL_Royalty',
'ALOP_Qty',
'ALOP_Sales',
'ALOP_Royalty',
]
# columns that will later be moved to index:
index_cols = COLS[1:6]
# row and col index for reporting month in xls
REP_MONTH_CELL = (0, 9)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('xls', help="""input xls file""")
parser.add_argument('--header', action='store_true', help="""if specified
header row with column labels will be printed""")
args = parser.parse_args()
usecols = [i for i, c in enumerate(COLS) if c != '']
names = [c for c in COLS if c != '']
# extract reporting period
row, col = REP_MONTH_CELL
rep_month = pd.read_excel(args.xls, 'Sales Details',
header=None).iloc[row, col]
logging.info('found reporting period "%s"', rep_month)
rep_date = dt.strptime(rep_month, '%b-%Y')
# read data only
df = pd.read_excel(args.xls, 'Sales Details', header=None,
skiprows=4, skipfooter=3,
usecols=usecols, names=names)
logging.info('read %d lines from %s', df.shape[0], args.xls)
# get rid of (sub-)totals and empty lines
df = df[~df['ID'].str.contains('Total', na=True)]
logging.info('retained %d data records', df.shape[0])
# prepend reporting year and month
df['Year'] = rep_date.year
df['Month'] = rep_date.month
df.set_index(['Year', 'Month'] + index_cols, drop=True, inplace=True,
verify_integrity=True)
df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in
df.columns])
header_printed = False
for category in df.columns.levels[0]:
out = df[category]
# next assignment raises false positive on pandas view vs copy warning
pd.options.mode.chained_assignment = None
out['Category'] = category
pd.options.mode.chained_assignment = 'warn'
out.set_index('Category', append=True, inplace=True)
out = out.reorder_levels(['Year', 'Month', 'Category'] + index_cols)
if args.header and not header_printed:
header = True
header_printed = True
else:
header = False
out.to_csv(sys.stdout, sep='\t', header=header)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment