Created
November 7, 2018 20:37
-
-
Save jambonrose/353925b1d1db86afcd273fb73b71ee5e to your computer and use it in GitHub Desktop.
Extract trip costs into CSV from IHG bill (partial HTML)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Parse IHG bill into CSVs for food and hotel

python -V
3.7.1
pip install beautifulsoup4 lxml
pip install isort black flake8 flake8-docstrings pep8-naming flake8-bugbear
isort --builtin dataclasses extract.py
black -l 79 extract.py
flake8 --ignore=D202,D400,E266 extract.py
"""
from csv import DictWriter | |
from dataclasses import InitVar, asdict, dataclass, field, fields | |
from datetime import date | |
from itertools import groupby | |
from operator import attrgetter | |
from os.path import commonprefix | |
from bs4 import BeautifulSoup | |
### Constants ########################################################## | |
# Bill line descriptions that count as hotel (room) charges. Costs are
# matched against this set by exact description; everything else is
# handled as a food cost.
HOTEL_DESCRIPTIONS = {
    "* Accommodation",
    "Occupancy Tax",
    "Business District Assessment",
    "California Tourism Assessment",
}
### Utilities ########################################################## | |
def parse_date(string):
    """Convert a slash-separated month/day/year string into a date"""
    numbers = [int(piece) for piece in string.split("/")]
    # Exactly three components are expected: month, day, year.
    month, day, year = numbers
    return date(year, month, day)
def slugify(string):
    """Map a hotel Cost description onto a HotelTransaction field name"""
    slug = string.lower().replace(" ", "_")
    # The room charge itself ("* Accommodation") is stored under "value".
    return "value" if slug == "*_accommodation" else slug
def prettify_fieldname(string):
    """Turn a snake_case field name into a title-cased CSV header"""
    words = string.split("_")
    return " ".join(words).title()
def describe(cost):
    """Return field name from Cost for FoodTransaction

    The lowercased description is searched for keywords in a fixed
    priority order, so a line such as "Restaurant Food Tax" is always
    classified as "tax" instead of depending on set iteration order.

    Args:
        cost: object with a ``description`` string attribute.

    Returns:
        One of "tax", "gratuity", "food" or "alcohol".

    Raises:
        ValueError: if the description contains none of the keywords.
    """
    description = cost.description.lower()
    # (keyword, category) pairs, checked top to bottom. Tax and gratuity
    # come first because those lines may also mention the food they
    # apply to.
    keyword_categories = (
        ("tax", "tax"),
        ("gratuity", "gratuity"),
        ("food", "food"),
        ("alcohol", "alcohol"),
        ("wine", "alcohol"),
        ("drink", "alcohol"),
        ("breakfast", "food"),
        ("dinner", "food"),
    )
    for keyword, category in keyword_categories:
        if keyword in description:
            return category
    raise ValueError(f'unknown category for "{cost.description}"')
### Dataclasses ######################################################## | |
@dataclass(frozen=True)
class Cost:
    """One line item taken from the bill"""

    # Filled in by __post_init__ from ``date_str``; not an __init__ argument.
    date: date = field(init=False)
    # Init-only raw date text; it is not stored as a field.
    date_str: InitVar[str]
    description: str
    value: str

    def __post_init__(self, date_str):
        """Derive the ``date`` field from the init-only date string"""
        parsed = parse_date(date_str)
        # The dataclass is frozen, so plain attribute assignment is
        # blocked; object.__setattr__ bypasses that.
        object.__setattr__(self, "date", parsed)
@dataclass
class HotelTransaction:
    """All hotel charges billed for one date"""

    date: date
    # Room charge (the "* Accommodation" bill line).
    value: str
    # The remaining fields mirror the slugified bill line descriptions.
    occupancy_tax: str
    business_district_assessment: str
    california_tourism_assessment: str
@dataclass
class FoodTransaction:
    """Full transaction for a single meal"""

    date: date
    food: str
    gratuity: str
    location: str
    tax: str
    # Meals without an alcohol line default to zero. The default is the
    # string "0" so it matches the ``str`` annotation and the other
    # amount fields; it is written to CSV as 0.
    alcohol: str = "0"
### Cost & Transaction Manipulators #################################### | |
def parse_ihg_bill_to_costs(string):
    """Build one Cost per line-item element found in the bill HTML"""
    soup = BeautifulSoup(string, "lxml")
    # The first three "col-xs-12" elements are skipped; presumably they
    # are headers rather than charges -- TODO confirm against a real bill.
    rows = soup.find_all(class_="col-xs-12")[3:]
    costs = []
    for row in rows:
        span_texts = [span.text for span in row.find_all("span")]
        # Passed positionally, i.e. as date_str, description, value.
        costs.append(Cost(*span_texts))
    return costs
def group_hotel_costs(costs):
    """Collect hotel line items into one HotelTransaction per date"""
    hotel_costs = [
        item for item in costs if item.description in HOTEL_DESCRIPTIONS
    ]
    transactions = []
    # groupby only merges adjacent items, so the hotel costs of one date
    # must be adjacent in the input to end up in the same transaction.
    for cost_date, nightly in groupby(hotel_costs, attrgetter("date")):
        charges = {slugify(item.description): item.value for item in nightly}
        transactions.append(HotelTransaction(**charges, date=cost_date))
    return transactions
def group_food_costs(costs):
    """Extract food costs and build list of transactions

    Costs whose description is not in HOTEL_DESCRIPTIONS are grouped by
    (date, first word of description); groupby only merges adjacent
    items, so lines of one group must be adjacent in ``costs``. Each cost
    in a group is classified with ``describe`` and its value stored under
    that category. A repeated "food" line closes the current transaction
    and opens the next one.

    Args:
        costs: iterable of objects with ``date``, ``description`` and
            ``value`` attributes (e.g. Cost).

    Returns:
        List of FoodTransaction in the order they were encountered.

    Raises:
        ValueError: if a category other than "food" is set twice within
            one transaction.
        TypeError: from FoodTransaction if a transaction lacks one of its
            required fields.
    """

    def group_food(cost):
        return (cost.date, cost.description.split(" ")[0])

    def create_name(names_list):
        # Location is whatever text the transaction's descriptions share.
        return f"{commonprefix(names_list).strip()} (Hotel)"

    food_costs = [
        cost for cost in costs if cost.description not in HOTEL_DESCRIPTIONS
    ]
    transactions = []
    for (cost_date, _), group in groupby(food_costs, group_food):
        food_described, kwargs, names = False, {"date": cost_date}, []
        for cost in group:
            category = describe(cost)
            if category == "food":
                if food_described:
                    # A second food line means a new meal: close out the
                    # current one using only its own descriptions, then
                    # start again from a clean state.
                    kwargs["location"] = create_name(names)
                    transactions.append(FoodTransaction(**kwargs))
                    kwargs, names = {"date": cost_date}, []
                food_described = True
            if kwargs.get(category):
                raise ValueError(f"Cat. {category} already set for {cost}")
            kwargs[category] = cost.value
            names.append(cost.description)
        # groupby never yields an empty group, so there is always one
        # open transaction left to emit here.
        kwargs["location"] = create_name(names)
        transactions.append(FoodTransaction(**kwargs))
    return transactions
### File Manipulators ################################################## | |
def output_csv(dataklass, data, filename):
    """Output CSV file for arbitrary dataclasses

    Args:
        dataklass: dataclass type whose fields define the CSV columns.
        data: iterable of ``dataklass`` instances, written one per row.
        filename: path of the CSV file; it is opened in "w" mode, so an
            existing file is overwritten.
    """
    fieldnames = [spec.name for spec in fields(dataklass)]
    # The csv module does its own line-ending handling and expects the
    # file to be opened with newline=""; without it, extra blank rows
    # appear on platforms that translate "\n" on write.
    with open(filename, mode="w", newline="") as csvfile:
        writer = DictWriter(csvfile, fieldnames=fieldnames)
        # The header row is written by hand (rather than writeheader())
        # so the columns get prettified titles instead of raw field names.
        writer.writerow(
            {name: prettify_fieldname(name) for name in fieldnames}
        )
        writer.writerows(asdict(row) for row in data)
def main():
    """Read the partial IHG bill HTML and write hotel.csv and food.csv"""
    with open("IHG_CA_Pearson_Partial_Bill.html") as bill_file:
        raw_html = bill_file.read()
    costs = parse_ihg_bill_to_costs(raw_html)
    # Both groupings are built before anything is written, so a grouping
    # error leaves no partial output behind.
    hotel_rows = group_hotel_costs(costs)
    food_rows = group_food_costs(costs)
    output_csv(HotelTransaction, hotel_rows, "hotel.csv")
    output_csv(FoodTransaction, food_rows, "food.csv")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment