Skip to content

Instantly share code, notes, and snippets.

@jambonrose
Created November 7, 2018 20:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jambonrose/353925b1d1db86afcd273fb73b71ee5e to your computer and use it in GitHub Desktop.
Save jambonrose/353925b1d1db86afcd273fb73b71ee5e to your computer and use it in GitHub Desktop.
Extract trip costs into CSV from IHG bill (partial HTML)
"""Parse IHG bill into CSVs for food and hotel
python -V
3.7.1
pip install beautifulsoup4 lxml
pip install isort black flake8 flake8-docstrings pep8-naming flake8-bugbear
isort --builtin dataclasses extract.py
black -l 79 extract.py
flake8 --ignore=D202,D400,E266 extract.py
"""
from csv import DictWriter
from dataclasses import InitVar, asdict, dataclass, field, fields
from datetime import date
from itertools import groupby
from operator import attrgetter
from os.path import commonprefix
from bs4 import BeautifulSoup
### Constants ##########################################################
# Bill line descriptions that make up the room charge for a night. Costs
# whose description is in this set are grouped into HotelTransaction rows;
# every other cost is treated as a food cost.
HOTEL_DESCRIPTIONS = {
    "* Accommodation",
    "Occupancy Tax",
    "Business District Assessment",
    "California Tourism Assessment",
}
### Utilities ##########################################################
def parse_date(string):
    """Convert a ``month/day/year`` string from the IHG bill into a date.

    The three slash-separated parts are read as integers, so zero padding
    is optional. A string with a different number of parts, or with a
    non-numeric part, raises ValueError.
    """
    parts = [int(piece) for piece in string.split("/")]
    month, day, year = parts
    return date(year, month, day)
def slugify(string):
    """Map a hotel cost description onto a HotelTransaction field name.

    Spaces become underscores and the result is lower-cased. The room
    charge itself ("* Accommodation") is special-cased to ``"value"``.
    """
    slug = "_".join(string.split(" ")).lower()
    return "value" if slug == "*_accommodation" else slug
def prettify_fieldname(string):
    """Turn a snake_case attribute name into a Title Case CSV heading."""
    words = string.split("_")
    return " ".join(words).title()
def describe(cost):
    """Return the FoodTransaction field name that a Cost belongs to.

    The lower-cased description is searched for keywords in a fixed
    priority order, and the category of the first keyword found wins.
    Modifiers ("tax", "gratuity") are checked before the item keywords so
    that a line such as "Bar Food Tax" is reported as tax, not food.

    Args:
        cost: any object with a ``description`` string attribute.

    Returns:
        One of ``"tax"``, ``"gratuity"``, ``"food"`` or ``"alcohol"``.

    Raises:
        ValueError: if no known keyword occurs in the description.
    """
    description = cost.description.lower()
    # An ordered tuple rather than a set: set iteration order for strings
    # varies between interpreter runs (hash randomization), which would make
    # the result for descriptions matching several keywords nondeterministic.
    keyword_categories = (
        ("tax", "tax"),
        ("gratuity", "gratuity"),
        ("food", "food"),
        ("alcohol", "alcohol"),
        ("wine", "alcohol"),
        ("drink", "alcohol"),
        ("breakfast", "food"),
        ("dinner", "food"),
    )
    for keyword, category in keyword_categories:
        if keyword in description:
            return category
    raise ValueError(f'unknown category for "{cost.description}"')
### Dataclasses ########################################################
@dataclass(frozen=True)
class Cost:
    """One immutable line item from the bill.

    Instances are built as ``Cost(date_str, description, value)``. The
    ``date`` field is not an ``__init__`` argument; it is derived from the
    init-only ``date_str``.
    """

    date: date = field(init=False)
    date_str: InitVar[str]
    description: str
    value: str

    def __post_init__(self, date_str):
        """Fill in ``date`` by parsing the init-only ``date_str``."""
        parsed = parse_date(date_str)
        # The dataclass is frozen, so ordinary attribute assignment raises;
        # object.__setattr__ is used to set the field once during init.
        object.__setattr__(self, "date", parsed)
@dataclass
class HotelTransaction:
    """Full transaction for a single hotel night.

    Apart from ``date``, each field name is what ``slugify`` returns for one
    of the entries in ``HOTEL_DESCRIPTIONS``, which is how
    ``group_hotel_costs`` fills the fields in by keyword. All fields are
    required.
    """

    date: date
    # The "* Accommodation" charge; slugify maps that description to "value".
    value: str
    occupancy_tax: str
    business_district_assessment: str
    california_tourism_assessment: str
@dataclass
class FoodTransaction:
    """Full transaction for a single meal.

    ``date``, ``food``, ``gratuity``, ``location`` and ``tax`` are required.
    ``alcohol`` is optional because not every meal has an alcohol line.
    """

    date: date
    food: str
    gratuity: str
    location: str
    tax: str
    # A string default keeps the field consistent with its annotation and
    # with the other amount fields; the CSV writer renders it as "0".
    alcohol: str = "0"
### Cost & Transaction Manipulators ####################################
def parse_ihg_bill_to_costs(string):
    """Turn the raw HTML of an IHG bill into a list of Cost objects.

    The HTML is parsed with the lxml backend. Every element with class
    ``col-xs-12`` after the first three is treated as one bill line, and the
    text of its ``<span>`` children is passed positionally to ``Cost``.
    """
    soup = BeautifulSoup(string, "lxml")
    # NOTE(review): the first three matches are skipped, presumably because
    # they are page/header rows rather than charges; confirm against the
    # bill markup.
    bill_rows = soup.find_all(class_="col-xs-12")[3:]
    parsed_costs = []
    for row in bill_rows:
        span_texts = [span.text for span in row.find_all("span")]
        parsed_costs.append(Cost(*span_texts))
    return parsed_costs
def group_hotel_costs(costs):
    """Extract hotel costs and build one HotelTransaction per night.

    Only costs whose description is in ``HOTEL_DESCRIPTIONS`` are used.
    They are grouped by date, and each cost's value is stored under the
    field name that ``slugify`` derives from its description.

    Args:
        costs: iterable of Cost objects, in any order.

    Returns:
        List of HotelTransaction objects ordered by date.

    Raises:
        TypeError: from the HotelTransaction constructor when a night is
            missing one of the expected hotel cost lines.
    """
    by_date = attrgetter("date")
    # groupby only merges *adjacent* items with equal keys, so sort first.
    # sorted() is stable: already-ordered bills keep their original order.
    hotel_costs = sorted(
        (cost for cost in costs if cost.description in HOTEL_DESCRIPTIONS),
        key=by_date,
    )
    return [
        HotelTransaction(
            date=night,
            **{slugify(cost.description): cost.value for cost in night_costs},
        )
        for night, night_costs in groupby(hotel_costs, by_date)
    ]
def group_food_costs(costs):
    """Extract food costs and build a list of FoodTransaction objects.

    Costs whose description is not in ``HOTEL_DESCRIPTIONS`` are grouped
    into consecutive runs sharing the same date and the same first word of
    the description. Inside a run, each cost is categorised with
    ``describe`` and stored under that category. A second "food" line in
    the same run starts a new meal: the meal collected so far is emitted
    and a fresh one begins with that food line.

    Args:
        costs: iterable of Cost objects in bill order.

    Returns:
        List of FoodTransaction objects in the order the meals were found.

    Raises:
        ValueError: if a non-food category occurs twice within one meal.
        TypeError: from the FoodTransaction constructor when a meal lacks a
            required field (for example no tax or gratuity line).
    """

    def group_food(cost):
        """Key a cost by its date and the first word of its description."""
        return (cost.date, cost.description.split(" ")[0])

    def create_name(names_list):
        """Build a location label from the descriptions' common prefix."""
        return f"{commonprefix(names_list).strip()} (Hotel)"

    food_costs = [
        cost for cost in costs if cost.description not in HOTEL_DESCRIPTIONS
    ]
    transactions = []
    for (cost_date, _), group in groupby(food_costs, group_food):
        food_described, kwargs, names = False, {"date": cost_date}, []
        for cost in group:
            category = describe(cost)
            if category == "food" and food_described:
                # A second food line closes the current meal. The new line
                # belongs to the next meal, so it seeds both the new kwargs
                # and the new name list; food_described stays True because
                # the next meal already has its food value.
                kwargs["location"] = create_name(names)
                transactions.append(FoodTransaction(**kwargs))
                kwargs = {"date": cost_date, category: cost.value}
                names = [cost.description]
                continue
            names.append(cost.description)
            if category == "food":
                food_described = True
            if kwargs.get(category):
                raise ValueError(f"Cat. {category} already set for {cost}")
            kwargs[category] = cost.value
        # Emit whatever meal is still open at the end of the run.
        if names:
            kwargs["location"] = create_name(names)
            transactions.append(FoodTransaction(**kwargs))
    return transactions
### File Manipulators ##################################################
def output_csv(dataklass, data, filename):
    """Write dataclass instances to a CSV file with a readable header row.

    Args:
        dataklass: the dataclass type; its fields define the column order.
        data: iterable of ``dataklass`` instances, one per CSV row.
        filename: path of the file to create or overwrite.
    """
    fieldnames = [spec.name for spec in fields(dataklass)]
    # newline="" is what the csv module requires for files it writes to;
    # without it, platforms that translate "\n" emit a blank line per row.
    with open(filename, mode="w", newline="") as csvfile:
        writer = DictWriter(csvfile, fieldnames=fieldnames)
        # The header is written as an ordinary row (rather than with
        # writeheader()) so the columns carry prettified labels instead of
        # the raw attribute names.
        writer.writerow(
            {name: prettify_fieldname(name) for name in fieldnames}
        )
        writer.writerows(asdict(row) for row in data)
def main():
    """Read the partial IHG bill HTML and write hotel.csv and food.csv.

    Both transaction lists are built before any file is written, so a
    failure while grouping leaves no output behind.
    """
    with open("IHG_CA_Pearson_Partial_Bill.html") as bill_file:
        raw_html = bill_file.read()
    costs = parse_ihg_bill_to_costs(raw_html)
    hotel_rows = group_hotel_costs(costs)
    food_rows = group_food_costs(costs)
    output_csv(HotelTransaction, hotel_rows, "hotel.csv")
    output_csv(FoodTransaction, food_rows, "food.csv")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment