Last active
October 11, 2021 13:16
-
-
Save neuroid/a5e7b8a54a90ae2809cbf35716c127cd to your computer and use it in GitHub Desktop.
Parser for Bulb Energy (bulb.co.uk) statement PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Parse Bulb Energy (bulb.co.uk) statement PDFs. | |
Requires the pdftotext command (part of poppler-utils). | |
""" | |
from collections import defaultdict | |
import csv | |
from datetime import datetime | |
from decimal import Decimal | |
import subprocess | |
import sys | |
def pdftotext(path):
    """Return the text of the PDF at ``path``, keeping its physical layout.

    Runs the external ``pdftotext`` command (poppler-utils) with ``-layout``
    and writes the result to stdout (the trailing ``-`` argument), which is
    captured and returned as a ``str``.

    Raises ``subprocess.CalledProcessError`` if the command exits with a
    non-zero status, and ``FileNotFoundError`` if it is not installed.
    """
    # Pin the encoding on both sides: ask pdftotext for UTF-8 and decode the
    # captured bytes as UTF-8, rather than relying on the locale's preferred
    # encoding. Otherwise the pound sign the parser looks for can be
    # mis-decoded on non-UTF-8 locales.
    command = ['pdftotext', '-layout', '-enc', 'UTF-8', path, '-']
    completed = subprocess.run(command, capture_output=True, check=True,
                               encoding='utf-8')
    return completed.stdout
def _parse_period(line, next_line):
    """Return the ``(start, end)`` dates from a 'For the period ...' line.

    line: the stripped statement line starting with 'For the period'.
    next_line: the stripped line that follows, or ``None`` at the end of the
        text. When the end date on ``line`` has no year, a leading run of
        four digits on this line is used as that year.

    Raises ``ValueError`` when the text does not consist of two
    'day month year' dates joined by ' to '.
    """
    # With ``pdftotext -layout`` neighbouring columns are separated by runs
    # of spaces, so everything after the first two-space gap is unrelated.
    column_gap = ' ' * 2
    period = line[len('For the period'):].strip().split(column_gap)[0]
    start_text, end_text = period.split(' to ')
    if len(end_text.split()) < 3:
        # The end date's year wrapped onto the following line.
        year = (next_line or '')[:4]
        if len(year) == 4 and year.isdigit():
            end_text = end_text + ' ' + year
    start, end = (datetime.strptime(part.strip(), '%d %B %Y').date()
                  for part in (start_text, end_text))
    return start, end


def parse(text):
    """Yield one usage record per fuel section found in statement ``text``.

    ``text`` is the layout-preserving text of a statement. Each record is a
    ``defaultdict(Decimal)`` that is yielded when the section's
    'Total ... costs for this bill' line is reached. Keys:

    - ``period_start`` / ``period_end``: ``date`` objects taken from the most
      recent 'For the period' line.
    - ``fuel``: ``'electricity'`` or ``'gas'``.
    - ``meter_units``: first token after 'Meter units used:' (``str``).
    - ``kwh`` / ``cost``: ``Decimal`` sums over the section's 'Energy' lines.
    - ``kwh_price``: ``Decimal`` price from the last 'Energy' line, divided
      by 100 (presumably pence to pounds).
    - ``standing_charge``: ``Decimal`` sum over 'Standing charge' lines.
    - ``total_cost``: the amount following the pound sign (``str``).

    Detail lines that appear outside a fuel section are ignored, because
    there is no record to attach them to.

    Raises ``ValueError`` if a fuel section starts before any 'For the
    period' line, or if a period or number cannot be parsed.
    """
    section_fuels = (('Electricity use in detail', 'electricity'),
                     ('Gas use in detail', 'gas'))
    meter_prefix = 'Meter units used:'
    energy_prefixes = ('Energy ', 'Energy(1) ', 'Energy*', 'Energy*(1)')
    standing_prefixes = ('Standing charge ', 'Standing charge(1) ')
    total_prefixes = ('Total electricity costs for this bill',
                      'Total gas costs for this bill')

    lines = [raw.strip() for raw in text.splitlines()]
    period = None
    usage = None
    for i, line in enumerate(lines):
        if line.startswith('For the period'):
            next_line = lines[i + 1] if i + 1 < len(lines) else None
            period = _parse_period(line, next_line)
            continue

        fuel = next((name for prefix, name in section_fuels
                     if line.startswith(prefix)), None)
        if fuel is not None:
            if period is None:
                raise ValueError(
                    'fuel section found before a "For the period" line')
            period_start, period_end = period
            usage = defaultdict(Decimal,
                                period_start=period_start,
                                period_end=period_end,
                                fuel=fuel)
            continue

        if usage is None:
            # Not inside a fuel section: nothing to record this line against.
            continue

        if line.startswith(meter_prefix):
            usage['meter_units'] = line[len(meter_prefix):].split()[0]
        elif line.startswith(energy_prefixes):
            # Lines that spell the unit price as 'p/kWh' carry one extra
            # token before the cost column.
            if 'p/kWh' in line:
                _, kwh, _, _, kwh_price, _, _, cost, *_ = line.split()
            else:
                _, kwh, _, _, kwh_price, _, cost, *_ = line.split()
            usage['kwh'] += Decimal(kwh)
            usage['kwh_price'] = Decimal(kwh_price) / 100
            usage['cost'] += Decimal(cost)
        elif line.startswith(standing_prefixes):
            # The charge is the ninth whitespace-separated token.
            usage['standing_charge'] += Decimal(
                line.split(None, 8)[-1].split()[0])
        elif line.startswith(total_prefixes):
            usage['total_cost'] = line.split('£')[1].split()[0]
            yield usage
            usage = None
if __name__ == '__main__':
    # Command-line entry point: parse every statement PDF named on the
    # command line and print the usage records as CSV on stdout, ordered by
    # billing period.
    statement_paths = sys.argv[1:]
    if not statement_paths:
        usage_message = 'Usage: {} statement.pdf [statement2.pdf]...'
        sys.exit(usage_message.format(sys.argv[0]))

    columns = [
        'period_start',
        'period_end',
        'fuel',
        'meter_units',
        'kwh_price',
        'kwh',
        'cost',
        'standing_charge',
        'total_cost',
    ]
    writer = csv.DictWriter(sys.stdout, columns)
    # The header goes out before any PDF is read.
    writer.writeheader()

    records = []
    for statement_path in statement_paths:
        records.extend(parse(pdftotext(statement_path)))
    records.sort(key=lambda record: (record['period_start'],
                                     record['period_end']))
    writer.writerows(records)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment