Last active
October 17, 2022 01:54
-
-
Save sjmf/e1c547f0cb2cefcb814f5696b38abbe1 to your computer and use it in GitHub Desktop.
Convert a PDF receipt from Sainsbury's website to CSV by parsing the PDF into columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pdfplumber>=0.5.28 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Sainsburys PDF receipt to CSV python script | |
# Samantha Finnigan 2021 | |
# | |
# TODO: | |
# * Sum everything in column 3 of the data and check if the total matches the PDF-reported subtotal via assert() | |
# * Support dragging files onto the script: if arguments exist and they are files, process those instead of globbing | |
# * Investigate/Support 'Including Savings' per 536078218 | |
# | |
import sys | |
import re | |
import csv | |
from glob import glob | |
import pdfplumber | |
from pprint import pprint | |
# Read the PDF into plaintext | |
def read_in(file): | |
text = '' | |
with pdfplumber.open(file) as pdf: | |
for page in pdf.pages: | |
text += '\n' + page.extract_text() | |
return text | |
# Find subtotal | |
def get_subtotal(text): | |
r = re.compile(r'(Subtotal \(\d+ .+\)) £(\d{1,2}.\d{1,2})') | |
return r.search(text).group(2) | |
# Run main processing of plaintext using regexes | |
def process_plaintext(text): | |
summary = text.split('\n') | |
# First stage removes header (everything before 'Delivery summary' line) | |
# (this can include things that match the second regex!) | |
r = re.compile(r'^(Delivery summary \((\d+) items\))$') | |
while r.match(summary[0]) is None: | |
summary.pop(0) | |
# Get count of number of items from summary | |
count_items = int(r.match(summary.pop(0)).group(2)) | |
# Next, trash everything after "Thanks for shopping with us" | |
idx = summary.index("Thanks for shopping with us") | |
assert(idx > 0) | |
summary = summary[:idx] | |
# Separate everything after 'Order Summary' into a separate list | |
r2 = re.compile(r'^Order summary') | |
idx = 0 | |
while r2.match(summary[idx]) is None: | |
# print(idx, '\n', summary[idx]) | |
idx += 1 | |
meta = summary[idx:] | |
summary = summary[:idx] | |
# Final stage filters the list by regex: | |
r3 = re.compile(r'([\d\.]+kg|\d) ([^£]+) (£\d{1,2}.\d{1,2})') | |
data = r3.findall('\n'.join(summary)) | |
# Remove \n from the data (join up lines where product name is split) | |
data = [(i[0], i[1].replace('\n', ' '), i[2]) for i in data] | |
return meta, data, count_items | |
# Append delivery cost and coupon discount total | |
def append_delivery_coupons(meta, data): | |
# Get delivery cost and add it to the data | |
r = re.compile(r'Delivery cost (£\d{1,2}.\d{1,2})') | |
delivery = r.findall('\n'.join(meta))[0] | |
print('Delivery cost: {}'.format(delivery)) | |
data.append(('', 'Delivery', delivery)) | |
# Get coupon discounts and add those to the data | |
try: | |
r = re.compile(r'Coupons (\-£\d{1,2}.\d{1,2})') | |
coupons = r.findall('\n'.join(meta))[0] | |
print('Coupon savings: {}'.format(coupons)) | |
data.append(('', 'Coupons', coupons)) | |
except IndexError as e: | |
print('No coupons found') | |
return data | |
# Write out data as csv | |
def write_out(file, data): | |
with open(file.split('.')[0] + '.csv','w', newline='', encoding='UTF-8') as out: | |
csv_out = csv.writer(out) | |
csv_out.writerow(['Qty','Name','Cost']) | |
for row in data: | |
try: | |
csv_out.writerow(row) | |
except UnicodeEncodeError as e: | |
print(e, row) | |
# Main | |
if __name__ == "__main__": | |
# If args not empty list | |
if sys.argv[1:]: | |
pdfs = sys.argv[1:] | |
else: | |
pdfs = glob("*.pdf") | |
# Iterate over pdfs | |
for file in pdfs: | |
text = read_in(file) | |
meta, data, count_items = process_plaintext(text) | |
# Sainsity-check: does the number of items we found match the total reported by the PDF? | |
sum_items=sum([int(d[0]) if d[0].isnumeric() else 1 for d in data]) | |
print('\nfile = {}\ntotal_items = {}\nsum_items = {}\n'.format(file, count_items, sum_items)) | |
# Check count of items is as expected (if not, congrats, you found another edge case) | |
assert(count_items == sum_items) | |
data = append_delivery_coupons(meta, data) | |
pprint(data) | |
write_out(file, data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment