Skip to content

Instantly share code, notes, and snippets.

@sjmf
Last active October 17, 2022 01:54
Show Gist options
  • Save sjmf/e1c547f0cb2cefcb814f5696b38abbe1 to your computer and use it in GitHub Desktop.
Save sjmf/e1c547f0cb2cefcb814f5696b38abbe1 to your computer and use it in GitHub Desktop.
Convert a PDF receipt from Sainsbury's website to CSV by parsing the PDF into columns
pdfplumber>=0.5.28
# -*- coding: utf-8 -*-
#
# Sainsburys PDF receipt to CSV python script
# Samantha Finnigan 2021
#
# TODO:
# * Sum everything in column 3 of the data and check if the total matches the PDF-reported subtotal via assert()
# * Support dragging files onto the script: if arguments exist and they are files, process those instead of globbing
# * Investigate/Support 'Including Savings' per 536078218
#
import sys
import re
import csv
from glob import glob
import pdfplumber
from pprint import pprint
# Read the PDF into plaintext
def read_in(file):
text = ''
with pdfplumber.open(file) as pdf:
for page in pdf.pages:
text += '\n' + page.extract_text()
return text
# Find subtotal
def get_subtotal(text):
r = re.compile(r'(Subtotal \(\d+ .+\)) £(\d{1,2}.\d{1,2})')
return r.search(text).group(2)
# Run main processing of plaintext using regexes
def process_plaintext(text):
summary = text.split('\n')
# First stage removes header (everything before 'Delivery summary' line)
# (this can include things that match the second regex!)
r = re.compile(r'^(Delivery summary \((\d+) items\))$')
while r.match(summary[0]) is None:
summary.pop(0)
# Get count of number of items from summary
count_items = int(r.match(summary.pop(0)).group(2))
# Next, trash everything after "Thanks for shopping with us"
idx = summary.index("Thanks for shopping with us")
assert(idx > 0)
summary = summary[:idx]
# Separate everything after 'Order Summary' into a separate list
r2 = re.compile(r'^Order summary')
idx = 0
while r2.match(summary[idx]) is None:
# print(idx, '\n', summary[idx])
idx += 1
meta = summary[idx:]
summary = summary[:idx]
# Final stage filters the list by regex:
r3 = re.compile(r'([\d\.]+kg|\d) ([^£]+) (£\d{1,2}.\d{1,2})')
data = r3.findall('\n'.join(summary))
# Remove \n from the data (join up lines where product name is split)
data = [(i[0], i[1].replace('\n', ' '), i[2]) for i in data]
return meta, data, count_items
# Append delivery cost and coupon discount total
def append_delivery_coupons(meta, data):
# Get delivery cost and add it to the data
r = re.compile(r'Delivery cost (£\d{1,2}.\d{1,2})')
delivery = r.findall('\n'.join(meta))[0]
print('Delivery cost: {}'.format(delivery))
data.append(('', 'Delivery', delivery))
# Get coupon discounts and add those to the data
try:
r = re.compile(r'Coupons (\-£\d{1,2}.\d{1,2})')
coupons = r.findall('\n'.join(meta))[0]
print('Coupon savings: {}'.format(coupons))
data.append(('', 'Coupons', coupons))
except IndexError as e:
print('No coupons found')
return data
# Write out data as csv
def write_out(file, data):
with open(file.split('.')[0] + '.csv','w', newline='', encoding='UTF-8') as out:
csv_out = csv.writer(out)
csv_out.writerow(['Qty','Name','Cost'])
for row in data:
try:
csv_out.writerow(row)
except UnicodeEncodeError as e:
print(e, row)
# Main
if __name__ == "__main__":
# If args not empty list
if sys.argv[1:]:
pdfs = sys.argv[1:]
else:
pdfs = glob("*.pdf")
# Iterate over pdfs
for file in pdfs:
text = read_in(file)
meta, data, count_items = process_plaintext(text)
# Sainsity-check: does the number of items we found match the total reported by the PDF?
sum_items=sum([int(d[0]) if d[0].isnumeric() else 1 for d in data])
print('\nfile = {}\ntotal_items = {}\nsum_items = {}\n'.format(file, count_items, sum_items))
# Check count of items is as expected (if not, congrats, you found another edge case)
assert(count_items == sum_items)
data = append_delivery_coupons(meta, data)
pprint(data)
write_out(file, data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment