sjmf/requirements.txt

## requirements.txt
pdfplumber>=0.5.28

## sains2csv.py
# -*- coding: utf-8 -*-
#
# Sainsburys PDF receipt to CSV python script
# Samantha Finnigan 2021
#
# TODO:
#  * Sum everything in column 3 of the data and check if the total matches the PDF-reported subtotal via assert()
#  * Support dragging files onto the script: if arguments exist and they are files, process those instead of globbing
#  * Investigate/Support 'Including Savings' per 536078218
#
import sys
import re
import csv
from glob import glob
import pdfplumber
from pprint import pprint


# Read the PDF into plaintext
def read_in(file):
    text = ''
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            text += '\n' + page.extract_text()
    return text


# Find subtotal
def get_subtotal(text):
    r = re.compile(r'(Subtotal \(\d+ .+\)) £(\d{1,2}.\d{1,2})')
    return r.search(text).group(2)


# Run main processing of plaintext using regexes
def process_plaintext(text):
    summary = text.split('\n')

    # First stage removes header (everything before 'Delivery summary' line)
    # (this can include things that match the second regex!)
    r = re.compile(r'^(Delivery summary \((\d+) items\))$')
    while r.match(summary[0]) is None:
        summary.pop(0)

    # Get count of number of items from summary
    count_items = int(r.match(summary.pop(0)).group(2))

    # Next, trash everything after "Thanks for shopping with us"
    idx = summary.index("Thanks for shopping with us")
    assert(idx > 0)
    summary = summary[:idx]

    # Separate everything after 'Order Summary' into a separate list
    r2 = re.compile(r'^Order summary')
    idx = 0
    while r2.match(summary[idx]) is None:
        # print(idx, '\n', summary[idx])
        idx += 1
    meta = summary[idx:]
    summary = summary[:idx]

    # Final stage filters the list by regex:
    r3 = re.compile(r'([\d\.]+kg|\d) ([^£]+) (£\d{1,2}.\d{1,2})')
    data = r3.findall('\n'.join(summary))

    # Remove \n from the data (join up lines where product name is split)
    data = [(i[0], i[1].replace('\n', ' '), i[2]) for i in data]

    return meta, data, count_items


# Append delivery cost and coupon discount total
def append_delivery_coupons(meta, data):
    # Get delivery cost and add it to the data
    r = re.compile(r'Delivery cost (£\d{1,2}.\d{1,2})')
    delivery = r.findall('\n'.join(meta))[0]
    print('Delivery cost: {}'.format(delivery))
    data.append(('', 'Delivery', delivery))

    # Get coupon discounts and add those to the data
    try:
        r = re.compile(r'Coupons (\-£\d{1,2}.\d{1,2})')
        coupons = r.findall('\n'.join(meta))[0]
        print('Coupon savings: {}'.format(coupons))
        data.append(('', 'Coupons', coupons))
    except IndexError as e:
        print('No coupons found')

    return data


# Write out data as csv
def write_out(file, data):
    with open(file.split('.')[0] + '.csv','w', newline='', encoding='UTF-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Qty','Name','Cost'])
        for row in data:
            try:
                csv_out.writerow(row)
            except UnicodeEncodeError as e:
                print(e, row)


# Main
if __name__ == "__main__":

    # If args not empty list
    if sys.argv[1:]:
        pdfs = sys.argv[1:]
    else:
        pdfs = glob("*.pdf")

    # Iterate over pdfs
    for file in pdfs:
        text = read_in(file)
        meta, data, count_items = process_plaintext(text)

        # Sainsity-check: does the number of items we found match the total reported by the PDF?
        sum_items=sum([int(d[0]) if d[0].isnumeric() else 1 for d in data])
        print('\nfile = {}\ntotal_items = {}\nsum_items = {}\n'.format(file, count_items, sum_items))

        # Check count of items is as expected (if not, congrats, you found another edge case)
        assert(count_items == sum_items)

        data = append_delivery_coupons(meta, data)

        pprint(data)
        write_out(file, data)
	# -- coding: utf-8 --
	#
	# Sainsburys PDF receipt to CSV python script
	# Samantha Finnigan 2021
	#
	# TODO:
	# * Sum everything in column 3 of the data and check if the total matches the PDF-reported subtotal via assert()
	# * Support dragging files onto the script: if arguments exist and they are files, process those instead of globbing
	# * Investigate/Support 'Including Savings' per 536078218
	#
	import sys
	import re
	import csv
	from glob import glob
	import pdfplumber
	from pprint import pprint


	# Read the PDF into plaintext
	def read_in(file):
	text = ''
	with pdfplumber.open(file) as pdf:
	for page in pdf.pages:
	text += '\n' + page.extract_text()
	return text


	# Find subtotal
	def get_subtotal(text):
	r = re.compile(r'(Subtotal \(\d+ .+\)) £(\d{1,2}.\d{1,2})')
	return r.search(text).group(2)


	# Run main processing of plaintext using regexes
	def process_plaintext(text):
	summary = text.split('\n')

	# First stage removes header (everything before 'Delivery summary' line)
	# (this can include things that match the second regex!)
	r = re.compile(r'^(Delivery summary \((\d+) items\))$')
	while r.match(summary[0]) is None:
	summary.pop(0)

	# Get count of number of items from summary
	count_items = int(r.match(summary.pop(0)).group(2))

	# Next, trash everything after "Thanks for shopping with us"
	idx = summary.index("Thanks for shopping with us")
	assert(idx > 0)
	summary = summary[:idx]

	# Separate everything after 'Order Summary' into a separate list
	r2 = re.compile(r'^Order summary')
	idx = 0
	while r2.match(summary[idx]) is None:
	# print(idx, '\n', summary[idx])
	idx += 1
	meta = summary[idx:]
	summary = summary[:idx]

	# Final stage filters the list by regex:
	r3 = re.compile(r'([\d\.]+kg\|\d) ([^£]+) (£\d{1,2}.\d{1,2})')
	data = r3.findall('\n'.join(summary))

	# Remove \n from the data (join up lines where product name is split)
	data = [(i[0], i[1].replace('\n', ' '), i[2]) for i in data]

	return meta, data, count_items


	# Append delivery cost and coupon discount total
	def append_delivery_coupons(meta, data):
	# Get delivery cost and add it to the data
	r = re.compile(r'Delivery cost (£\d{1,2}.\d{1,2})')
	delivery = r.findall('\n'.join(meta))[0]
	print('Delivery cost: {}'.format(delivery))
	data.append(('', 'Delivery', delivery))

	# Get coupon discounts and add those to the data
	try:
	r = re.compile(r'Coupons (\-£\d{1,2}.\d{1,2})')
	coupons = r.findall('\n'.join(meta))[0]
	print('Coupon savings: {}'.format(coupons))
	data.append(('', 'Coupons', coupons))
	except IndexError as e:
	print('No coupons found')

	return data


	# Write out data as csv
	def write_out(file, data):
	with open(file.split('.')[0] + '.csv','w', newline='', encoding='UTF-8') as out:
	csv_out = csv.writer(out)
	csv_out.writerow(['Qty','Name','Cost'])
	for row in data:
	try:
	csv_out.writerow(row)
	except UnicodeEncodeError as e:
	print(e, row)


	# Main
	if __name__ == "__main__":

	# If args not empty list
	if sys.argv[1:]:
	pdfs = sys.argv[1:]
	else:
	pdfs = glob("*.pdf")

	# Iterate over pdfs
	for file in pdfs:
	text = read_in(file)
	meta, data, count_items = process_plaintext(text)

	# Sainsity-check: does the number of items we found match the total reported by the PDF?
	sum_items=sum([int(d[0]) if d[0].isnumeric() else 1 for d in data])
	print('\nfile = {}\ntotal_items = {}\nsum_items = {}\n'.format(file, count_items, sum_items))

	# Check count of items is as expected (if not, congrats, you found another edge case)
	assert(count_items == sum_items)

	data = append_delivery_coupons(meta, data)

	pprint(data)
	write_out(file, data)