Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@sapher
Created July 15, 2022 16:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sapher/033830caf4cf95b172e9c1dde10484f5 to your computer and use it in GitHub Desktop.
Save sapher/033830caf4cf95b172e9c1dde10484f5 to your computer and use it in GitHub Desktop.
Parse SUPER U monmagasin U ticket
#!/usr/bin/env python3
from asyncore import read
import argparse
from PyPDF2 import PdfReader
import re
import json
# The regular expressions are the ones the original script used inline; they
# are compiled once here instead of on every parsed line.
_PRICE_TAIL_RE = re.compile(r"(\s\d+,\d+\s€.*)")
_QUANTITY_RE = re.compile(r"(\s\d+\sx)")
_PRICE_RE = re.compile(r"(\d+,\d+)\s€")


def to_float(number):
    """Convert a decimal-comma string such as ``"1,90"`` to a float.

    Raises:
        ValueError: if the text is not a number once ',' is replaced by '.'.
    """
    return float(number.replace(',', '.'))


def extract_ticket_lines(lines):
    """Return the whitespace-normalised lines of the ticket body.

    Lines are collected starting at the first one containing "ticket"
    (case-insensitive) and stopping before the first one containing "===".
    The first two collected lines (the "ticket" line and the one after it)
    are dropped, as in the original script -- presumably header lines.
    """
    in_ticket = False
    kept = []
    for line in lines:
        lowered = line.lower()
        if "ticket" in lowered:
            in_ticket = True
        if "===" in lowered:
            # Nothing after the end marker is ever kept, so stop scanning.
            break
        if in_ticket:
            # Collapse runs of whitespace to single spaces.
            kept.append(' '.join(line.split()))
    return kept[2:]


def merge_wrapped_lines(filtered_lines):
    """Join each priced line with a preceding un-priced line.

    A line containing "€" is prefixed with the previous line when that
    previous line has no "€" in it; otherwise the priced line is kept as is.
    Lines without "€" are only emitted as part of the following priced line.
    """
    merged = []
    for index, line in enumerate(filtered_lines):
        if "€" not in line:
            continue
        # The first line has no predecessor.  Indexing with -1 here would
        # silently wrap around to the *last* line of the ticket.
        prev_line = filtered_lines[index - 1] if index > 0 else None
        if prev_line is not None and "€" not in prev_line:
            merged.append(f"{prev_line} {line}")
        else:
            merged.append(line)
    return merged


def parse_product_line(line):
    """Parse one merged ticket line into a product dict.

    The result always has a ``name`` key.  ``unit_price`` and
    ``total_price`` are set when the line holds one or two "<n>,<n> €"
    amounts: a single amount is used for both, two amounts are read as unit
    price then total.  Any other count leaves the prices out and writes
    'not handled' to stderr.
    """
    product = {}

    # Strip everything from the first price onwards, then a " <n> x" marker.
    name = _PRICE_TAIL_RE.sub('', line)
    product['name'] = _QUANTITY_RE.sub('', name)

    prices = _PRICE_RE.findall(line)
    if len(prices) == 1:
        product['unit_price'] = to_float(prices[0])
        product['total_price'] = to_float(prices[0])
    elif len(prices) == 2:
        product['unit_price'] = to_float(prices[0])
        product['total_price'] = to_float(prices[1])
    else:
        # Local import: the file's import header does not provide sys.
        import sys
        # stderr keeps stdout reserved for the JSON document.
        print('not handled', file=sys.stderr)
    return product


def parse_ticket_text(text):
    """Parse the extracted text of one page into a list of product dicts."""
    lines = [line for line in text.split('\n') if line.strip()]
    merged = merge_wrapped_lines(extract_ticket_lines(lines))
    return [parse_product_line(line) for line in merged]


def main(argv=None):
    """Command-line entry point: parse a ticket PDF and print it as JSON.

    Args:
        argv: argument list without the program name; ``None`` lets
            argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Parse super u ticket to json")
    parser.add_argument('filename', help="ticket pdf filename")
    args = parser.parse_args(argv)

    reader = PdfReader(args.filename)

    # Accumulate across pages so a multi-page ticket yields one document.
    products = []
    for page in reader.pages:
        # extract_text() replaces the deprecated extractText() spelling.
        products.extend(parse_ticket_text(page.extract_text() or ""))

    # The original built the list and discarded it; emit the JSON the CLI
    # description promises.
    print(json.dumps(products, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
@sapher
Copy link
Author

sapher commented Jul 15, 2022

Usage: python3 main.py <ticket.pdf>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment