Skip to content

Instantly share code, notes, and snippets.

Created November 13, 2023 22:14
Show Gist options
  • Save K4CZP3R/b7da70264da4e418b66baaa25d39188e to your computer and use it in GitHub Desktop.
Save K4CZP3R/b7da70264da4e418b66baaa25d39188e to your computer and use it in GitHub Desktop.
This script converts a PDF "factuur" (invoice) to a JSON array with date, transporter, discount, from, to and price. Hacky, but it works. Station parsing needs to be done better.
import pytesseract # type: ignore
from PIL import Image # type: ignore
from time import sleep
import re
import fitz # type: ignore
import json
import glob
# NOTE(review): the copy of this script under review had lost its indentation
# and several statements (opening the PDF, saving/opening the PNG, appending
# crops, the `continue`s, the station list header, appending/saving entries).
# They are restored below from the surviving fragments and comments; please
# confirm against the original gist.

# Render scale applied to the PDF page before OCR.
ZOOM_SIZE = 3
# Zero-based index of the page that is rendered (the script reads page 1).
PAGE_INDEX = 1
# Scratch PNG the rendered page is written to and read back from.
SCREENSHOT_PATH = "ss.png"
# JSON file the parsed entries are merged into.
ENTRIES_PATH = "entries.json"

# Station names that are searched for in every OCR'd row.
AVAILABLE_STATIONS = [
    "Oss West",
    "Eindhoven Strijp-S",
    "Eindhoven Centraal",
]

# A row must start with a DD-MM-YYYY date.
DATE_RE = re.compile(r"\d{2}-\d{2}-\d{4}")
# The amount that follows the euro sign, with "," or "." as decimal separator.
PRICE_RE = re.compile(r"€\s*(-?\d+(?:[.,]\d+)?)")


def find_stations(text, stations=AVAILABLE_STATIONS):
    """Return the known station names found in *text*, in order of appearance.

    Names are regex-escaped so punctuation in a station name cannot change the
    pattern, and longer names are tried first so a name that is a prefix of
    another cannot shadow it.
    """
    names = sorted(stations, key=len, reverse=True)
    if not names:
        # An empty alternation would match the empty string at every position.
        return []
    pattern = r"(?=(" + "|".join(re.escape(name) for name in names) + r"))"
    return re.findall(pattern, text)


def parse_entry(text):
    """Parse one OCR'd row into an entry dict, or return None to skip it.

    A row is skipped when it does not start with a DD-MM-YYYY date, contains
    "Fiets", is shorter than 20 characters, or when the transporter/discount
    fields, two stations, or a price after the euro sign cannot be found.

    Returns:
        A dict with the keys "date", "transporter", "discount", "from", "to"
        and "price" (a float), or None.
    """
    date_match = DATE_RE.match(text)
    # If text does not start with date (DD-MM-YYYY), ignore it
    if date_match is None:
        return None
    # NOTE(review): presumably these two checks also skipped the row (the
    # statements under them were missing) - confirm.
    if "Fiets" in text or len(text) < 20:
        return None

    print("Matched", text)

    # Get date
    date = date_match.group(0)
    print("Date", date)

    fields = text.split(" ")
    if len(fields) < 3:
        print("Skipped, too few fields", text)
        return None

    transporter = fields[1]
    print("Transporter", transporter)

    # Next is the discount, there are two variants: "20% korting in de spits"
    # and "40% korting buiten de spits"; only the first word is kept.
    discount = fields[2]
    print("Discount", discount)

    # Find stations in text using regex
    stations = find_stations(text)
    print("Stations", stations)
    if len(stations) < 2:
        # Without both stations there is no from/to pair to record.
        print("Skipped, fewer than two known stations", text)
        return None

    # After the euro sign, there is the price
    price_match = PRICE_RE.search(text)
    if price_match is None:
        print("Skipped, no price found", text)
        return None
    price = float(price_match.group(1).replace(",", "."))
    print("Price", price)

    return {
        "date": date,
        "transporter": transporter,
        "discount": discount,
        "from": stations[0],
        "to": stations[1],
        "price": price,
    }


def slice_rows(image, zoom_size=ZOOM_SIZE):
    """Cut *image* into full-width horizontal strips and return them as a list.

    Cut positions are generated as integers and divided by 10 before being
    used as crop coordinates: the first cut is at 1910 * zoom_size / 10 and
    every following one is 120 * zoom_size / 10 further down. The first strip
    therefore covers everything above the first cut.
    """
    width, height = image.size
    step_size = 120 * zoom_size
    strips = []
    last_height = 0
    for cut in range(1910 * zoom_size, height * 10, step_size):
        strips.append(image.crop((0, last_height / 10, width, cut / 10)))
        last_height = cut
    return strips


def extract_entries(pdf_path):
    """Render one page of *pdf_path*, OCR it strip by strip, return the entries.

    The page is written to SCREENSHOT_PATH as a side effect. A PDF without a
    page at PAGE_INDEX yields an empty list instead of raising.
    """
    with fitz.open(pdf_path) as pdf_file:
        if pdf_file.page_count <= PAGE_INDEX:
            print("Skipped, page not present in", pdf_path)
            return []
        mat = fitz.Matrix(ZOOM_SIZE, ZOOM_SIZE)
        page = pdf_file.load_page(PAGE_INDEX)
        pix = page.get_pixmap(matrix=mat)
        pix.save(SCREENSHOT_PATH)

    with Image.open(SCREENSHOT_PATH) as image:
        images = slice_rows(image)
    print("Converted to images", len(images))

    entries = []
    for strip in images:
        text = pytesseract.image_to_string(strip, lang="eng").strip()
        entry = parse_entry(text)
        if entry is not None:
            entries.append(entry)
    return entries


def load_entries(path=ENTRIES_PATH):
    """Return the list stored in *path*; a missing or empty file gives [].

    Raises:
        ValueError: if the file holds non-empty JSON that is not a list.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        return []
    if not raw.strip():
        return []
    stored = json.loads(raw)
    # check if array is empty
    if not stored:
        return []
    if not isinstance(stored, list):
        raise ValueError(path + " must contain a JSON array")
    return stored


def save_entries(entries, path=ENTRIES_PATH):
    """Write *entries* to *path* as JSON."""
    # NOTE(review): the original write statement was missing; the indent is a
    # guess - confirm the desired output format.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=4)


def main():
    """Process every PDF in the working directory and merge into ENTRIES_PATH."""
    for p in glob.glob("*.pdf"):
        entries = extract_entries(p)
        # Merge after every PDF so earlier results survive a later failure.
        save_entries(load_entries() + entries)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment