Skip to content

Instantly share code, notes, and snippets.

@K4CZP3R
Created November 13, 2023 22:14
Show Gist options
  • Save K4CZP3R/b7da70264da4e418b66baaa25d39188e to your computer and use it in GitHub Desktop.
Save K4CZP3R/b7da70264da4e418b66baaa25d39188e to your computer and use it in GitHub Desktop.
This script converts a PDF "factuur" (invoice) downloaded from https://www.ns.nl/mijnns#/betaaloverzicht into a JSON array of entries, each with date, transporter, discount, from, to and price. It is hacky, but it works. Station parsing still needs to be improved.
import pytesseract # type: ignore
from PIL import Image # type: ignore
from time import sleep
import re
import fitz # type: ignore
import json
import glob
# Scale factor used when rasterising the PDF page before OCR.
ZOOM_SIZE = 3

# Positions of the horizontal cut lines, expressed in tenths of a pixel at
# zoom 1 so that the arithmetic stays in integers. The first cut is at 191 px
# (presumably the page header above the first table row -- confirm against a
# real invoice); every further cut is 12 px lower.
FIRST_CUT_TENTHS = 1910
ROW_STEP_TENTHS = 120

# Scratch image the rendered page is written to, and the JSON output file.
SCREENSHOT_PATH = "ss.png"
ENTRIES_PATH = "entries.json"

# Stations the parser is able to recognise in an OCR'd row.
AVAILABLE_STATIONS = [
    "Oss West",
    "'s-Hertogenbosch",
    "Tilburg",
    "Eindhoven Strijp-S",
    "Oss",
    "Eindhoven Centraal",
]

# A row of interest starts with a date written as DD-MM-YYYY.
DATE_PATTERN = re.compile(r"\d{2}-\d{2}-\d{4}")

# Longest names first, so "Oss West" always wins over its prefix "Oss"
# regardless of the order of AVAILABLE_STATIONS; names are escaped so that a
# station containing regex metacharacters cannot break the pattern.
STATION_PATTERN = re.compile(
    "|".join(
        re.escape(name)
        for name in sorted(AVAILABLE_STATIONS, key=len, reverse=True)
    )
)


def render_invoice_page(pdf_path, png_path=SCREENSHOT_PATH, page_index=1,
                        zoom_size=ZOOM_SIZE):
    """Render one page of *pdf_path* to a PNG file.

    Args:
        pdf_path: Path of the PDF to open.
        png_path: Where the rendered page image is saved.
        page_index: Zero-based page number; the default 1 is the second page.
        zoom_size: Scale factor applied on both axes while rendering.

    Returns:
        True when the page was rendered, False when the PDF has no such page.
    """
    pdf_file = fitz.open(pdf_path)
    try:
        if page_index >= len(pdf_file):
            return False
        page = pdf_file.load_page(page_index)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom_size, zoom_size))
        pix.save(png_path)
    finally:
        # Close the document even when rendering or saving fails.
        pdf_file.close()
    return True


def slice_rows(image, zoom_size=ZOOM_SIZE):
    """Cut *image* into full-width horizontal strips.

    The first strip runs from the top of the image to the first cut line;
    each following strip is one row step high. Whatever remains below the
    last cut line is not returned.

    Args:
        image: A PIL image of the rendered page.
        zoom_size: The scale factor the page was rendered with.

    Returns:
        A list of cropped PIL images, top to bottom.
    """
    width, height = image.size
    strips = []
    top = 0
    for bottom in range(
        FIRST_CUT_TENTHS * zoom_size, height * 10, ROW_STEP_TENTHS * zoom_size
    ):
        strips.append(image.crop((0, top / 10, width, bottom / 10)))
        top = bottom
    return strips


def is_journey_row(text):
    """Return True when OCR *text* looks like a journey line worth parsing.

    A row qualifies when it starts with a DD-MM-YYYY date, does not mention
    "Fiets", and is at least 20 characters long.
    """
    if not DATE_PATTERN.match(text):
        return False
    if "Fiets" in text:
        return False
    return len(text) >= 20


def parse_price(text):
    """Return the amount that follows the first euro sign in *text*.

    Only the first whitespace-delimited token after the sign is read, so
    trailing OCR text does not break the conversion. A decimal comma is
    accepted.

    Returns:
        The price as a float, or None when there is no euro sign or the
        token after it is not a number.
    """
    _, euro_sign, remainder = text.partition("€")
    tokens = remainder.split()
    if not euro_sign or not tokens:
        return None
    try:
        return float(tokens[0].replace(",", "."))
    except ValueError:
        return None


def parse_entry(text):
    """Turn one OCR'd row into an entry dictionary.

    The row is expected to read: date, transporter, discount (for example
    "20% korting in de spits" or "40% korting buiten de spits", of which only
    the first word is kept), two station names and a price after a euro sign.

    Args:
        text: Stripped OCR output for a single strip.

    Returns:
        A dict with the keys "date", "transporter", "discount", "from", "to"
        and "price", or None when the row is not a journey row or any of the
        fields cannot be extracted.
    """
    if not is_journey_row(text):
        return None

    words = text.split(" ")
    if len(words) < 3:
        return None

    stations = STATION_PATTERN.findall(text)
    if len(stations) < 2:
        return None

    price = parse_price(text)
    if price is None:
        return None

    return {
        # is_journey_row guarantees that the text starts with a date.
        "date": DATE_PATTERN.match(text).group(0),
        "transporter": words[1],
        "discount": words[2],
        "from": stations[0],
        "to": stations[1],
        "price": price,
    }


def load_entries(path=ENTRIES_PATH):
    """Read the list of previously stored entries from *path*.

    Returns:
        The stored list, or an empty list when the file is missing, blank,
        or holds an empty JSON value.

    Raises:
        TypeError: When the file holds a non-empty value that is not a list.
        json.JSONDecodeError: When the file is not valid JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        return []
    if not raw.strip():
        return []
    stored = json.loads(raw)
    if not stored:
        return []
    if not isinstance(stored, list):
        raise TypeError(f"{path} must contain a JSON array")
    return stored


def main():
    """OCR every PDF in the working directory and append its entries.

    For each ``*.pdf`` the second page is rendered, sliced into strips and
    passed through tesseract; every strip that parses as a journey is
    collected. The collected entries are appended to the ones already in
    the entries file, which is then rewritten. Nothing is written when no
    PDF is found.
    """
    pdf_paths = sorted(glob.glob("*.pdf"))
    new_entries = []

    for pdf_path in pdf_paths:
        if not render_invoice_page(pdf_path):
            print("Skipping", pdf_path, "because it has no second page")
            continue

        # Keep the page image open while its strips are being read.
        with Image.open(SCREENSHOT_PATH) as page_image:
            strips = slice_rows(page_image)
            print("Converted to images", len(strips))

            for strip in strips:
                text = pytesseract.image_to_string(strip, lang="eng").strip()
                if not is_journey_row(text):
                    continue
                print("Matched", text)

                entry = parse_entry(text)
                if entry is None:
                    print("Could not parse row, skipping it")
                    continue
                print(entry)
                new_entries.append(entry)

    if not pdf_paths:
        return

    entries = load_entries(ENTRIES_PATH) + new_entries
    with open(ENTRIES_PATH, "w", encoding="utf-8") as f:
        f.write(json.dumps(entries))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment