# K4CZP3R/ns_factuur_extract.py

## ns_factuur_extract.py
import pytesseract  # type: ignore
from PIL import Image  # type: ignore
from time import sleep
import re
import fitz  # type: ignore
import json
import glob

# Extract travel rows from invoice PDFs in the current directory and append
# them to entries.json.  For every PDF the chosen page is rendered to an
# image, cut into horizontal strips, OCR'd strip by strip, and every strip
# that parses as a travel row is stored.

# Render scale passed to fitz.Matrix; all pixel offsets below are multiplied
# by it.
ZOOM = 3

# Zero-based index of the page that is rendered (i.e. the second page).
# NOTE(review): presumably the page holding the itemised trips - confirm.
PAGE_INDEX = 1

# Strip geometry in pixels at zoom 1: everything above HEADER_HEIGHT becomes
# the first strip, after which strips are ROW_HEIGHT tall.
HEADER_HEIGHT = 191
ROW_HEIGHT = 12

# Scratch image the rendered page is written to before it is re-opened.
SCREENSHOT_PATH = "ss.png"

# JSON file (a list of row objects) that new rows are appended to.
ENTRIES_PATH = "entries.json"

# Station names recognised in a row.  "Oss West" has to stay ahead of "Oss"
# so the alternation prefers the longer name at the same position.
AVAILABLE_STATIONS = [
    "Oss West",
    "'s-Hertogenbosch",
    "Tilburg",
    "Eindhoven Strijp-S",
    "Oss",
    "Eindhoven Centraal",
]

# A row must start with a DD-MM-YYYY date.
DATE_PATTERN = re.compile(r"\d{2}-\d{2}-\d{4}")

# The lookahead wrapper makes findall() report every station occurrence in
# reading order; names are escaped so they are always matched literally.
STATION_PATTERN = re.compile(
    "(?=(" + "|".join(re.escape(name) for name in AVAILABLE_STATIONS) + "))"
)

# Amount directly after the euro sign, with "," or "." as decimal separator.
PRICE_PATTERN = re.compile(r"€\s*(-?\d+(?:[.,]\d+)?)")


def row_bounds(height, zoom):
    """Return the (top, bottom) pixel bounds of each strip of the page image.

    The first strip runs from 0 to ``HEADER_HEIGHT * zoom``; every following
    strip is ``ROW_HEIGHT * zoom`` pixels tall.  A trailing remainder that is
    shorter than a full strip is not returned.

    Args:
        height: Height of the rendered page image in pixels.
        zoom: Integer render scale that was applied to the page.

    Returns:
        A list of ``(top, bottom)`` integer tuples, empty when the image is
        not taller than the header region.
    """
    bottoms = range(HEADER_HEIGHT * zoom, height, ROW_HEIGHT * zoom)
    tops = [0, *bottoms]
    return list(zip(tops, bottoms))


def parse_row(text):
    """Parse the OCR text of one strip into a row dict.

    Args:
        text: Raw OCR output for a single strip.

    Returns:
        A dict with the keys ``date``, ``transporter``, ``discount``,
        ``from``, ``to`` and ``price``, or ``None`` when the text is not a
        travel row (it does not start with a date, mentions "Fiets", or is
        shorter than 20 characters) or when a required field is missing.
    """
    text = text.strip()

    date_match = DATE_PATTERN.match(text)
    if date_match is None or "Fiets" in text or len(text) < 20:
        return None
    print("Matched", text)

    date = date_match.group(0)
    print("Date", date)

    # The second and third space-separated fields are the transporter and
    # the discount (e.g. "20%" or "40%").
    fields = text.split(" ")
    if len(fields) < 3:
        print("Skipped row without transporter/discount columns:", text)
        return None
    transporter, discount = fields[1], fields[2]
    print("Transporter", transporter)
    print("Discount", discount)

    stations = STATION_PATTERN.findall(text)
    print("Stations", stations)
    if len(stations) < 2:
        # OCR misread a name or the station is not in AVAILABLE_STATIONS.
        print("Skipped row with fewer than two known stations:", text)
        return None

    price_match = PRICE_PATTERN.search(text)
    if price_match is None:
        print("Skipped row without a price:", text)
        return None
    price = float(price_match.group(1).replace(",", "."))
    print("Price", price)

    row = {
        "date": date,
        "transporter": transporter,
        "discount": discount,
        "from": stations[0],
        "to": stations[1],
        "price": price,
    }
    print(row)
    return row


def load_entries(path):
    """Return the list stored in the JSON file at *path*.

    A missing or blank file is treated as an empty list so the first run
    does not need a pre-created file.

    Raises:
        json.JSONDecodeError: If the file holds text that is not valid JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            raw = handle.read()
    except FileNotFoundError:
        return []
    if not raw.strip():
        return []
    return json.loads(raw)


def extract_entries(pdf_path):
    """Render, slice and OCR one PDF and return the rows found in it.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list of row dicts as produced by ``parse_row``; empty when the PDF
        has no page at ``PAGE_INDEX``.
    """
    pdf_file = fitz.open(pdf_path)
    try:
        if len(pdf_file) <= PAGE_INDEX:
            print("Skipped", pdf_path, "- it has no page", PAGE_INDEX + 1)
            return []
        page = pdf_file.load_page(PAGE_INDEX)
        pix = page.get_pixmap(matrix=fitz.Matrix(ZOOM, ZOOM))
        pix.save(SCREENSHOT_PATH)
    finally:
        # Close the document even when rendering fails.
        pdf_file.close()

    rows = []
    # OCR happens inside the with-block so the page image is still open
    # while its strips are being cropped and read.
    with Image.open(SCREENSHOT_PATH) as page_image:
        width, height = page_image.size
        bounds = row_bounds(height, ZOOM)
        print("Converted to images", len(bounds))

        for top, bottom in bounds:
            strip = page_image.crop((0, top, width, bottom))
            row = parse_row(pytesseract.image_to_string(strip, lang="eng"))
            if row is not None:
                rows.append(row)
    return rows


for p in glob.glob("*.pdf"):
    new_entries = extract_entries(p)

    # Persist after every PDF so finished files survive a later failure.
    # Serialise before opening for writing: a failure then cannot leave a
    # truncated entries.json behind.
    payload = json.dumps(load_entries(ENTRIES_PATH) + new_entries)
    with open(ENTRIES_PATH, "w", encoding="utf-8") as f:
        f.write(payload)