Skip to content

Instantly share code, notes, and snippets.

@djsegal
Last active April 8, 2024 17:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save djsegal/2c6d7aaf432303a76e4829071b3c1261 to your computer and use it in GitHub Desktop.
Save djsegal/2c6d7aaf432303a76e4829071b3c1261 to your computer and use it in GitHub Desktop.
Convert a PDF rendering of a CSV file to JSON, using hardcoded keys ("lr" and "location") for each entry
import fitz # PyMuPDF
import json
def _parse_line(line):
    """Turn one ``lr,location`` text line into a JSON-ready dict.

    The line is split on its first comma only, so any further commas stay
    part of the location. Both fields are stripped of surrounding
    whitespace; the location additionally loses any leading/trailing
    double quotes (and whitespace that was inside them).

    Raises:
        ValueError: If the line contains no comma at all.
    """
    lr, comma, location = line.partition(',')
    if not comma:
        # A real exception instead of `assert`: asserts vanish under `python -O`.
        raise ValueError(f"expected 'lr,location' but found no comma in line: {line!r}")
    return {"lr": lr.strip(), "location": location.strip().strip('"').strip()}


def pdf_to_json(pdf_path, json_path):
    """Extract ``lr,location`` lines from a PDF and write them as a JSON array.

    Every non-blank text line of every page becomes one
    ``{"lr": ..., "location": ...}`` object. The output file holds a JSON
    array with one object per line, encoded as UTF-8 with non-ASCII
    characters written verbatim.

    Args:
        pdf_path: Path of the PDF file to read.
        json_path: Path of the JSON file to create (overwritten if present).

    Raises:
        ValueError: If a non-blank line has no comma separating the fields.
    """
    data_list = []
    # The context manager closes the document even if parsing fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Remove the line break between "а́" and a following hyphen so the
            # entry stays on one line (presumably text wrapped by the PDF layout).
            text = page.get_text().replace("а́\n-", "а́-")
            for line in text.split('\n'):
                # Skip blank and whitespace-only lines rather than failing on them.
                if not line.strip():
                    continue
                try:
                    data_list.append(_parse_line(line))
                except ValueError as err:
                    raise ValueError(f"page {page_num + 1}: {err}") from err

    # One compact object per row. The default json.dumps separators already
    # produce '", "' between fields, so no post-hoc string replacement (which
    # could alter a value that is exactly ",") is needed.
    rows = [f' {json.dumps(item, ensure_ascii=False)}' for item in data_list]

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json_file.write('[\n')
        if rows:
            json_file.write(',\n'.join(rows) + '\n')
        json_file.write(']')

    print(f"PDF content has been successfully converted to JSON and saved to: \n{json_path}")
# Example usage: runs only when the file is executed as a script, not on import.
if __name__ == "__main__":
    import os

    pdf_path = 'path_to_your_pdf.pdf'  # Update this with the actual path to your PDF file
    # Swap only the final extension. str.replace(".pdf", ".json") would rewrite
    # every ".pdf" in the path, and for a differently-cased extension (".PDF")
    # it would return the input path unchanged, so the JSON would overwrite the PDF.
    json_path = os.path.splitext(pdf_path)[0] + ".json"
    pdf_to_json(pdf_path, json_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment