Last active
March 6, 2024 03:27
-
-
Save devstar0209/9dd4f52099059769592fefcf3e7583b7 to your computer and use it in GitHub Desktop.
Python script that adds a new line when a semicolon is found in PDF text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install PyPDF2
## version 2
import os

import PyPDF2
def add_newline_on_semicolon(pdf_path):
    """Extract a PDF's text and write it out with a line break after each ';'.

    The text of every page is extracted with PyPDF2, each ``;`` is replaced
    by ``;`` followed by a newline, and a newline is appended after each
    page's text. The result is written as UTF-8 text to a file named
    ``modified_<file name>`` placed in the same directory as ``pdf_path``.

    NOTE: the output file contains plain text, not a PDF document, even
    though it keeps the input's file name and extension.

    Args:
        pdf_path: Path to the PDF file to read.
    """
    # Keep the extraction inside the ``with`` so the file is still open
    # while the pages are being read.
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages / extract_text() are the names that replaced
        # PdfFileReader / numPages / getPage / extractText in PyPDF2.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Splitting a page on '\n', patching each line and re-appending '\n'
        # is the same as one replace over the whole page plus a final '\n'.
        page_chunks = [
            page.extract_text().replace(';', ';\n') + '\n'
            for page in pdf_reader.pages
        ]

    # Prefix only the file name, so a path such as 'docs/a.pdf' becomes
    # 'docs/modified_a.pdf' instead of the non-existent 'modified_docs/a.pdf'.
    directory, file_name = os.path.split(pdf_path)
    output_path = os.path.join(directory, 'modified_' + file_name)

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(''.join(page_chunks))
# Example usage: point pdf_path at your own PDF instead of 'example.pdf'.
pdf_path = "example.pdf"
add_newline_on_semicolon(pdf_path)
## version 3
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF concatenated into one string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, joined without any
        separator between pages. An empty string if the PDF has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Only iterate over the pages that exist; indexing a fixed page
        # (e.g. pages[1]) would raise IndexError on a single-page document.
        # Extraction happens inside the ``with`` so the file is still open.
        return "".join(page.extract_text() for page in reader.pages)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment