Skip to content

Instantly share code, notes, and snippets.

@devstar0209
Last active March 6, 2024 03:27
Show Gist options
  • Save devstar0209/9dd4f52099059769592fefcf3e7583b7 to your computer and use it in GitHub Desktop.
Save devstar0209/9dd4f52099059769592fefcf3e7583b7 to your computer and use it in GitHub Desktop.
Python script that adds a new line wherever a semicolon is found in PDF text
# pip install PyPDF2
## version 2
from pathlib import Path

import PyPDF2
def add_newline_on_semicolon(pdf_path):
    """Extract a PDF's text and save it with a line break after every semicolon.

    Each page's text is extracted, every ``;`` is replaced by ``;`` followed
    by a newline, and one extra newline is appended after each page. The
    result is written to a file named ``modified_<original file name>`` in
    the same directory as the input.

    NOTE: the output keeps the input's file name (including its ``.pdf``
    extension) but contains plain UTF-8 text, not a valid PDF document.

    Args:
        pdf_path: Path to the PDF file to read (``str`` or path-like).

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    source = Path(pdf_path)

    with source.open('rb') as pdf_file:
        # PdfReader / .pages / .extract_text() replace the PdfFileReader /
        # numPages / getPage / extractText names removed in PyPDF2 3.0.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the `with` block so the reader's file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]

    # Replacing on the whole page text is equivalent to splitting it into
    # lines, replacing per line and re-joining with '\n'; the trailing '\n'
    # matches the newline previously appended after each page's last line.
    modified_text = ''.join(
        text.replace(';', ';\n') + '\n' for text in page_texts
    )

    # Prefix only the file name so inputs such as 'docs/report.pdf' become
    # 'docs/modified_report.pdf' rather than 'modified_docs/report.pdf'.
    output_path = source.with_name('modified_' + source.name)
    with output_path.open('w', encoding='utf-8') as output_file:
        output_file.write(modified_text)
# Example usage: set `pdf_path` to the PDF you want to process, then run the
# semicolon line-breaking step on it.
pdf_path = 'example.pdf'
add_newline_on_semicolon(pdf_path=pdf_path)
## version 3
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Page texts are joined directly, with no separator inserted between pages.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A single string containing the extracted text of all pages; an empty
        string if the document has no pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # The former debug `print(pages[1].extract_text())` was dropped: it
        # raised IndexError on single-page documents and polluted stdout.
        # Extraction happens inside the `with` block so the file is still open.
        return ''.join(page.extract_text() for page in reader.pages)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment