# devstar0209/PDFmodify.py

## PDFmodify.py
# pip install PyPDF2
## version 2
from pathlib import Path

import PyPDF2

def add_newline_on_semicolon(pdf_path):
    """Extract a PDF's text and save it with a line break after every semicolon.

    The text of every page is extracted in page order, a newline is inserted
    after each ``;``, and each page's text is terminated with one newline.
    The result is written as UTF-8 plain text (the output is NOT a real PDF,
    even though it keeps the input's file name and extension) to
    ``modified_<file name>`` in the same directory as the input.

    Args:
        pdf_path: Path of the PDF file to read (``str`` or ``os.PathLike``).
    """
    source_path = Path(pdf_path)

    with open(source_path, 'rb') as pdf_file:
        # PdfReader / .pages / .extract_text() replace the PdfFileReader /
        # .numPages / .getPage() / .extractText() names removed in PyPDF2 3.0.
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Splitting on '\n', patching each line and re-appending '\n' is the
        # same as one replace over the whole page plus a trailing newline.
        modified_text = "".join(
            page.extract_text().replace(';', ';\n') + '\n'
            for page in pdf_reader.pages
        )

    # Prefix only the file name; prefixing the whole path string would
    # corrupt the directory part (e.g. 'docs/a.pdf' -> 'modified_docs/a.pdf').
    output_path = source_path.with_name('modified_' + source_path.name)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(modified_text)

# Example usage: runs only when this file is executed as a script, so merely
# importing the module does not try to read or write any files.
if __name__ == "__main__":
    pdf_path = 'example.pdf'  # Replace 'example.pdf' with the path to your PDF file
    add_newline_on_semicolon(pdf_path)


## version 3
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the extracted text of all pages. No separator
        is inserted between pages; the string is empty if there are no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # A leftover debug ``print(pages[1].extract_text())`` was removed here:
        # it raised IndexError for PDFs with fewer than two pages and wrote
        # the second page to stdout on every call.
        return "".join(page.extract_text() for page in reader.pages)
	# pip install PyPDF2
	## version 2
	import PyPDF2

	def add_newline_on_semicolon(pdf_path):
	    """Extract a PDF's text and save it with a line break after every semicolon.

	    The text of every page is extracted in page order, a newline is inserted
	    after each ``;``, and each page's text is terminated with one newline.
	    The result is written as UTF-8 plain text (the output is NOT a real PDF,
	    even though it keeps the input's file name and extension) to
	    ``modified_<file name>`` in the same directory as the input.

	    Args:
	        pdf_path: Path of the PDF file to read (``str`` or ``os.PathLike``).
	    """
	    source_path = Path(pdf_path)

	    with open(source_path, 'rb') as pdf_file:
	        # PdfReader / .pages / .extract_text() replace the PdfFileReader /
	        # .numPages / .getPage() / .extractText() names removed in PyPDF2 3.0.
	        pdf_reader = PyPDF2.PdfReader(pdf_file)

	        # Splitting on '\n', patching each line and re-appending '\n' is the
	        # same as one replace over the whole page plus a trailing newline.
	        modified_text = "".join(
	            page.extract_text().replace(';', ';\n') + '\n'
	            for page in pdf_reader.pages
	        )

	    # Prefix only the file name; prefixing the whole path string would
	    # corrupt the directory part (e.g. 'docs/a.pdf' -> 'modified_docs/a.pdf').
	    output_path = source_path.with_name('modified_' + source_path.name)
	    with open(output_path, 'w', encoding='utf-8') as output_file:
	        output_file.write(modified_text)

	# Example usage: runs only when this file is executed as a script, so merely
	# importing the module does not try to read or write any files.
	if __name__ == "__main__":
	    pdf_path = 'example.pdf'  # Replace 'example.pdf' with the path to your PDF file
	    add_newline_on_semicolon(pdf_path)


	## version 3
	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A single string with the extracted text of all pages. No separator
	        is inserted between pages; the string is empty if there are no pages.
	    """
	    with open(pdf_path, 'rb') as file:
	        reader = PyPDF2.PdfReader(file)
	        # A leftover debug ``print(pages[1].extract_text())`` was removed here:
	        # it raised IndexError for PDFs with fewer than two pages and wrote
	        # the second page to stdout on every call.
	        return "".join(page.extract_text() for page in reader.pages)