# gotev/convert_pdf_to_text.py

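## convert_pdf_to_text.py
# Requirements:
#   pip install pdf2image pytesseract Pillow
# These packages are wrappers around system tools, so the Tesseract OCR
# binary (used by pytesseract) and Poppler (used by pdf2image) must also be
# installed on the machine.
#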
import os
import glob
import sys
import pytesseract
from pdf2image import convert_from_path
from PIL import Image


# Both the input PDF and the output folder must be given on the command line.
if len(sys.argv) < 3:
    print("Usage: python3 convert_pdf_to_text.py pdf_file output_folder")
    sys.exit(1)  # Exit the script with an error code

# argv[1] is the PDF to convert, argv[2] the folder receiving the output.
pdf_path, folder_name = sys.argv[1:3]


def pdf_to_images(pdf_path, folder_name):
    """Render every page of a PDF to a numbered PNG file.

    Pages are saved as ``1.png``, ``2.png``, ... inside ``folder_name``.
    The folder is created, together with any missing parent directories,
    when it does not exist yet.

    Args:
        pdf_path: Path of the PDF file to convert.
        folder_name: Directory that receives the page images.
    """
    # Convert PDF to images (one image object per page).
    images = convert_from_path(pdf_path)

    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair: it also
    # works for nested output paths and has no check-then-create race.
    os.makedirs(folder_name, exist_ok=True)

    total_images = len(images)

    for current_image, image in enumerate(images, start=1):
        print(f'Processing {current_image} of {total_images}')
        image.save(os.path.join(folder_name, f'{current_image}.png'))


def images_to_text(folder_path):
    """Run OCR on every PNG in a folder and save the text beside each image.

    For each ``<name>.png`` found in ``folder_path`` the recognized text is
    written to ``<name>.png.txt`` in the same folder, encoded as UTF-8.

    Args:
        folder_path: Directory containing the PNG images to process.
    """
    import shutil  # local import: only needed to locate the Tesseract binary

    # Use the Tesseract executable found on PATH so installs outside
    # /usr/local/bin (e.g. Homebrew on Apple Silicon, Linux packages) work.
    # The previous hard-coded location is kept as the fallback.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which('tesseract') or '/usr/local/bin/tesseract'
    )

    # glob returns matches in arbitrary order; sort for a stable run order.
    png_files = sorted(glob.glob(os.path.join(folder_path, "*.png")))
    total_images = len(png_files)

    for current_image, png_file in enumerate(png_files, start=1):
        print(f'Processing {current_image} of {total_images}')

        # Close the image handle as soon as OCR is done.
        with Image.open(png_file) as image:
            text = pytesseract.image_to_string(image)

        # Explicit UTF-8: OCR output may contain characters the platform's
        # default encoding cannot represent.
        with open(f'{png_file}.txt', "w", encoding="utf-8") as file:
            file.write(text)


# Stage 1: save each PDF page as a PNG inside the output folder.
print("Converting PDF to images")
pdf_to_images(pdf_path=pdf_path, folder_name=folder_name)

# Stage 2: OCR those PNGs, writing one text file per image.
print("Extracting text from images")
images_to_text(folder_path=folder_name)

print(f'Text extraction completed. Check output in {folder_name}')
	# Requirements:
	# pip install pdf2image pytesseract Pillow
	#
	import os
	import glob
	import sys
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image


	# Both the input PDF and the output folder must be given on the command line.
	if len(sys.argv) < 3:
		print("Usage: python3 convert_pdf_to_text.py pdf_file output_folder")
		sys.exit(1)  # Exit the script with an error code

	# argv[1] is the PDF to convert, argv[2] the folder receiving the output.
	pdf_path, folder_name = sys.argv[1:3]


	def pdf_to_images(pdf_path, folder_name):
		"""Render every page of a PDF to a numbered PNG file.

		Pages are saved as ``1.png``, ``2.png``, ... inside ``folder_name``.
		The folder is created, together with any missing parent directories,
		when it does not exist yet.

		Args:
			pdf_path: Path of the PDF file to convert.
			folder_name: Directory that receives the page images.
		"""
		# Convert PDF to images (one image object per page).
		images = convert_from_path(pdf_path)

		# makedirs(exist_ok=True) replaces the exists()+mkdir() pair: it also
		# works for nested output paths and has no check-then-create race.
		os.makedirs(folder_name, exist_ok=True)

		total_images = len(images)

		for current_image, image in enumerate(images, start=1):
			print(f'Processing {current_image} of {total_images}')
			image.save(os.path.join(folder_name, f'{current_image}.png'))


	def images_to_text(folder_path):
		"""Run OCR on every PNG in a folder and save the text beside each image.

		For each ``<name>.png`` found in ``folder_path`` the recognized text is
		written to ``<name>.png.txt`` in the same folder, encoded as UTF-8.

		Args:
			folder_path: Directory containing the PNG images to process.
		"""
		import shutil  # local import: only needed to locate the Tesseract binary

		# Use the Tesseract executable found on PATH so installs outside
		# /usr/local/bin (e.g. Homebrew on Apple Silicon, Linux packages) work.
		# The previous hard-coded location is kept as the fallback.
		pytesseract.pytesseract.tesseract_cmd = (
			shutil.which('tesseract') or '/usr/local/bin/tesseract'
		)

		# glob returns matches in arbitrary order; sort for a stable run order.
		png_files = sorted(glob.glob(os.path.join(folder_path, "*.png")))
		total_images = len(png_files)

		for current_image, png_file in enumerate(png_files, start=1):
			print(f'Processing {current_image} of {total_images}')

			# Close the image handle as soon as OCR is done.
			with Image.open(png_file) as image:
				text = pytesseract.image_to_string(image)

			# Explicit UTF-8: OCR output may contain characters the platform's
			# default encoding cannot represent.
			with open(f'{png_file}.txt', "w", encoding="utf-8") as file:
				file.write(text)


	# Stage 1: save each PDF page as a PNG inside the output folder.
	print("Converting PDF to images")
	pdf_to_images(pdf_path=pdf_path, folder_name=folder_name)

	# Stage 2: OCR those PNGs, writing one text file per image.
	print("Extracting text from images")
	images_to_text(folder_path=folder_name)

	print(f'Text extraction completed. Check output in {folder_name}')