Skip to content

Instantly share code, notes, and snippets.

@gotev
Last active October 16, 2023 12:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gotev/e936c65a903650f00e27d698b06d092c to your computer and use it in GitHub Desktop.
Save gotev/e936c65a903650f00e27d698b06d092c to your computer and use it in GitHub Desktop.
Extract Text from PDF containing images
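# Requirements:
#   pip install pdf2image pytesseract Pillow
# System packages (not installable via pip):
#   - Poppler, which pdf2image uses to render PDF pages
#   - the Tesseract OCR engine, which pytesseract invokes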
import os
import glob
import sys
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Two positional arguments are mandatory: the PDF to read and the folder
# that will receive the page images and the extracted text.
if not len(sys.argv) >= 3:
    print("Usage: python3 convert_pdf_to_text.py pdf_file output_folder")
    sys.exit(1)  # Non-zero status signals incorrect invocation

# Anything after the second argument is ignored.
pdf_path, folder_name = sys.argv[1:3]
def pdf_to_images(pdf_path, folder_name):
    """Render a PDF to numbered PNG files, one per image returned.

    The images produced by ``convert_from_path`` are saved as
    ``<folder_name>/1.png``, ``<folder_name>/2.png``, ... in the order
    they are returned. Progress is printed for every image.

    Args:
        pdf_path: Path of the PDF file to convert.
        folder_name: Directory that receives the PNG files. It is created,
            together with any missing parent directories, if needed.
    """
    # Convert PDF to images (all images are held in memory at once).
    images = convert_from_path(pdf_path)

    # makedirs(exist_ok=True) copes with nested output paths and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(folder_name, exist_ok=True)

    total_images = len(images)
    for current_image, image in enumerate(images, start=1):
        print(f'Processing {current_image} of {total_images}')
        image.save(os.path.join(folder_name, f'{current_image}.png'))
def images_to_text(folder_path):
    """OCR every PNG in a folder and write the text next to each image.

    For each ``*.png`` file found directly inside ``folder_path`` the text
    recognized by Tesseract is written to ``<image file name>.txt`` in the
    same folder (for example ``1.png`` -> ``1.png.txt``). Progress is
    printed for every image.

    Args:
        folder_path: Directory containing the PNG files to process.
    """
    # The script was written for a Homebrew install of Tesseract at this
    # location. Only point pytesseract at it when the binary is really
    # there; otherwise keep pytesseract's own default command so the script
    # also works where Tesseract is installed elsewhere.
    homebrew_tesseract = '/usr/local/bin/tesseract'
    if os.path.isfile(homebrew_tesseract):
        pytesseract.pytesseract.tesseract_cmd = homebrew_tesseract

    # glob returns files in arbitrary order; sort for a reproducible run.
    png_files = sorted(glob.glob(os.path.join(folder_path, "*.png")))
    total_images = len(png_files)

    for current_image, png_file in enumerate(png_files, start=1):
        print(f'Processing {current_image} of {total_images}')

        # Close the image handle as soon as OCR is done so that large
        # documents do not accumulate open files.
        with Image.open(png_file) as image:
            text = pytesseract.image_to_string(image)

        # OCR output may contain non-ASCII characters; write UTF-8
        # explicitly instead of relying on the platform default encoding.
        with open(f'{png_file}.txt', "w", encoding="utf-8") as file:
            file.write(text)
# Run the two pipeline stages in order, announcing each one before it starts:
# first render the PDF to PNG files, then OCR those PNG files.
pipeline = (
    ("Converting PDF to images", pdf_to_images, (pdf_path, folder_name)),
    ("Extracting text from images", images_to_text, (folder_name,)),
)
for banner, stage, stage_args in pipeline:
    print(banner)
    stage(*stage_args)

print(f'Text extraction completed. Check output in {folder_name}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment