Skip to content

Instantly share code, notes, and snippets.

@portableant
Last active February 13, 2021 00:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save portableant/a4f242878ca863d12d811ebd8c2dea4a to your computer and use it in GitHub Desktop.
Save portableant/a4f242878ca863d12d811ebd8c2dea4a to your computer and use it in GitHub Desktop.
Hacky script for splitting pdf to pages
#!/usr/bin/python
## Split pdf files into pages
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example for Lucinda
## Save this file on your machine, make sure you have python installed.
## You may need to have installed libraries to run this file eg
## pip install wand
## pip install PyPDF2
## Save your pdf file in the same directory as the python script
## This script takes 4 arguments as defined below
## An example to run it is:
## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
## What the above does: 1. asks python to run the script; 2. -p . sets the path to the current
## directory; 3. -f is the name of the pdf file to split; 4. -d is the directory to create for the
## output; 5. -n is the base file name used for the new page files
import argparse
import os
from PyPDF2 import PdfFileWriter, PdfFileReader
from wand.image import Image
# Command-line interface: all four options are required.
parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
# Add arguments
parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
parser.add_argument('-f', '--file', help='The file to process', required=True)
parser.add_argument('-n', '--name', help='The new file name', required=True)
parser.add_argument('-d', '--destination', help='The processed folder', required=True)
# Parse the arguments
args = parser.parse_args()
# Construct variables and print them out
path = args.path
print(path)
# The destination folder is resolved relative to the working path
destination = os.path.join(path, args.destination)
print(destination)
# Template for each output file; %s is filled with the 1-based page number
pageName = os.path.join(destination, args.name) + '_%s.pdf'
print(pageName)
fileName = os.path.join(path, args.file)
print(fileName)
# Make the directory if it does not exist
if not os.path.exists(destination):
    os.makedirs(destination)
# Open the source pdf. The stream has to stay open while pages are fetched
# from the reader, so the whole loop lives inside the with-block; the file
# is closed as soon as the last page has been written (or on error).
with open(fileName, "rb") as inputStream:
    inputpdf = PdfFileReader(inputStream)
    # Cycle through pages and create one new single-page pdf per page
    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open(pageName % (i + 1), "wb") as outputStream:
            output.write(outputStream)
#!/usr/bin/python
## Split pdf files into pages and ocr text (this is a bit hacky, but works as a demo)
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example
## python3 splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
## On mac osx: brew install poppler, then run echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc
import argparse
import os
import sys
# pip install Pillow
from PIL import Image
# pip3 install pytesseract
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'
# pip3 install PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
# pip3 install pdf2image
from pdf2image import convert_from_path
# pip install wand
#from wand.image import Image
# Command-line interface: all five options are required.
parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
parser.add_argument('-f', '--file', help='The file to process', required=True)
parser.add_argument('-n', '--name', help='The new file name', required=True)
parser.add_argument('-d', '--destination', help='The processed folder', required=True)
parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)
# Parse arguments
args = parser.parse_args()
path = args.path
# The split-page folder and the ocr folder are both resolved relative to the working path
destination = os.path.join(path, args.destination)
ocrfolder = os.path.join(path, args.ocr)
# Template for each split page; %s is filled with the 1-based page number
pageName = os.path.join(destination, args.name) + '%s.pdf'
fileName = os.path.join(path, args.file)
# Create the output folders if they do not exist yet.
# Note: the jpg folder is always 'images' in the current working directory.
if not os.path.exists(destination):
    os.makedirs(destination)
if not os.path.exists(ocrfolder):
    os.makedirs(ocrfolder)
if not os.path.exists('images'):
    os.makedirs('images')
# Step 1: split the source pdf into one pdf per page. The stream has to stay
# open while pages are fetched from the reader, so the loop lives inside the
# with-block and the file is closed once the last page is written.
with open(fileName, "rb") as inputStream:
    inputpdf = PdfFileReader(inputStream)
    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open(pageName % (i + 1), "wb") as outputStream:
            output.write(outputStream)
# Step 2: render every pdf found in the destination folder to a jpg and ocr it.
# This picks up any pdf in that folder, not only the ones written above.
for entry in os.listdir(destination):
    if not entry.endswith(".pdf"):
        continue
    filepath = os.path.join(destination, entry)
    img = convert_from_path(filepath)
    imgName = os.path.splitext(entry)[0]
    jpgName = os.path.join('./images/', imgName + '.jpg')
    # Write the text into the folder requested with -o (and created above)
    # rather than a hard-coded ./ocr/ that may not exist.
    ocrName = os.path.join(ocrfolder, imgName + '.txt')
    # The split pdfs are expected to hold a single page each; if one has more,
    # every page reuses the same jpg/txt names, so the last page wins.
    for page in img:
        page.save(jpgName, 'JPEG')
        # Close the image handle once tesseract has read it
        with Image.open(jpgName) as jpg:
            text = pytesseract.image_to_string(jpg)
        with open(ocrName, mode='w') as f:
            f.write(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment