## portableant/splitPDFandOCR.py

## splitPdf.py
#!/usr/bin/python
## Split pdf files into pages
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16


## Usage example for Lucinda
## Save this file on your machine, make sure you have python installed.
## You may need to have installed libraries to run this file eg
## pip install wand
## pip install PyPDF2
## Save your pdf file in the same directory as the python script
## This script takes 4 arguments as defined below

## An example to run it is:
## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
## What does the above do - 1. Ask python to run the script 2. -p . means the path is current
## directory 3. -f means the file name to split 4. -d means the directory to create 5. -n is the base
## file name for the new files to be named

import argparse
import os

from PyPDF2 import PdfFileWriter, PdfFileReader

from wand.image import Image

# Command-line interface: all four options are required.
parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')

# Add arguments
parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
parser.add_argument('-f', '--file', help='The file to process', required=True)
parser.add_argument('-n', '--name', help='The new file name', required=True)
parser.add_argument('-d', '--destination', help='The processed folder', required=True)

# Parse the arguments
args = parser.parse_args()

# Construct variables and print them out
path = args.path
print(path)
# The destination folder is resolved relative to the processing path
destination = os.path.join(path, args.destination)
print(destination)
# Template for the per-page files; %s is filled with the 1-based page number
pageName = os.path.join(destination, args.name) + '_%s.pdf'
print(pageName)
fileName = os.path.join(path, args.file)
print(fileName)

# Make the directory if it does not exist
if not os.path.exists(destination):
    os.makedirs(destination)

# Open the source file in a with-block so the handle is always closed,
# even if writing one of the pages fails. The page loop stays inside the
# block because the reader may still need the stream while pages are copied.
with open(fileName, "rb") as inputStream:
    inputpdf = PdfFileReader(inputStream)

    # Cycle through pages and create one new single-page pdf per page
    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open(pageName % (i + 1), "wb") as outputStream:
            output.write(outputStream)

## splitPDFandOCR.py
#!/usr/bin/python
## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example
## python3 splitPDFandOCR.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
## On macOS, install poppler first (brew install poppler) and run: echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

import argparse
import os
import sys
# pip install Pillow
from PIL import Image

# pip3 install pytesseract
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


# pip3 install PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
# pip3 install pdf2image
from pdf2image import convert_from_path
# pip install wand
#from wand.image import Image

# Command-line interface: all five options are required.
parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
parser.add_argument('-f', '--file', help='The file to process', required=True)
parser.add_argument('-n', '--name', help='The new file name', required=True)
parser.add_argument('-d', '--destination', help='The processed folder', required=True)
parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)

# Parse arguments
args = parser.parse_args()

path = args.path

# Folders for the split pages and the OCR text, both relative to the path
destination = os.path.join(path, args.destination)
ocrfolder = os.path.join(path, args.ocr)

# Template for the per-page files; %s is filled with the 1-based page number
pageName = os.path.join(destination, args.name) + '%s.pdf'

fileName = os.path.join(path, args.file)

# Make the output directories if they do not exist.
# The intermediate jpgs go to an 'images' folder in the working directory.
if not os.path.exists(destination):
    os.makedirs(destination)

if not os.path.exists(ocrfolder):
    os.makedirs(ocrfolder)

if not os.path.exists('images'):
    os.makedirs('images')

# Step 1: split the source pdf into one pdf per page.
# The with-block closes the source handle even if a page fails to write; the
# loop stays inside it because the reader may still need the stream while
# pages are copied.
with open(fileName, "rb") as inputStream:
    inputpdf = PdfFileReader(inputStream)

    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open(pageName % (i + 1), "wb") as outputStream:
            output.write(outputStream)

# Step 2: render every pdf in the destination folder to jpg and OCR it.
for file in os.listdir(destination):
    filepath = os.path.join(destination, file)
    if file.endswith(".pdf"):
        img = convert_from_path(filepath)
        imgName = os.path.splitext(file)[0]
        jpgName = os.path.join('./images/', imgName + '.jpg')
        # Write the text into the folder created from --ocr above. The
        # previous hard-coded './ocr/' only worked for "-p . -o ocr".
        ocrName = os.path.join(ocrfolder, imgName + '.txt')
        # NOTE: every page of one pdf is saved under the same jpg/txt name,
        # so a multi-page pdf keeps only its last page. The files written
        # by the split loop above hold a single page each.
        for page in img:
            page.save(jpgName, 'JPEG')
            # Close the image file once the text has been extracted
            with Image.open(jpgName) as jpg:
                text = pytesseract.image_to_string(jpg)
            with open(ocrName, mode='w') as f:
                f.write(text)
	#!/usr/bin/python
	## Split pdf files into pages
	## Daniel Pett 11/2/2021
	__author__ = 'portableant'
	## Tested on Python 2.7.16


	## Usage example for Lucinda
	## Save this file on your machine, make sure you have python installed.
	## You may need to have installed libraries to run this file eg
	## pip install wand
	## pip install PyPDF2
	## Save your pdf file in the same directory as the python script
	## This script takes 4 arguments as defined below

	## An example to run it is:
	## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
	## What does the above do - 1. Ask python to run the script 2. -p . means the path is current
	## directory 3. -f means the file name to split 4. -d means the directory to create 5. -n is the base
	## file name for the new files to be named

	import argparse
	import os

	from PyPDF2 import PdfFileWriter, PdfFileReader

	from wand.image import Image

	# Command-line interface: all four options are required.
	parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')

	# Add arguments
	parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
	parser.add_argument('-f', '--file', help='The file to process', required=True)
	parser.add_argument('-n', '--name', help='The new file name', required=True)
	parser.add_argument('-d', '--destination', help='The processed folder', required=True)

	# Parse the arguments
	args = parser.parse_args()

	# Construct variables and print them out
	path = args.path
	print(path)
	# The destination folder is resolved relative to the processing path
	destination = os.path.join(path, args.destination)
	print(destination)
	# Template for the per-page files; %s is filled with the 1-based page number
	pageName = os.path.join(destination, args.name) + '_%s.pdf'
	print(pageName)
	fileName = os.path.join(path, args.file)
	print(fileName)

	# Make the directory if it does not exist
	if not os.path.exists(destination):
		os.makedirs(destination)

	# Open the source file in a with-block so the handle is always closed,
	# even if writing one of the pages fails. The page loop stays inside the
	# block because the reader may still need the stream while pages are copied.
	with open(fileName, "rb") as inputStream:
		inputpdf = PdfFileReader(inputStream)

		# Cycle through pages and create one new single-page pdf per page
		for i in range(inputpdf.numPages):
			output = PdfFileWriter()
			output.addPage(inputpdf.getPage(i))
			with open(pageName % (i + 1), "wb") as outputStream:
				output.write(outputStream)
	#!/usr/bin/python
	## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
	## Daniel Pett 11/2/2021
	__author__ = 'portableant'
	## Tested on Python 2.7.16
	## Usage example
	## python3 splitPDFandOCR.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
	## On macOS, install poppler first (brew install poppler) and run: echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

	import argparse
	import os
	import sys
	# pip install Pillow
	from PIL import Image

	# pip3 install pytesseract
	import pytesseract
	pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


	# pip3 install PyPDF2
	from PyPDF2 import PdfFileWriter, PdfFileReader
	# pip3 install pdf2image
	from pdf2image import convert_from_path
	# pip install wand
	#from wand.image import Image

	# Command-line interface: all five options are required.
	parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
	parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
	parser.add_argument('-f', '--file', help='The file to process', required=True)
	parser.add_argument('-n', '--name', help='The new file name', required=True)
	parser.add_argument('-d', '--destination', help='The processed folder', required=True)
	parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)

	# Parse arguments
	args = parser.parse_args()

	path = args.path

	# Folders for the split pages and the OCR text, both relative to the path
	destination = os.path.join(path, args.destination)
	ocrfolder = os.path.join(path, args.ocr)

	# Template for the per-page files; %s is filled with the 1-based page number
	pageName = os.path.join(destination, args.name) + '%s.pdf'

	fileName = os.path.join(path, args.file)

	# Make the output directories if they do not exist.
	# The intermediate jpgs go to an 'images' folder in the working directory.
	if not os.path.exists(destination):
		os.makedirs(destination)

	if not os.path.exists(ocrfolder):
		os.makedirs(ocrfolder)

	if not os.path.exists('images'):
		os.makedirs('images')

	# Step 1: split the source pdf into one pdf per page.
	# The with-block closes the source handle even if a page fails to write; the
	# loop stays inside it because the reader may still need the stream while
	# pages are copied.
	with open(fileName, "rb") as inputStream:
		inputpdf = PdfFileReader(inputStream)

		for i in range(inputpdf.numPages):
			output = PdfFileWriter()
			output.addPage(inputpdf.getPage(i))
			with open(pageName % (i + 1), "wb") as outputStream:
				output.write(outputStream)

	# Step 2: render every pdf in the destination folder to jpg and OCR it.
	for file in os.listdir(destination):
		filepath = os.path.join(destination, file)
		if file.endswith(".pdf"):
			img = convert_from_path(filepath)
			imgName = os.path.splitext(file)[0]
			jpgName = os.path.join('./images/', imgName + '.jpg')
			# Write the text into the folder created from --ocr above. The
			# previous hard-coded './ocr/' only worked for "-p . -o ocr".
			ocrName = os.path.join(ocrfolder, imgName + '.txt')
			# NOTE: every page of one pdf is saved under the same jpg/txt name,
			# so a multi-page pdf keeps only its last page. The files written
			# by the split loop above hold a single page each.
			for page in img:
				page.save(jpgName, 'JPEG')
				# Close the image file once the text has been extracted
				with Image.open(jpgName) as jpg:
					text = pytesseract.image_to_string(jpg)
				with open(ocrName, mode='w') as f:
					f.write(text)