Skip to content

Instantly share code, notes, and snippets.

@fractaledmind
Last active July 9, 2019 04:48
Show Gist options
  • Save fractaledmind/7538f031ebf1eaf1367a to your computer and use it in GitHub Desktop.
Save fractaledmind/7538f031ebf1eaf1367a to your computer and use it in GitHub Desktop.
Provides access to the common `xpdf` CLI utilities and adds the ability to determine whether a PDF has been OCR'd. Built for an Alfred workflow.
#!/usr/bin/python
# encoding: utf-8
#
# Copyright (c) 2014 Stephen Margheim <stephen.margheim@gmail.com>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 12-08-2014
#
import os
import re
import sys
import subprocess
# SET THIS TO THE DIRECTORY THAT HOLDS YOUR `xpdf` CLI UTILITIES
# The directory is scanned recursively by `utilities()`, which keys each file
# by its name with 'pdf' removed. The rest of this script looks up the keys
# 'info', 'totext', 'tohtml' and 'fonts', i.e. it expects files named
# `pdfinfo`, `pdftotext`, `pdftohtml` and `pdffonts` somewhere below this path.
UTILS = '/Users/smargheim/Dropbox/Alfred.alfredpreferences/workflows/user.workflow.0B84B188-4850-403C-BC75-FFC12ADBFC45/bin64'
def utilities(directory=None):
    """Return a dictionary of paths to the PDF command-line utilities.

    Walks `directory` recursively and maps every file found to a short key
    formed by removing each occurrence of ``'pdf'`` from the file name, so
    ``pdftotext`` becomes ``'totext'`` and ``pdfinfo`` becomes ``'info'``.

    :param directory: folder to scan. Defaults to the module-level `UTILS`
        setting when omitted, which is what every existing caller relies on.
    :returns: dict mapping the short utility name to the full path of the
        file. The dict is empty when the directory is missing or holds no
        files. If two files reduce to the same key, the one visited last wins.
    """
    if directory is None:
        # Resolved at call time so a changed `UTILS` value is picked up.
        directory = UTILS
    utils = {}
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            utils[filename.replace('pdf', '')] = os.path.join(dirpath,
                                                              filename)
    return utils
class PDF(object):
    """Represents a Portable Document Format document.

    Thin wrapper around the external command-line tools located through
    `utilities()` (looked up under the keys ``'info'``, ``'totext'``,
    ``'tohtml'`` and ``'fonts'``). On top of exposing their output it can
    estimate whether the document carries a text layer (has been OCR'd).

    :param pdf: path to the PDF file.
    :raises RuntimeError: if `pdf` does not exist.
    """

    def __init__(self, pdf):
        self.pdf = pdf
        self.check()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _run(args):
        """Run the command `args` and return its standard output as text.

        :raises subprocess.CalledProcessError: if the command exits non-zero.
        """
        output = subprocess.check_output(args)
        if not isinstance(output, str):
            # Python 3 hands back bytes. Decode leniently so odd bytes in the
            # extracted text cannot abort processing. On Python 2 the output
            # already is `str` and is returned untouched.
            output = output.decode('utf-8', 'replace')
        return output

    @staticmethod
    def _has_letters(text):
        """Return True if `text` contains at least one alphabetic character.

        Output made up only of form feeds, whitespace or digits counts as
        "no text".
        """
        return any(char.isalpha() for char in text)

    @staticmethod
    def _parse_info(raw):
        """Parse ``Key: value`` lines into a dictionary.

        Each line is split on its first colon only, so values that contain
        colons themselves (dates, for example) stay intact. Lines without a
        colon, such as blank lines, are skipped.
        """
        parsed = {}
        for line in raw.splitlines():
            key, colon, val = line.partition(':')
            if not colon:
                continue
            parsed[key.strip()] = val.strip()
        return parsed

    @staticmethod
    def _parse_fonts(raw):
        """Parse the font table text into a list of dictionaries.

        The first line supplies the column names. Every later line that is
        not the dashed separator is split on runs of three or more
        whitespace characters into name, type, the flag group and the object
        ID group; the last two groups are then split on whitespace. Rows
        that do not yield at least four groups (blank lines and the like)
        are skipped.
        """
        rows = []
        keys = []
        for index, line in enumerate(raw.splitlines()):
            if index == 0:
                # get names of columns as `keys`
                keys = line.split()
                continue
            if '-----' in line:
                # ignore separator line
                continue
            groups = re.split(r'\s{3,}', line)
            if len(groups) < 4:
                continue
            vals = groups[:2]
            vals.extend(groups[2].split())
            vals.extend(groups[3].split())
            rows.append(dict(zip(keys, vals)))
        return rows

    # ------------------------------------------------------------------
    # Raw tool output
    # ------------------------------------------------------------------

    @property
    def info(self):
        """Extract the metadata for the given `pdf`.

        Returns the meta information in a dictionary of strings. The tool
        output being parsed looks like this::

            Title:          PUBLIC MEETING AGENDA
            Author:         Customer Support
            CreationDate:   Thu Dec 20 14:44:56 2012
            Pages:          2
            Encrypted:      no
            Page size:      612 x 792 pts (letter)
            PDF version:    1.5

        :raises subprocess.CalledProcessError: if the tool exits non-zero.
        """
        cmd = utilities()['info']
        return self._parse_info(self._run([cmd, self.pdf]))

    @property
    def text(self):
        """Extract the plain text (if available) of the given `pdf`.

        :returns: the extracted text with form feeds removed, or ``None``
            when the tool fails or the output contains no letters at all.
        """
        cmd = utilities()['totext']
        try:
            cmd_output = self._run([cmd, '-q', self.pdf, '-'])
        except subprocess.CalledProcessError:
            return None
        # Output holding nothing but form feeds / whitespace means "no text".
        if self._has_letters(cmd_output):
            return cmd_output.replace('\x0c', '')
        return None

    @property
    def html(self):
        """Convert the given `pdf` to HTML (if possible).

        :returns: path of the generated HTML file, or ``False`` when the
            conversion fails or the expected file cannot be found.
        """
        cmd = utilities()['tohtml']
        try:
            subprocess.check_output([cmd, '-q', self.pdf])
        except subprocess.CalledProcessError:
            return False
        # NOTE(review): the original never assigned a value on success and
        # raised UnboundLocalError. This assumes the tool writes
        # `<pdf path without extension>.html` next to the input, as
        # poppler-style pdftohtml does by default. Confirm for your build.
        out_path = os.path.splitext(self.pdf)[0] + '.html'
        if os.path.exists(out_path):
            return out_path
        return False

    @property
    def fonts(self):
        """Extract the font information (if available) of the given `pdf`.

        Returns a list with one dictionary per font, keyed by the column
        names of the tool output, which looks like this::

            name                    type         emb sub uni object ID
            ----------------------- ------------ --- --- --- ---------
            HJNFLI+AdvP5D8B         Type 1C      yes yes no     126  0
            HJNDDC+Advpn800d        Type 1C      yes yes no     130  0

        :raises subprocess.CalledProcessError: if the tool exits non-zero.
        """
        cmd = utilities()['fonts']
        return self._parse_fonts(self._run([cmd, self.pdf]))

    # ------------------------------------------------------------------
    # OCR detection
    # ------------------------------------------------------------------

    def is_ocrd(self):
        """Check if the given `pdf` is OCR'd.

        :returns: ``False`` when the document yields no text at all.
            Otherwise the per-page verdict of `ocrd_all()`: its boolean
            result as is, or ``True`` when more than 50 percent of the
            pages contain text.
        """
        if not self.text:
            return False
        perc = self.ocrd_all()
        if isinstance(perc, bool):
            return perc
        return perc > 50

    def ocrd_all(self):
        """Check every page for text.

        See `_pages_ocrd()` for the return value.
        """
        total_pages = int(self.info['Pages'])
        # Page numbers are 1-based for the -f/-l options of the text tool.
        return self._pages_ocrd(range(1, total_pages + 1))

    def ocrd_half(self):
        """Check every other page (1, 3, 5, ...) for text.

        See `_pages_ocrd()` for the return value.
        """
        total_pages = int(self.info['Pages'])
        return self._pages_ocrd(range(1, total_pages + 1)[::2])

    def _pages_ocrd(self, pages):
        """Check whether the given pages of `pdf` have text.

        :param pages: iterable of 1-based page numbers.
        :returns: ``True`` / ``False`` when all pages agree (all have text /
            none has), ``False`` when no pages were given or the document
            lists no fonts, otherwise the percentage (float, two decimals)
            of checked pages that contain text.
        """
        cmd = utilities()['totext']
        results = []
        for page in pages:
            # Extract the text of this single page only.
            try:
                page_text = self._run([cmd, '-q',
                                       '-f', str(page), '-l', str(page),
                                       self.pdf, '-'])
            except subprocess.CalledProcessError:
                page_text = ''
            results.append(self._has_letters(page_text))
        if not results:
            return False
        if len(set(results)) == 1:
            return results[0]
        # Mixed result: without any fonts in the file, treat it as not OCR'd.
        if not self.fonts:
            return False
        return self.percentage(results.count(True), len(results))

    def check(self):
        """Ensure the input pdf path is valid.

        :raises RuntimeError: if the path does not exist.
        """
        if not os.path.exists(self.pdf):
            raise RuntimeError('Provided input file not found: %s' % self.pdf)

    @staticmethod
    def percentage(part, whole):
        """Return `part` as a percentage of `whole`, rounded to two decimals.

        Returns ``0`` when `whole` is ``0`` instead of dividing by zero.
        """
        if whole == 0:
            return 0
        perc = 100 * float(part) / float(whole)
        return float("{0:.2f}".format(perc))
def main(argv=None):
    """Report whether the PDF named on the command line is OCR'd.

    Prints ``True`` or ``False`` to standard output.

    :param argv: argument list without the program name; defaults to
        ``sys.argv[1:]``. Only the first entry (the PDF path) is used.
    :returns: process exit status: ``0`` on success, ``2`` when no PDF path
        was supplied.
    :raises RuntimeError: if the given path does not exist (from `PDF`).
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        # A missing argument used to surface as a bare IndexError.
        sys.stderr.write('usage: %s <path-to-pdf>\n' % sys.argv[0])
        return 2
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(PDF(argv[0]).is_ocrd())
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment