Skip to content

Instantly share code, notes, and snippets.

@serrasqueiro
Last active June 7, 2020 12:12
Show Gist options
  • Save serrasqueiro/62bf3ce8fcbd63433b75fb25b51addda to your computer and use it in GitHub Desktop.
Save serrasqueiro/62bf3ce8fcbd63433b75fb25b51addda to your computer and use it in GitHub Desktop.
read_pdf
# -*- coding: utf-8 -*-
# (c)2020 Henrique Moreira
""" read_pdf.py, a simple PDF reader
"""
import sys
import os.path
from sys import stdout, stderr
import PyPDF2
import textract
# pylint: disable=missing-function-docstring
def main():
    """Run the reader on the command-line arguments and exit.

    Text goes to standard output and messages to standard error; the
    process exit status is 0 when read_pdf() reports success, 1 otherwise.
    """
    succeeded = read_pdf(stdout, stderr, sys.argv[1:])
    if succeeded:
        exit_code = 0
    else:
        exit_code = 1
    sys.exit(exit_code)
def read_pdf(outfile, errfile, args):
    """Dump the text of a PDF file to a stream or to a newly created file.

    Args:
        outfile: stream that receives the text when no output name is given.
        errfile: text stream that receives progress and error messages.
        args: sequence ``[filename [, outname]]``. It is not modified.
            With no arguments the built-in default path is read; when
            ``outname`` is given the text is written to that file, which
            must not exist yet.

    Returns:
        True when the text was dumped; False when the arguments were
        rejected (too many of them, or ``outname`` already exists).
    """
    assert outfile
    assert errfile
    # Work on a copy so the caller's argument list is left untouched.
    param = list(args)
    if len(param) > 2:
        # Reject surplus arguments up front, before any file is created.
        errfile.write("Unexpected extra argument(s): {}\n".format(param[2:]))
        return False
    # Historical default kept for backward compatibility.
    filename = param[0] if param else "/home/henrique/test.pdf"
    opened = None
    if len(param) > 1:
        outname = param[1]
        if os.path.exists(outname):
            errfile.write("Cowardly refusing to overwrite: {}\n"
                          "".format(outname))
            return False
        opened = open(outname, "wb")
        outfile = opened
    #do_extract = True
    try:
        text, alt_text, _ = pdf_strings(filename, errfile)
        dump_text(outfile, errfile, text)
    finally:
        # Only close what this function opened, never the caller's stream.
        if opened is not None:
            opened.close()
    if text != alt_text:
        errfile.write("Note: alt_text, {} byte(s)\n".format(len(alt_text)))
    return True
def pdf_strings(filename, progress=None, do_extract=False):
    """Extract the text of a PDF file, page by page.

    Args:
        filename: path of the PDF file to read.
        progress: optional writable text stream that receives progress
            messages; nothing is reported when it is falsy.
        do_extract: when true, always run the textract/tesseract
            extraction as well.

    Returns:
        A ``(text, alt_text, page_count)`` tuple. ``text`` is what PyPDF2
        extracted: pages separated by a blank line and terminated by one
        newline (empty when the document has no pages). ``alt_text`` is
        whatever ``textract.process`` returns when PyPDF2 found nothing
        but whitespace or ``do_extract`` is set; otherwise it is ``text``
        itself. ``page_count`` is the number of pages read.
    """
    pages = []
    # Keep the file open only while PyPDF2 is reading from it.
    with open(filename, "rb") as p_file_obj:
        pdf_read = PyPDF2.PdfFileReader(p_file_obj)
        num_pages = pdf_read.numPages
        if progress:
            progress.write("Reading {} page(s)\n".format(num_pages))
        # Page numbers are reported 1-based, but getPage() is 0-based.
        for number in range(1, num_pages + 1):
            a_str = pdf_read.getPage(number - 1).extractText()
            if progress:
                progress.write("Reading page {} (text size: {})\n".format(number, len(a_str)))
            pages.append(a_str)
    text = "\n\n".join(pages) + "\n" if pages else ""
    # PyPDF2 cannot read scanned files, so fall back to OCR when it returned
    # no words. The page separators alone must not count as extracted text.
    if not text.strip() or do_extract:
        alt_text = textract.process(filename, method='tesseract', language='eng')
    else:
        alt_text = text
    return text, alt_text, len(pages)
def dump_text(outfile, errfile, text):
    """Write ``text`` to ``outfile`` restricted to ISO-8859-1 characters.

    The bullet character U+2022 is written as ``(o)``. Any other character
    that cannot be encoded as ISO-8859-1 is written as ``[?]`` and, when
    ``errfile`` is truthy, reported there.

    Args:
        outfile: binary or text stream. Bytes are tried first; if the stream
            rejects them with TypeError, strings are written from then on.
        errfile: optional text stream for conversion warnings.
        text: the string to dump.

    Returns:
        -1 when ``outfile`` is falsy; otherwise True if the stream had to
        be written as text, False if bytes were accepted throughout.
    """
    if not outfile:
        return -1
    utf_chr = False
    for ch in text:
        if ch == chr(0x2022):
            ch = "(o)"
        try:
            data = ch.encode("iso-8859-1")
        except UnicodeEncodeError:
            msg = "[?]"
            if errfile:
                # Zero-padded so the code point reads e.g. 0x03b1, not "0x 3b1".
                errfile.write("Cannot convert 0x{:04x}, assuming: {}\n".format(ord(ch), msg))
            data = msg.encode("ascii")
        if not utf_chr:
            try:
                outfile.write(data)
            except TypeError:
                # Text-mode stream: stop offering bytes and write strings instead.
                utf_chr = True
            else:
                continue
        try:
            # Decode with the same codec used to encode, so Latin-1
            # characters above 0x7F are kept rather than dropped.
            outfile.write(data.decode("iso-8859-1"))
        except UnicodeEncodeError:
            # The stream's own encoding cannot represent the character:
            # best effort, keep only the ASCII part.
            outfile.write(data.decode("ascii", "ignore"))
    return utf_chr
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment