Last active
June 7, 2020 12:12
-
-
Save serrasqueiro/62bf3ce8fcbd63433b75fb25b51addda to your computer and use it in GitHub Desktop.
read_pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# (c)2020 Henrique Moreira
""" read_pdf.py, a simple PDF reader
"""
import sys | |
import os.path | |
from sys import stdout, stderr | |
import PyPDF2 | |
import textract | |
# pylint: disable=missing-function-docstring
def main():
    """Command-line entry point.

    Hands the process's standard streams and its command-line arguments
    (without the program name) to ``read_pdf`` and exits with status 0
    when it reports success, 1 otherwise.
    """
    succeeded = read_pdf(stdout, stderr, sys.argv[1:])
    if succeeded:
        sys.exit(0)
    sys.exit(1)
def read_pdf(outfile, errfile, args):
    """Extract the text of a PDF and dump it to a stream or to a new file.

    Args:
        outfile: Stream that receives the text when ``args`` names no
            output file. Must be truthy.
        errfile: Stream that receives error messages and notes; it is also
            handed to ``pdf_strings`` as its progress stream. Must be truthy.
        args: Positional arguments ``[pdf_path [, output_path]]``. With no
            arguments a built-in default PDF path is used. The list is
            never modified.

    Returns:
        True when the text was dumped. False when more than two arguments
        were given or when ``output_path`` already exists (existing files
        are never overwritten).
    """
    assert outfile
    assert errfile
    # Work on a copy so the caller's list is left untouched.
    params = list(args or [])
    if len(params) > 2:
        # Explicit check instead of an assert, which vanishes under -O.
        errfile.write("Unexpected extra argument(s): {}\n".format(params[2:]))
        return False
    filename = params[0] if params else "/home/henrique/test.pdf"

    created = None
    if len(params) == 2:
        outname = params[1]
        try:
            # Mode "x" creates the file atomically and fails if it already
            # exists, so there is no gap between the check and the open.
            created = open(outname, "xb")
        except FileExistsError:
            errfile.write("Cowardly refusing to overwrite: {}\n"
                          "".format(outname))
            return False
        outfile = created

    try:
        text, alt_text, _ = pdf_strings(filename, errfile)
        dump_text(outfile, errfile, text)
    finally:
        # Only close what this function opened, never the caller's stream.
        if created is not None:
            created.close()

    if text != alt_text:
        errfile.write("Note: alt_text, {} byte(s)\n".format(len(alt_text)))
    return True
def pdf_strings(filename, progress=None, do_extract=False):
    """Extract the text of every page of a PDF file.

    Args:
        filename: Path of the PDF file to read.
        progress: Optional writable text stream. When given, the page count
            and the extracted text size of each page are reported on it.
        do_extract: When true, the textract/tesseract extraction is run even
            if PyPDF2 returned some text.

    Returns:
        A tuple ``(text, alt_text, pages_read)``:
        ``text`` is the PyPDF2 text of all pages, pages separated by a blank
        line and the whole terminated by a newline;
        ``alt_text`` is the output of ``textract.process`` when that
        fallback ran, otherwise the same object as ``text``;
        ``pages_read`` is the number of pages extracted.
    """
    page_texts = []
    # All pages are pulled inside the ``with`` block because the reader
    # works on the open stream; the handle is closed as soon as we are done.
    with open(filename, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        num_pages = reader.numPages
        if progress:
            progress.write("Reading {} page(s)\n".format(num_pages))
        # range() copes with a zero-page document, which would otherwise
        # fail on the first getPage() call.
        for page_no in range(1, num_pages + 1):
            page_text = reader.getPage(page_no - 1).extractText()
            if progress:
                progress.write("Reading page {} (text size: {})\n".format(page_no, len(page_text)))
            page_texts.append(page_text)

    text = "\n\n".join(page_texts) + "\n"

    # PyPDF2 cannot read scanned files and then yields no usable text.
    # The joined text always contains the page separators, so test for
    # "nothing but whitespace" rather than for the empty string.
    if do_extract or not text.strip():
        alt_text = textract.process(filename, method='tesseract', language='eng')
    else:
        alt_text = text
    return text, alt_text, len(page_texts)
def dump_text(outfile, errfile, text):
    """Write ``text`` to ``outfile`` restricted to ISO-8859-1 characters.

    The bullet character U+2022 is replaced by ``(o)``. Any other character
    outside ISO-8859-1 is replaced by ``[?]`` and, when ``errfile`` is
    given, reported there. The result is written as ISO-8859-1 bytes; if
    the stream rejects bytes (a text stream), the same characters are
    written as ``str`` instead, so Latin-1 letters are preserved.

    Args:
        outfile: Binary or text stream to write to.
        errfile: Optional text stream for conversion warnings.
        text: The string to dump.

    Returns:
        -1 when ``outfile`` is falsy; True when ``outfile`` turned out to be
        a text stream; False otherwise (including for empty ``text``, in
        which case nothing is written).
    """
    if not outfile:
        return -1
    substitute = "[?]"
    pieces = []
    for ch in text:
        if ch == "\u2022":
            pieces.append("(o)")
        elif ord(ch) > 0xFF:
            # ISO-8859-1 covers exactly the code points U+0000..U+00FF.
            if errfile:
                errfile.write("Cannot convert 0x{:04x}, assuming: {}\n".format(ord(ch), substitute))
            pieces.append(substitute)
        else:
            pieces.append(ch)
    if not pieces:
        return False

    cleaned = "".join(pieces)
    try:
        # Encode once at the I/O boundary and write in a single call.
        outfile.write(cleaned.encode("iso-8859-1"))
    except TypeError:
        # Text streams refuse bytes; write the sanitized string itself.
        outfile.write(cleaned)
        return True
    return False
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment