# serrasqueiro/read_pdf.py

## read_pdf.py
# -*- coding: utf-8 -*-

# (c)2020  Henrique Moreira
""" read_pdf.py, a simple PDF reader
"""

import sys
import os.path
from sys import stdout, stderr
import PyPDF2
import textract

# pylint: disable=missing-function-docstring


def main():
    """Run the reader on the command-line arguments, then exit.

    Text goes to standard output and messages to standard error.  The
    process exits with status 0 when read_pdf() reports success, else 1.
    """
    cli_args = sys.argv[1:]
    succeeded = read_pdf(stdout, stderr, cli_args)
    if succeeded:
        sys.exit(0)
    sys.exit(1)


def read_pdf(outfile, errfile, args):
    """Extract the text of a PDF file and dump it to an output stream.

    Parameters:
        outfile: stream that receives the text when ``args`` names no
            output file.
        errfile: stream that receives progress and error messages.
        args: sequence ``[filename [outname]]``.  With no arguments the
            built-in default path is read.  The sequence is not modified.

    Returns:
        True on success.  False when more than two arguments are given or
        when ``outname`` already exists (an existing path is never
        overwritten); in both cases a message is written to ``errfile``.
    """
    assert outfile
    assert errfile
    # Work on a copy so the caller's list is left untouched.
    param = list(args)
    # Reject surplus arguments before anything is created on disk.
    if len(param) > 2:
        errfile.write("Too many arguments: {}\n".format(param[2:]))
        return False
    filename = param[0] if param else "/home/henrique/test.pdf"
    opened = None
    if len(param) > 1:
        outname = param[1]
        if os.path.exists(outname):
            errfile.write("Cowardly refusing to overwrite: {}\n"
                          "".format(outname))
            return False
        opened = open(outname, "wb")
        outfile = opened
    try:
        text, alt_text, _ = pdf_strings(filename, errfile)
        dump_text(outfile, errfile, text)
    finally:
        # Only close what this function opened, never the caller's stream.
        if opened is not None:
            opened.close()
    if text != alt_text:
        errfile.write("Note: alt_text, {} byte(s)\n".format(len(alt_text)))
    return True


def pdf_strings(filename, progress=None, do_extract=False):
    """Return the text found on every page of a PDF file.

    Parameters:
        filename: path of the PDF file to read.
        progress: optional writable text stream; when truthy it receives
            the page count and one line per page read.
        do_extract: when true, the textract/tesseract fallback is run even
            if PyPDF2 already returned some text.

    Returns:
        A tuple ``(text, alt_text, num_pages)``.  ``text`` is the PyPDF2
        text of all pages, separated by a blank line and ending with a
        newline.  ``alt_text`` is the textract result when the fallback
        ran, otherwise the same object as ``text``.
    """
    pages = []
    # PyPDF2 reads from the stream while pages are extracted, so all the
    # extraction happens before the file is closed.
    with open(filename, 'rb') as p_file_obj:
        pdf_read = PyPDF2.PdfFileReader(p_file_obj)
        num_pages = pdf_read.numPages
        if progress:
            progress.write("Reading {} page(s)\n".format(num_pages))
        # range() copes with a zero-page document, where getPage(0) would fail.
        for index in range(num_pages):
            a_str = pdf_read.getPage(index).extractText()
            if progress:
                progress.write("Reading page {} (text size: {})\n".format(index + 1, len(a_str)))
            pages.append(a_str)
    text = "\n\n".join(pages) + "\n"
    # PyPDF2 cannot read scanned files; fall back to OCR when it produced
    # nothing.  The page separators mean ``text`` is never the empty
    # string, so the check must ignore whitespace.
    if do_extract or not text.strip():
        # NOTE(review): textract.process appears to return bytes, so
        # alt_text may not be a str here -- confirm against textract docs.
        alt_text = textract.process(filename, method='tesseract', language='eng')
    else:
        alt_text = text
    return text, alt_text, num_pages


def dump_text(outfile, errfile, text):
    """Write ``text`` to ``outfile`` as ISO-8859-1, character by character.

    The bullet character U+2022 is written as "(o)".  Any other character
    that ISO-8859-1 cannot encode is written as "[?]" and, when ``errfile``
    is truthy, reported there.

    Bytes are written first.  If ``outfile`` rejects bytes with TypeError
    (a text stream), each piece is written as ``str`` instead, decoded as
    ASCII with errors ignored -- so non-ASCII characters are dropped on
    text streams.

    Parameters:
        outfile: destination stream; nothing is done when it is falsy.
        errfile: optional stream for conversion warnings.
        text: the string to write.

    Returns:
        -1 when ``outfile`` is falsy; True when the text-stream fallback
        was used; False otherwise.
    """
    if not outfile:
        return -1
    utf_chr = False
    placeholder = "[?]"
    for ch in text:
        if ch == chr(0x2022):
            ch = "(o)"
        try:
            data = ch.encode("iso-8859-1")
        except UnicodeEncodeError:
            if errfile:
                # Zero-padded so the code point reads as e.g. 0x0153.
                errfile.write("Cannot convert 0x{:04x}, assuming: {}\n".format(ord(ch), placeholder))
            data = placeholder.encode("ascii")
        if not utf_chr:
            try:
                outfile.write(data)
                continue
            except TypeError:
                # Text stream: stop retrying bytes for the remaining characters.
                utf_chr = True
        outfile.write(data.decode("ascii", "ignore"))
    return utf_chr


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
	# -*- coding: utf-8 -*-

	# (c)2020 Henrique Moreira
	""" read_pdf.py, a simple PDF reader
	"""

	import sys
	import os.path
	from sys import stdout, stderr
	import PyPDF2
	import textract

	# pylint: disable=missing-function-docstring


	def main():
	    """Run the reader on the command-line arguments, then exit.

	    Text goes to standard output and messages to standard error.  The
	    process exits with status 0 when read_pdf() reports success, else 1.
	    """
	    outfile = stdout
	    is_ok = read_pdf(outfile, stderr, sys.argv[1:])
	    sys.exit(0 if is_ok else 1)


	def read_pdf(outfile, errfile, args):
	    """Extract the text of a PDF file and dump it to an output stream.

	    Parameters:
	        outfile: stream that receives the text when ``args`` names no
	            output file.
	        errfile: stream that receives progress and error messages.
	        args: sequence ``[filename [outname]]``.  With no arguments the
	            built-in default path is read.  The sequence is not modified.

	    Returns:
	        True on success.  False when more than two arguments are given or
	        when ``outname`` already exists (an existing path is never
	        overwritten); in both cases a message is written to ``errfile``.
	    """
	    assert outfile
	    assert errfile
	    # Work on a copy so the caller's list is left untouched.
	    param = list(args)
	    # Reject surplus arguments before anything is created on disk.
	    if len(param) > 2:
	        errfile.write("Too many arguments: {}\n".format(param[2:]))
	        return False
	    filename = param[0] if param else "/home/henrique/test.pdf"
	    opened = None
	    if len(param) > 1:
	        outname = param[1]
	        if os.path.exists(outname):
	            errfile.write("Cowardly refusing to overwrite: {}\n"
	                          "".format(outname))
	            return False
	        opened = open(outname, "wb")
	        outfile = opened
	    try:
	        text, alt_text, _ = pdf_strings(filename, errfile)
	        dump_text(outfile, errfile, text)
	    finally:
	        # Only close what this function opened, never the caller's stream.
	        if opened is not None:
	            opened.close()
	    if text != alt_text:
	        errfile.write("Note: alt_text, {} byte(s)\n".format(len(alt_text)))
	    return True


	def pdf_strings(filename, progress=None, do_extract=False):
	    """Return the text found on every page of a PDF file.

	    Parameters:
	        filename: path of the PDF file to read.
	        progress: optional writable text stream; when truthy it receives
	            the page count and one line per page read.
	        do_extract: when true, the textract/tesseract fallback is run even
	            if PyPDF2 already returned some text.

	    Returns:
	        A tuple ``(text, alt_text, num_pages)``.  ``text`` is the PyPDF2
	        text of all pages, separated by a blank line and ending with a
	        newline.  ``alt_text`` is the textract result when the fallback
	        ran, otherwise the same object as ``text``.
	    """
	    pages = []
	    # PyPDF2 reads from the stream while pages are extracted, so all the
	    # extraction happens before the file is closed.
	    with open(filename, 'rb') as p_file_obj:
	        pdf_read = PyPDF2.PdfFileReader(p_file_obj)
	        num_pages = pdf_read.numPages
	        if progress:
	            progress.write("Reading {} page(s)\n".format(num_pages))
	        # range() copes with a zero-page document, where getPage(0) would fail.
	        for index in range(num_pages):
	            a_str = pdf_read.getPage(index).extractText()
	            if progress:
	                progress.write("Reading page {} (text size: {})\n".format(index + 1, len(a_str)))
	            pages.append(a_str)
	    text = "\n\n".join(pages) + "\n"
	    # PyPDF2 cannot read scanned files; fall back to OCR when it produced
	    # nothing.  The page separators mean ``text`` is never the empty
	    # string, so the check must ignore whitespace.
	    if do_extract or not text.strip():
	        # NOTE(review): textract.process appears to return bytes, so
	        # alt_text may not be a str here -- confirm against textract docs.
	        alt_text = textract.process(filename, method='tesseract', language='eng')
	    else:
	        alt_text = text
	    return text, alt_text, num_pages


	def dump_text(outfile, errfile, text):
	    """Write ``text`` to ``outfile`` as ISO-8859-1, character by character.

	    The bullet character U+2022 is written as "(o)".  Any other character
	    that ISO-8859-1 cannot encode is written as "[?]" and, when ``errfile``
	    is truthy, reported there.

	    Bytes are written first.  If ``outfile`` rejects bytes with TypeError
	    (a text stream), each piece is written as ``str`` instead, decoded as
	    ASCII with errors ignored -- so non-ASCII characters are dropped on
	    text streams.

	    Parameters:
	        outfile: destination stream; nothing is done when it is falsy.
	        errfile: optional stream for conversion warnings.
	        text: the string to write.

	    Returns:
	        -1 when ``outfile`` is falsy; True when the text-stream fallback
	        was used; False otherwise.
	    """
	    if not outfile:
	        return -1
	    utf_chr = False
	    placeholder = "[?]"
	    for ch in text:
	        if ch == chr(0x2022):
	            ch = "(o)"
	        try:
	            data = ch.encode("iso-8859-1")
	        except UnicodeEncodeError:
	            if errfile:
	                # Zero-padded so the code point reads as e.g. 0x0153.
	                errfile.write("Cannot convert 0x{:04x}, assuming: {}\n".format(ord(ch), placeholder))
	            data = placeholder.encode("ascii")
	        if not utf_chr:
	            try:
	                outfile.write(data)
	                continue
	            except TypeError:
	                # Text stream: stop retrying bytes for the remaining characters.
	                utf_chr = True
	        outfile.write(data.decode("ascii", "ignore"))
	    return utf_chr


	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()