# christopherkullenberg/pdftotext2.py

## pdftotext2.py
from subprocess import Popen, PIPE, STDOUT
from nltk.tokenize import sent_tokenize #make sure to install the full corpus.
import re

# Absolute path of the PDF that is handed to pdftoText() by the loop at the
# bottom of this script.
aFile = '/home/christopher/Desktop/Introduction to Computation and Programming Using Python, Revised - Guttag, John V..pdf'

def pdftoText(filename):
    '''
    Extract the text of a PDF by running the external pdftotext tool.

    Input: path of a PDF file.
    Output: the text pdftotext wrote to standard output, decoded as UTF-8.
    Raises: RuntimeError if pdftotext exits with a non-zero status; the
            message carries whatever the tool printed on standard error.
    '''
    # "-" makes pdftotext write the text to stdout instead of a .txt file.
    # "-enc UTF-8" pins the output encoding so it matches the decode below
    # even on builds whose default encoding is not UTF-8.
    # stderr gets its own pipe so warnings and error messages are never
    # mixed into the extracted text.
    p = Popen(['pdftotext', '-enc', 'UTF-8', filename, '-'],
              shell=False, stdout=PIPE, stderr=PIPE)
    content, err = p.communicate()
    if p.returncode != 0:
        # Previously the error text was returned as if it were PDF content.
        raise RuntimeError('pdftotext failed with exit status %d: %s'
                           % (p.returncode,
                              err.decode('utf-8', errors='replace').strip()))
    return content.decode('utf-8')

def sentencetokenizer(text):
    '''
    Input: a string of text.
    Output: whatever nltk's sent_tokenize returns for that text
            (its split of the text into sentences).
    '''
    return sent_tokenize(text)

# Convert the PDF to text, split the text into sentences, and print every
# sentence preceded by a separator line.
for sentence in sentencetokenizer(pdftoText(aFile)):
    print("******************")
    print(sentence)