kylegallatin/pdf2text.py

## pdf2text.py
from pdf2image import convert_from_path
import pytesseract

def pdf2txt(pdf_path, num_pages=10, dpi=500):
    """OCR the leading pages of a PDF and return their text.

    The pages are rasterised in memory with pdf2image's
    ``convert_from_path`` and each resulting image is passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.
        num_pages: Number of the last page to convert (1-based, inclusive),
            i.e. how many leading pages are processed.
        dpi: Resolution handed to ``convert_from_path``. Defaults to 500,
            the value that used to be hard-coded here.

    Returns:
        A list of dicts in page order, each holding ``'page'`` (the 1-based
        page number) and ``'text'`` (the OCR output for that page).
    """
    ## pdf2image numbers pages from 1, so the first page requested is 1
    ## (0 is outside the valid range)
    pages = convert_from_path(pdf_path, dpi, first_page=1, last_page=num_pages)
    ## one dict per page; enumerate(start=1) gives the 1-based page number
    docs = [
        {'page': page_number, 'text': pytesseract.image_to_string(page)}
        for page_number, page in enumerate(pages, start=1)
    ]
    print("Done")
    return docs
	from pdf2image import convert_from_path
	import pytesseract

	def pdf2txt(pdf_path, num_pages=10, dpi=500):
		"""OCR the leading pages of a PDF and return their text.

		The pages are rasterised in memory with pdf2image's
		``convert_from_path`` and each resulting image is passed to
		``pytesseract.image_to_string``.

		Args:
			pdf_path: Path of the PDF file to read.
			num_pages: Number of the last page to convert (1-based,
				inclusive), i.e. how many leading pages are processed.
			dpi: Resolution handed to ``convert_from_path``. Defaults to
				500, the value that used to be hard-coded here.

		Returns:
			A list of dicts in page order, each holding ``'page'`` (the
			1-based page number) and ``'text'`` (the OCR output).
		"""
		## pdf2image numbers pages from 1, so the first page requested is 1
		## (0 is outside the valid range)
		pages = convert_from_path(pdf_path, dpi, first_page=1, last_page=num_pages)
		## one dict per page; enumerate(start=1) gives the 1-based page number
		docs = [
			{'page': page_number, 'text': pytesseract.image_to_string(page)}
			for page_number, page in enumerate(pages, start=1)
		]
		print("Done")
		return docs