Skip to content

Instantly share code, notes, and snippets.

@kylegallatin
Last active March 23, 2020 15:49
Show Gist options
  • Save kylegallatin/bb914c7a320651d938fe97cf17ca74a0 to your computer and use it in GitHub Desktop.
Save kylegallatin/bb914c7a320651d938fe97cf17ca74a0 to your computer and use it in GitHub Desktop.
from pdf2image import convert_from_path
import pytesseract
def pdf2txt(pdf_path, num_pages=10, dpi=500):
    """Run OCR over the leading pages of a PDF and return the text per page.

    The pages are rendered to images in memory with pdf2image and each
    image is passed to Tesseract through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.
        num_pages: Number of the last page to convert; conversion always
            starts at the first page of the document.
        dpi: Resolution used when rendering pages to images. Defaults to
            500, the value that was previously hard-coded.

    Returns:
        A list with one dictionary per converted page, in page order.
        Each dictionary has the keys 'page' (page number, counted from 1)
        and 'text' (the string returned by Tesseract for that page).
    """
    # pdf2image page numbers are 1-based, so request page 1 explicitly
    # instead of the out-of-range 0.
    pages = convert_from_path(
        pdf_path, dpi=dpi, first_page=1, last_page=num_pages
    )
    # Pair every page image with its 1-based page number and OCR it.
    docs = [
        {'page': page_number, 'text': pytesseract.image_to_string(page)}
        for page_number, page in enumerate(pages, start=1)
    ]
    print("Done")
    return docs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment