Skip to content

Instantly share code, notes, and snippets.

@KoStard
Created February 7, 2021 19:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KoStard/ccc41211615ea89b172c182976660861 to your computer and use it in GitHub Desktop.
Save KoStard/ccc41211615ea89b172c182976660861 to your computer and use it in GitHub Desktop.
OCR Armenian text from PDF and generate PDF from results
"""
python3 convert.py input_file.pdf output_file.pdf
"""
import pdf2image
import io
from PyPDF2 import PdfFileReader, PdfFileWriter
import sys
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
def pdf_to_img(pdf_file_path):
    """Render the PDF at ``pdf_file_path`` to images.

    Thin wrapper around ``pdf2image.convert_from_path``; its result is
    returned unchanged. The caller in this file iterates over it and
    OCRs each element (presumably one image per page -- confirm against
    the pdf2image documentation).
    """
    page_images = pdf2image.convert_from_path(pdf_file_path)
    return page_images
def ocr_core(file):
    """OCR ``file`` with Tesseract using the Armenian ('hye') language data.

    Returns whatever ``pytesseract.image_to_pdf_or_hocr`` produces for the
    given input; the caller in this file wraps that value in
    ``io.BytesIO`` and parses it as a PDF.
    """
    return pytesseract.image_to_pdf_or_hocr(file, lang='hye')
def print_pages(pdf_file_path, output_stream):
    """OCR each page of a PDF and write the merged result to a stream.

    The input PDF is converted to images with ``pdf_to_img``; every image
    is passed through ``ocr_core``, the returned PDF data is parsed, and
    its first page is appended to a single ``PdfFileWriter``. Progress
    messages are printed to stdout (page numbers are zero-based).

    Args:
        pdf_file_path: path of the PDF to read.
        output_stream: writable binary stream that receives the merged PDF.
    """
    print("Getting images")
    page_images = pdf_to_img(pdf_file_path)
    merged = PdfFileWriter()
    for pg, page_image in enumerate(page_images):
        print(f"Processing page {pg}")
        # Each OCR result is parsed as its own PDF document; only its
        # first page is taken and added to the combined output.
        ocr_pdf = io.BytesIO(ocr_core(page_image))
        merged.addPage(PdfFileReader(ocr_pdf).getPage(0))
    # Serialize once, after all pages have been collected.
    merged.write(output_stream)
if __name__ == "__main__":
    # Guarded so that importing this module for its helper functions does
    # not start a conversion as a side effect.
    if len(sys.argv) < 3:
        # Exit with a usage message instead of an IndexError on sys.argv.
        sys.exit(f"usage: python3 {sys.argv[0]} input_file.pdf output_file.pdf")

    # Build the whole result in memory before touching the output path.
    # Opening the output with 'wb' first would truncate it immediately,
    # leaving an empty file if the conversion fails and destroying the
    # input when both arguments name the same file.
    result = io.BytesIO()
    print_pages(sys.argv[1], result)
    with open(sys.argv[2], 'wb') as f:
        f.write(result.getvalue())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment