Skip to content

Instantly share code, notes, and snippets.

@JulianaGuama
Last active May 15, 2019 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JulianaGuama/7de0f401da3dffeb65dc319198d95a98 to your computer and use it in GitHub Desktop.
Save JulianaGuama/7de0f401da3dffeb65dc319198d95a98 to your computer and use it in GitHub Desktop.
Scrap de um pdf-imagem com tesseract - janela deslizante
#imports
from PIL import Image
import pytesseract as ptr
import cv2
# NOTE(review): this is only a Python variable; nothing in this script exports
# it to the process environment, so it presumably has no effect on Tesseract.
# Confirm whether os.environ["TESSDATA_PREFIX"] was intended.
TESSDATA_PREFIX = r'C:/Users/your-user/AppData/Local/Tesseract-OCR'
TESSERACT_CMD = r"C:\Users\your-user\AppData\Local\Tesseract-OCR\tesseract.exe"
filename = r'C:/Users/your-user/fileLocal/file.jpg'

# Sliding-window size in pixels. The original script used xbox/ybox without
# ever defining them; these values are placeholders to be tuned per document.
xbox = 100  # window height: extent along shape[0] (image rows)
ybox = 100  # window width: extent along shape[1] (image columns)
# Distance in pixels between consecutive windows. 1 reproduces the original
# loops, but triggers one OCR call per pixel offset; raise it to speed up.
step = 1


def window_starts(length, box, step=1):
    """Return the start offsets of every window of size *box* inside *length*.

    Args:
        length: Size of the axis being scanned, in pixels.
        box: Size of the window along that axis, in pixels.
        step: Distance between consecutive window starts.

    Returns:
        A range of offsets ``s`` such that ``s + box <= length``. The window
        that ends exactly on the border is included. The range is empty when
        the window is larger than the axis.

    Raises:
        ValueError: If *box* or *step* is not positive.
    """
    if box <= 0 or step <= 0:
        raise ValueError("box and step must be positive")
    return range(0, length - box + 1, step)


def main():
    """Run OCR over every sliding window of the image and print the text.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``filename``.
    """
    ptr.pytesseract.tesseract_cmd = TESSERACT_CMD

    # cv2.imread(filename, 0) is an equivalent way to load as grayscale.
    pdf = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    if pdf is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError("could not read image: " + filename)

    # shape[0] is the number of rows (height), shape[1] the number of columns.
    rows, cols = pdf.shape[:2]
    for x in window_starts(rows, xbox, step):
        for y in window_starts(cols, ybox, step):
            # One 2-D index crops rows and columns together; chaining
            # [a:b][c:d] would slice the rows twice and never the columns.
            window = pdf[x:x + xbox, y:y + ybox]
            pdfNF = ptr.image_to_string(window, lang='por')
            print(pdfNF)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment