Skip to content

Instantly share code, notes, and snippets.

@xiaochun-z
Last active May 18, 2019 03:20
Show Gist options
  • Save xiaochun-z/738b5c73b270129774a91d15eaf8cf24 to your computer and use it in GitHub Desktop.
Save xiaochun-z/738b5c73b270129774a91d15eaf8cf24 to your computer and use it in GitHub Desktop.
Suppress some of the other (non-text) colors in images before running OCR. Possibly required Python packages: imutils, numpy, opencv-python, Pillow, pytesseract.
from PIL import Image
import pytesseract
import cv2
from os import listdir
from os.path import isfile, join
def _binarize_for_ocr(gray, cutoff=170):
    """Return an inverted, lightly blurred binary version of *gray*.

    Pixels brighter than *cutoff* become 0 and all others become 255
    (``cv2.THRESH_BINARY_INV``); the result is then smoothed with a 3x3
    box blur before being handed to the OCR engine.

    Args:
        gray: Single-channel image as returned by ``cv2.imread`` in
            grayscale mode.
        cutoff: Threshold value passed to ``cv2.threshold``.

    Returns:
        The thresholded and blurred image.
    """
    _, mask = cv2.threshold(gray, cutoff, 255, cv2.THRESH_BINARY_INV)
    return cv2.blur(mask, (3, 3))


def main(path=r'C:\Users\c\Downloads\ocr',
         tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'):
    """Preprocess every image file in *path*, save it, and print its OCR text.

    For each regular file directly inside *path* the image is loaded in
    grayscale, binarized, written to ``<path>/out/<name>`` and passed to
    Tesseract; the recognized text is printed to stdout.

    Args:
        path: Directory containing the input images.
        tesseract_cmd: Location of the Tesseract executable.
    """
    # Local imports: the file's import header only brings in listdir/isfile/join.
    import sys
    from os import makedirs

    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # cv2.imwrite does not create missing directories, so make sure the
    # output folder exists before the loop starts.
    out_dir = join(path, 'out')
    makedirs(out_dir, exist_ok=True)

    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    for f in onlyfiles:
        img = cv2.imread(join(path, f), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread signals "could not decode" by returning None instead of
            # raising; skip such files rather than crashing in threshold().
            print('skipping {!r}: not a readable image'.format(f),
                  file=sys.stderr)
            continue

        new_image = _binarize_for_ocr(img)

        # imwrite reports failure through its return value.
        if not cv2.imwrite(join(out_dir, f), new_image):
            print('warning: could not write {!r}'.format(join(out_dir, f)),
                  file=sys.stderr)

        text = pytesseract.image_to_string(new_image)
        print(text)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment