Created
July 2, 2020 12:10
-
-
Save tvdsluijs/25a8cb7e5c291650c0fefe4bdac6cea9 to your computer and use it in GitHub Desktop.
Get text from images with Python and pytesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import pytesseract | |
from pathlib import Path | |
from glob import glob | |
from os.path import join | |
class ImageOcr:
    """Run OCR over the images in one directory and collect the text in a file.

    Constructing an instance creates (or truncates) the output file. Each call
    to ``process_images`` then appends the text recognised in every matching
    image to that file.
    """

    # Glob patterns of the image files that are handed to pytesseract.
    IMAGE_PATTERNS = ('*.gif', '*.png', '*.jpg')

    def __init__(self, my_path: Path = None, filename: str = "ocr.txt"):
        """Remember the image directory and start with an empty output file.

        Args:
            my_path: Directory that contains the images to read.
            filename: Text file that receives the OCR output.

        Raises:
            ValueError: If ``my_path`` is not given.
        """
        if my_path is None:
            # Fail fast, before the output file is truncated: without a
            # directory, process_images() could only fail later with an
            # unhelpful TypeError raised from os.path.join().
            raise ValueError('Error no Path')
        self.my_path = my_path
        self.items = []
        self.filename = filename
        self.create_file()

    def process_images(self):
        """Append the OCR text of every matching image to the output file."""
        files = []
        for ext in self.IMAGE_PATTERNS:
            files.extend(glob(join(self.my_path, ext)))
        # glob() yields names in arbitrary order; sorting makes the order of
        # the text in the output file reproducible between runs.
        for file in sorted(files):
            self.write_to_file(pytesseract.image_to_string(file))

    def create_file(self):
        """Create the output file, emptying it if it already exists."""
        # The context manager closes the handle immediately instead of
        # leaving it open until garbage collection.
        with open(self.filename, "w", encoding="utf-8"):
            pass

    def write_to_file(self, my_str: str = ""):
        """Append ``my_str`` to the output file.

        Args:
            my_str: Text to append; nothing is added for an empty string.
        """
        # UTF-8 is set explicitly because OCR output may contain characters
        # that the platform default encoding cannot represent.
        with open(self.filename, "a", encoding="utf-8") as file:
            file.write(my_str)
def main(argv=None):
    """Command-line entry point: OCR every image in the given directory.

    Args:
        argv: Argument list including the program name; ``sys.argv`` is used
            when omitted.

    Returns:
        Exit status: 0 on success, 2 when the directory argument is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        print("usage: python img_ocr.py <path_to_my_images>", file=sys.stderr)
        return 2
    iocr = ImageOcr(my_path=Path(args[1]))
    iocr.process_images()
    return 0


if __name__ == "__main__":
    # python img_ocr.py c:\path_to_my_images
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment