Skip to content

Instantly share code, notes, and snippets.

@rcsmit
Last active April 27, 2022 04:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcsmit/39f15319b91c9b9f12b4fcbe0a61d560 to your computer and use it in GitHub Desktop.
Save rcsmit/39f15319b91c9b9f12b4fcbe0a61d560 to your computer and use it in GitHub Desktop.
################################################################
#
# Recognize text in all image files in a certain directory
# and write it to separate text files, one combined text file, or a SQLite3 database
#
#################################################################
# Make sure you installed Pytesseract : https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i
# TODO : clear up the confusion between dir_name and rootdir
# add creation/modification date/time as field
# if there is an error, you have to start all over again with reading the directories
# Import libraries
from PIL import Image
import pytesseract
import os
import time
import sqlite3 as sl
import pandas as pd
def convert_txt_to_ocr(
    file_name,
    tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
):
    """Run OCR on one image file and return the recognized text.

    Args:
        file_name (string): path of the image file to read
        tesseract_cmd (string): path of the Tesseract executable that
            pytesseract should call; defaults to the Windows install path
            this script has always used
    Returns:
        text: the text in the image, or "Error reading <file_name>" when the
            image could not be opened or recognized
    """
    print(f"Converting {file_name}")
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    try:
        # Open the image in a context manager so its file handle is released
        # as soon as the OCR call is done instead of lingering per file.
        with Image.open(file_name) as image:
            text = str(pytesseract.image_to_string(image))
    except Exception as error:
        # Deliberately best-effort: one unreadable image must not abort the
        # whole run. Catching Exception (not a bare except) keeps Ctrl+C and
        # SystemExit working.
        print(f"Could not read {file_name}: {error}")
        text = f"Error reading {file_name}"
    return text
def txt_to_ocr(dir_name, item, extensionsToCheck):
    """Run OCR on `item` if it is an image and return the text found in it.

    Args:
        dir_name (string): directory that contains the file
        item (string): name of the file inside dir_name
        extensionsToCheck (list): file-name endings (matched case-sensitively)
            that mark a file as an image
    Returns:
        text: the recognized text, or None when the file is not an image or
            no visible text was found in it
    """
    # str.endswith accepts a tuple, so one call checks every extension.
    if not item.endswith(tuple(extensionsToCheck)):
        return None
    file_name = os.path.join(dir_name, item)
    print(f"Processing {file_name}")
    text = convert_txt_to_ocr(file_name)
    # OCR output for a blank image is typically only whitespace (e.g. a
    # trailing form feed), so test for visible characters, not for length.
    if text.strip():
        return text
    print(f"No text found in {file_name}")
    return None
def save_text(text, filename, modus, dir_name, rootdir):
    """Save the text to its own file or append it to one combined file.

    Args:
        text (string): the recognized text
        filename (string): name of the image file the text came from
        modus (string): "new" writes <image name>_<extension>.txt into
            dir_name; "append" adds the text to one combined file in rootdir;
            anything else only prints an error
        dir_name (string): directory that contains the image
        rootdir (string): directory that holds the combined text file
    """
    if modus == "new":
        # "photo.jpg" -> "photo_jpg": fold the image extension into the name.
        for extension in (".PNG", ".png", ".jpg", ".jpeg", ".JPG"):
            filename = filename.replace(extension, "_" + extension[1:])
        # Write next to the image rather than into the current working
        # directory, so same-named images from different subdirectories do
        # not overwrite each other's text file.
        filename_txt = os.path.join(dir_name, filename + ".txt")
        # OCR output regularly contains non-ASCII characters; an explicit
        # encoding avoids UnicodeEncodeError with the platform default.
        with open(filename_txt, "w", encoding="utf-8") as f:
            f.write(text)
    elif modus == "append":
        # NOTE(review): the leading space in the file name looks accidental;
        # kept so existing output files keep being appended to - confirm.
        file_to_save = rootdir + os.sep + " text_in_images.txt"
        with open(file_to_save, "a", encoding="utf-8") as f:
            print(f"Adding {filename} to textfile")
            # Backslash is escaped: a bare "\ " is an invalid escape sequence.
            f.write(f"\n\n=== {dir_name} \\ {filename} ===\n")
            f.write(text)
    else:
        print("ERROR IN 'modus'")
def insert_text_to_database(con, directory, filename, text):
    """Store one OCR result as a new row in the txt_from_images table.

    Args:
        con: open sqlite3 connection to the database
        directory (string): directory of the image
        filename (string): name of the image file
        text (string): the text recognized in the image
    """
    print(f"Writing {filename} to database")
    row = (directory, filename, text)
    # The connection is used as a context manager so the insert is committed
    # when the block finishes without an error.
    with con:
        con.execute(
            'INSERT INTO txt_from_images( directory, filename, text_in_image) values(?,?, ?)',
            row,
        )
def take_action_with_text(dir_name, modus, action, text, file_name, rootdir, con):
    """Send recognized text to the database or to a text file.

    Does nothing when text is None. An action other than "save_to_database"
    or "save_to_txt_file" only prints an error message.
    """
    if text is None:
        return
    if action == "save_to_database":
        insert_text_to_database(con, dir_name, file_name, text)
        return
    if action == "save_to_txt_file":
        save_text(text, file_name, modus, dir_name, rootdir)
        return
    print ("ERROR in 'action'")
def main():
    """Recognize the text in every image under dir_name and store it.

    The settings block at the top selects the directory, the image
    extensions, whether subdirectories are included, and whether the text
    goes to text files or to a SQLite database.
    """
    ######################################################################
    dir_name = r"C:\Users\rcxsm\Pictures\ocr_test"
    extensionsToCheck = [".jpg", ".JPG", ".jpeg", ".png", ".PNG"]
    modus = "new"  # 'new' to place text in separate text files, 'append' to put all text in one file
    to_do = "including_subdirectories"  # or "single_directory"
    action = "save_to_database"  # or "save_to_txt_file"
    #######################################################################
    con = None
    if action == "save_to_database":
        con = sl.connect(os.path.join(dir_name, "my_test.db"))
        # IF NOT EXISTS makes a rerun a no-op while real database errors
        # still surface, instead of every failure being reported as
        # "table exists already".
        with con:
            con.execute("""
                CREATE TABLE IF NOT EXISTS txt_from_images (
                    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                    directory TEXT,
                    filename TEXT,
                    text_in_image TEXT
                );
            """)
    started = time.time()
    try:
        if to_do == "single_directory":
            rootdir = dir_name
            print(f"Browsing {dir_name}")
            entries = os.listdir(dir_name)
            # Keep the working directory at dir_name so relative output
            # paths resolve inside the image directory.
            os.chdir(dir_name)
            for n, file in enumerate(entries, start=1):
                print(f"{n}/{len(entries)}")
                text = txt_to_ocr(dir_name, file, extensionsToCheck)
                take_action_with_text(dir_name, modus, action, text, file, rootdir, con)
        elif to_do == "including_subdirectories":
            rootdir = dir_name
            for subdir, _dirs, files in os.walk(dir_name):
                for file in files:
                    print(f"{subdir =}")
                    print(f"{file =}")
                    text = txt_to_ocr(subdir, file, extensionsToCheck)
                    take_action_with_text(subdir, modus, action, text, file, rootdir, con)
        else:
            print("ERROR IN 'to_do' variable")
    finally:
        # Release the database file even when processing stops with an error.
        if con is not None:
            con.close()
    elapsed = int(time.time() - started)
    print("Processing took " + str(elapsed) + " seconds ....")
# Start the OCR run only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment