Skip to content

Instantly share code, notes, and snippets.

@shion24hub
Last active November 22, 2023 13:19
Show Gist options
  • Save shion24hub/aeb77742a061d4777c7f071c269d610b to your computer and use it in GitHub Desktop.
Save shion24hub/aeb77742a061d4777c7f071c269d610b to your computer and use it in GitHub Desktop.
Elaborator for OCRed strings
from ocre import OCRElaborator
if __name__ == '__main__':
    # Usage example: OCR both sample images and print the cleaned,
    # space-joined text that the elaborator produces.
    elaborator = OCRElaborator(['test.png', 'test2.png'])
    print(elaborator.run())
'''
Use Tesseract as an OCR engine.
'''
import pytesseract
class OCRElaborator:
    """Run OCR over a list of images and tidy the recognized text.

    Each image is passed to Tesseract (via ``pytesseract``), then two
    clean-up passes are applied to its text, in this order:

    1. runs of consecutive newlines are collapsed to a single newline;
    2. words hyphenated across a line break are re-joined and every
       remaining newline is replaced by a space.

    The cleaned texts of all images are finally joined with single spaces.
    """

    # TODO: allow the user to specify which methods are applied.
    available_methods = [
        'delete_duplicated_CR',
        'cancel_hyphenation',
    ]

    def __init__(self, img_paths: list[str]) -> None:
        """Store the image paths to process.

        Args:
            img_paths: Paths of the images to OCR, in output order.
        """
        self.img_paths = img_paths

    @staticmethod
    def __ocr(img_path: str) -> str:
        """Return the text Tesseract recognizes in *img_path* (lang ``eng``)."""
        return pytesseract.image_to_string(img_path, lang='eng')

    @staticmethod
    def __delete_duplicated_CR(sentence: str) -> str:
        """Collapse each run of consecutive newlines into one newline.

        Despite the "CR" in the name, the character handled is ``'\\n'``.

        Args:
            sentence: Raw text to clean.

        Returns:
            *sentence* with every newline that directly follows another
            newline removed.
        """
        # `i == 0` is checked explicitly: the first character has no
        # predecessor, and `sentence[-1]` would wrap around to the last
        # character and wrongly drop a leading newline.
        return ''.join(
            char
            for i, char in enumerate(sentence)
            if char != '\n' or i == 0 or sentence[i - 1] != '\n'
        )

    @staticmethod
    def __cancel_hyphenation(sentence: str) -> str:
        """Undo line-break hyphenation and flatten the text to one line.

        Args:
            sentence: Text to clean.

        Returns:
            *sentence* with every ``'-\\n'`` pair removed (joining the two
            word halves) and every other newline replaced by a space.
        """
        # `str.replace` scans once, left to right, without re-examining
        # its own output, which matches a character-by-character pass that
        # drops a hyphen immediately followed by a newline. A leading
        # newline is turned into a space like any other.
        return sentence.replace('-\n', '').replace('\n', ' ')

    @staticmethod
    def __link(sentences: list[str]) -> str:
        """Join the per-image texts with a single space between them."""
        return ' '.join(sentences)

    def run(self) -> str:
        """OCR every image, clean each result, and return the joined text.

        Returns:
            The cleaned texts of all images in ``img_paths`` order, joined
            by single spaces; an empty string when there are no images.
        """
        proced_sentences = []
        for img_path in self.img_paths:
            sentence = self.__ocr(img_path)
            # Order matters: collapse blank lines first so that each line
            # break becomes exactly one space in the next step.
            sentence = self.__delete_duplicated_CR(sentence)
            sentence = self.__cancel_hyphenation(sentence)
            proced_sentences.append(sentence)
        return self.__link(proced_sentences)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment