Skip to content

Instantly share code, notes, and snippets.

@Deepayan137
Created October 29, 2017 08:56
Show Gist options
  • Save Deepayan137/304086ccf6738c9ce3a4766d14635efa to your computer and use it in GitHub Desktop.
Save Deepayan137/304086ccf6738c9ce3a4766d14635efa to your computer and use it in GitHub Desktop.
Compute costs for different correction methods
import sys
import os
from ocr import GravesOCR
from postproc.dictionary import Dictionary
from parser import read_book
import json
from cost_model import CostModel
from timekeep import Timer
from parser.convert import page_to_unit
import parser.webtotrain as webtotrain
from parser.nlp import extract_words
from collections import Counter
import pdb
def cost_model(**kwargs):
    """Compute the manual-correction cost for a batch of words.

    Keyword Args:
        excluded: count of words NOT covered by the dictionary — each must
            be fully typed by the annotator. ``None`` is treated as 0.
        included: count of words covered by the dictionary — each can be
            selected from suggestions. ``None`` is treated as 0.
        method: optional label naming the costing method; callers in this
            file do not pass it, so it must not be mandatory.

    Returns:
        Weighted cost: per-unit typing cost times excluded words plus
        per-unit selection cost times included words.

    NOTE(review): ``unit_typing_cost`` and ``unit_selection_cost`` are
    module-level globals not defined in this view — confirm they are
    initialised before any call.
    """
    tc, sc = unit_typing_cost, unit_selection_cost
    # Bug fix: was kwargs['method'] — KeyError for both callers, which
    # never supply a 'method' keyword.
    method = kwargs.get('method')
    # `or 0` so a caller passing None (see naive()) does not TypeError.
    in_dictionary = kwargs.get('included') or 0
    not_in_dictionary = kwargs.get('excluded') or 0
    # Bug fix: original returned tc*not_in_dictionary + sc*in_ictionary,
    # a misspelling that raised NameError on every call.
    return tc * not_in_dictionary + sc * in_dictionary
def naive(errors):
    """Cost of the no-spell-checker baseline: retype every erroneous word.

    Args:
        errors: either a total error count, or (as the caller in this file
            passes) a per-prediction list of error counts, which is summed.

    Returns:
        The typing cost of all errors, with zero selection cost.
    """
    # Accept the list the caller actually passes; a bare number still works.
    total_errors = sum(errors) if isinstance(errors, (list, tuple)) else errors
    # Bug fix: included was None, which broke the arithmetic in cost_model.
    return cost_model(excluded=total_errors, included=0)
def spell_check(correctable, uncorrectable):
    """Cost when a spell-checker is available.

    Correctable words are cheap (selected from suggestions); the
    uncorrectable remainder must still be typed in full.
    """
    return cost_model(included=correctable, excluded=uncorrectable)
def simulate(ocr, em, book_locs, book_index):
    """Run OCR over one held-out book and compare correction strategies.

    The book at ``book_index`` is held out for recognition; the remaining
    books seed the error model's vocabulary.

    Args:
        ocr: recognizer with a ``recognize(image)`` method (GravesOCR).
        em: error model / dictionary with ``error``, ``suggest`` and
            vocabulary-enhancement methods.
        book_locs: list of book directory paths; mutated (the held-out
            entry is popped).
        book_index: index of the book to recognize.

    Returns:
        Tuple ``(cost_naive, cost_spellcheck)`` — the caller unpacks two
        values, which the original (returning None) could not satisfy.
    """
    timer = Timer()  # bug fix: `timer` was used but never created in this scope
    book_path = book_locs.pop(book_index)
    full_text = '\n'.join(map(webtotrain.full_text, book_locs))
    words = extract_words(full_text)
    em.enhance_vocab_with_books(words)

    timer.start("read images")
    pagewise = webtotrain.read_book(book_path)
    page_count = len(pagewise)
    images, truths = page_to_unit(pagewise)
    n_images = len(images)

    timer.start("ocr, recognize")
    predictions = [ocr.recognize(image) for image in images]
    errors = [em.error(prediction) for prediction in predictions]
    cost_naive = naive(errors)

    # Bug fix: original read `for i,v enumerate(errors)` (missing `in`),
    # and defined `err_indices` but iterated `error_indices`.
    error_indices = [i for i, v in enumerate(errors) if v != 0]
    correctable, uncorrectable = 0, 0
    vocab = []
    for index in error_indices:
        # Bug fix: original passed undefined name `vocabulary`; the list
        # being grown of confirmed truths is `vocab`.
        em.enhance_vocabulary(vocab)
        suggestions = em.suggest(predictions[index])
        if truths[index] in suggestions:  # was misspelled `suggetions`
            correctable += 1              # was misspelled `corectable`
            vocab.append(truths[index])
        else:
            uncorrectable += 1
    cost_spellcheck = spell_check(correctable, uncorrectable)
    return cost_naive, cost_spellcheck
if __name__ == '__main__':
    # Usage: script.py <config.json> <book_index> <lang>
    with open(sys.argv[1]) as config_file:  # bug fix: original leaked the file handle
        config = json.load(config_file)
    book_index = int(sys.argv[2])
    lang = sys.argv[3]
    output_dir = 'postproc_analysis'
    ocr = GravesOCR(config["model"], config["lookup"])
    error = Dictionary(**config["error"])
    # Build absolute-ish book paths: <dir><book>/ for each configured book.
    book_locs = list(map(lambda x: config["dir"] + x + '/', config["books"]))
    err_count, gt_count = simulate(ocr, error, book_locs, book_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment