Created
October 29, 2017 08:56
-
-
Save Deepayan137/304086ccf6738c9ce3a4766d14635efa to your computer and use it in GitHub Desktop.
compute costs for different methods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from ocr import GravesOCR | |
from postproc.dictionary import Dictionary | |
from parser import read_book | |
import json | |
from cost_model import CostModel | |
from timekeep import Timer | |
from parser.convert import page_to_unit | |
import parser.webtotrain as webtotrain | |
from parser.nlp import extract_words | |
from collections import Counter | |
import pdb | |
def cost_model(**kwargs):
    """Return the total correction cost for one correction method.

    Keyword Args:
        included: number of errors fixable by dictionary selection
            (may be ``None``, treated as 0 -- ``naive`` passes None).
        excluded: number of errors that must be fully retyped
            (may be ``None``, treated as 0).
        method: optional name of the method being costed (unused).

    NOTE(review): ``unit_typing_cost`` and ``unit_selection_cost`` are
    module-level globals not defined anywhere in this file -- confirm
    they are injected elsewhere before running.
    """
    tc, sc = unit_typing_cost, unit_selection_cost
    # .get() instead of [] -- callers (naive, spell_check) never pass 'method',
    # so the original kwargs['method'] raised KeyError.
    method = kwargs.get('method')
    # `or 0` guards against included=None (as passed by naive).
    in_dictionary = kwargs['included'] or 0
    not_in_dictionary = kwargs['excluded'] or 0
    # Fixed typo: original multiplied by undefined name `in_ictionary`.
    return tc * not_in_dictionary + sc * in_dictionary
def naive(errors):
    """Cost of the baseline strategy: every error is retyped by hand,
    with no dictionary-assisted selection."""
    return cost_model(excluded=errors, included=None)
def spell_check(correctable, uncorrectable):
    """Cost when a spell-checker handles the correctable errors (cheap
    selection) and only the uncorrectable ones are retyped."""
    return cost_model(included=correctable, excluded=uncorrectable)
def simulate(ocr, em, book_locs, book_index):
    """Run OCR on one book and compare naive vs spell-check correction cost.

    Args:
        ocr: recognizer exposing ``.recognize(image)``.
        em: error module exposing ``.error``, ``.suggest``,
            ``.enhance_vocab_with_books`` and ``.enhance_vocabulary``.
        book_locs: list of book paths; MUTATED -- the book at
            ``book_index`` is popped out as the test book, the rest
            seed the vocabulary.
        book_index: index of the book to evaluate.

    Returns:
        ``(cost_naive, cost_spellcheck)`` -- the __main__ caller
        unpacks two values (original returned None, a bug).
    """
    book_path = book_locs.pop(book_index)
    # Vocabulary comes from the full text of every *other* book.
    full_text = '\n'.join(map(webtotrain.full_text, book_locs))
    words = extract_words(full_text)
    em.enhance_vocab_with_books(words)

    # Original referenced an undefined `timer`; instantiate the imported
    # Timer here. NOTE(review): assumes Timer() takes no arguments -- confirm.
    timer = Timer()
    timer.start("read images")
    pagewise = webtotrain.read_book(book_path)
    images, truths = page_to_unit(pagewise)

    timer.start("ocr, recognize")
    predictions = [ocr.recognize(image) for image in images]
    errors = [em.error(prediction) for prediction in predictions]
    cost_naive = naive(errors)

    # Indices of units flagged erroneous. (Fixed: missing `in` in the
    # comprehension, and the err_indices/error_indices name mismatch.)
    error_indices = [i for i, v in enumerate(errors) if v != 0]
    correctable, uncorrectable = 0, 0
    vocab = []
    for index in error_indices:
        # Feed the truths recovered so far back into the vocabulary.
        # NOTE(review): original passed undefined `vocabulary`; `vocab`
        # (the accumulating list below) is the only plausible referent.
        em.enhance_vocabulary(vocab)
        suggestions = em.suggest(predictions[index])  # fixed `suggetions` typo
        if truths[index] in suggestions:
            correctable += 1  # fixed `corectable` typo
            vocab.append(truths[index])
        else:
            uncorrectable += 1
    cost_spellcheck = spell_check(correctable, uncorrectable)
    return cost_naive, cost_spellcheck
if __name__ == '__main__':
    # Usage: script.py <config.json> <book_index> <lang>
    # Fixed: json.load(open(...)) leaked the file handle; `with` closes it.
    with open(sys.argv[1]) as config_file:
        config = json.load(config_file)
    book_index = int(sys.argv[2])
    lang = sys.argv[3]
    output_dir = 'postproc_analysis'
    ocr = GravesOCR(config["model"], config["lookup"])
    error = Dictionary(**config["error"])
    # Each entry in config["books"] is a directory name under config["dir"].
    book_locs = list(map(lambda x: config["dir"] + x + '/', config["books"]))
    err_count, gt_count = simulate(ocr, error, book_locs, book_index)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment