Skip to content

Instantly share code, notes, and snippets.

@Deepayan137
Created October 29, 2017 08:56
Show Gist options
  • Save Deepayan137/304086ccf6738c9ce3a4766d14635efa to your computer and use it in GitHub Desktop.
Save Deepayan137/304086ccf6738c9ce3a4766d14635efa to your computer and use it in GitHub Desktop.
Compute costs for different correction methods
import sys
import os
from ocr import GravesOCR
from postproc.dictionary import Dictionary
from parser import read_book
import json
from cost_model import CostModel
from timekeep import Timer
from parser.convert import page_to_unit
import parser.webtotrain as webtotrain
from parser.nlp import extract_words
from collections import Counter
import pdb
def cost_model(**kwargs):
    """Compute the manual-correction cost for a batch of words.

    Keyword Args:
        excluded: count of words NOT covered by the dictionary — each must
            be fully typed by the annotator. ``None`` is treated as 0.
        included: count of words covered by the dictionary — each can be
            selected from suggestions. ``None`` is treated as 0.
        method: optional label naming the costing method; callers in this
            file do not pass it, so it must not be mandatory.

    Returns:
        Weighted cost: per-unit typing cost times excluded words plus
        per-unit selection cost times included words.

    NOTE(review): ``unit_typing_cost`` and ``unit_selection_cost`` are
    module-level globals not defined in this view — confirm they are
    initialised before any call.
    """
    tc, sc = unit_typing_cost, unit_selection_cost
    # Bug fix: was kwargs['method'] — KeyError for both callers, which
    # never supply a 'method' keyword.
    method = kwargs.get('method')
    # `or 0` so a caller passing None (see naive()) does not TypeError.
    in_dictionary = kwargs.get('included') or 0
    not_in_dictionary = kwargs.get('excluded') or 0
    # Bug fix: original returned tc*not_in_dictionary + sc*in_ictionary,
    # a misspelling that raised NameError on every call.
    return tc * not_in_dictionary + sc * in_dictionary
def naive(errors):
    """Cost of the no-spell-checker baseline: retype every erroneous word.

    Args:
        errors: either a total error count, or (as the caller in this file
            passes) a per-prediction list of error counts, which is summed.

    Returns:
        The typing cost of all errors, with zero selection cost.
    """
    # Accept the list the caller actually passes; a bare number still works.
    total_errors = sum(errors) if isinstance(errors, (list, tuple)) else errors
    # Bug fix: included was None, which broke the arithmetic in cost_model.
    return cost_model(excluded=total_errors, included=0)
def spell_check(correctable, uncorrectable):
    """Cost when a spell-checker is available.

    Correctable words are cheap (selected from suggestions); the
    uncorrectable remainder must still be typed in full.
    """
    return cost_model(included=correctable, excluded=uncorrectable)
def simulate(ocr, em, book_locs, book_index):
    """Run OCR over one held-out book and compare correction strategies.

    The book at ``book_index`` is held out for recognition; the remaining
    books seed the error model's vocabulary.

    Args:
        ocr: recognizer with a ``recognize(image)`` method (GravesOCR).
        em: error model / dictionary with ``error``, ``suggest`` and
            vocabulary-enhancement methods.
        book_locs: list of book directory paths; mutated (the held-out
            entry is popped).
        book_index: index of the book to recognize.

    Returns:
        Tuple ``(cost_naive, cost_spellcheck)`` — the caller unpacks two
        values, which the original (returning None) could not satisfy.
    """
    timer = Timer()  # bug fix: `timer` was used but never created in this scope
    book_path = book_locs.pop(book_index)
    full_text = '\n'.join(map(webtotrain.full_text, book_locs))
    words = extract_words(full_text)
    em.enhance_vocab_with_books(words)

    timer.start("read images")
    pagewise = webtotrain.read_book(book_path)
    page_count = len(pagewise)
    images, truths = page_to_unit(pagewise)
    n_images = len(images)

    timer.start("ocr, recognize")
    predictions = [ocr.recognize(image) for image in images]
    errors = [em.error(prediction) for prediction in predictions]
    cost_naive = naive(errors)

    # Bug fix: original read `for i,v enumerate(errors)` (missing `in`),
    # and defined `err_indices` but iterated `error_indices`.
    error_indices = [i for i, v in enumerate(errors) if v != 0]
    correctable, uncorrectable = 0, 0
    vocab = []
    for index in error_indices:
        # Bug fix: original passed undefined name `vocabulary`; the list
        # being grown of confirmed truths is `vocab`.
        em.enhance_vocabulary(vocab)
        suggestions = em.suggest(predictions[index])
        if truths[index] in suggestions:  # was misspelled `suggetions`
            correctable += 1              # was misspelled `corectable`
            vocab.append(truths[index])
        else:
            uncorrectable += 1
    cost_spellcheck = spell_check(correctable, uncorrectable)
    return cost_naive, cost_spellcheck
if __name__ == '__main__':
    # Usage: script.py <config.json> <book_index> <lang>
    with open(sys.argv[1]) as config_file:  # bug fix: original leaked the file handle
        config = json.load(config_file)
    book_index = int(sys.argv[2])
    lang = sys.argv[3]
    output_dir = 'postproc_analysis'
    ocr = GravesOCR(config["model"], config["lookup"])
    error = Dictionary(**config["error"])
    # Build absolute-ish book paths: <dir><book>/ for each configured book.
    book_locs = list(map(lambda x: config["dir"] + x + '/', config["books"]))
    err_count, gt_count = simulate(ocr, error, book_locs, book_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment