# rlmacsween/form_baseline.py

## form_baseline.py
from collections import Counter

import pandas as pd

# Load the table stored in 'training.h5' once at import time and group its
# rows by the 'slug' column; get_sample() looks rows up through `g`.
df = pd.read_hdf('training.h5')
g = df.groupby('slug')


def get_sample(slug):
    """
    Return the rows of the module-level ``df`` that belong to ``slug``.

    The row labels come from the module-level groupby ``g``; looking up a
    slug that ``g.groups`` does not contain raises ``KeyError``.
    """
    # DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0.
    # g.groups stores index *labels*, so label-based .loc is the equivalent.
    return df.loc[g.groups[slug]]


def true_total(sample):
    """
    Return the ``token`` of the row holding the largest ``gross_amount``.

    ``sample`` is a DataFrame with ``gross_amount`` and ``token`` columns.
    """
    # idxmax() yields an index label, so the lookup must be label-based.
    # .loc replaces .ix, which was removed in pandas 1.0.
    top_label = sample.gross_amount.idxmax()
    return sample.loc[top_label, 'token']


def parse_dollar_amount(s):
    """
    Convert a token that begins with '$' into a float.

    Only numeric characters and '.' are kept, so separators such as ','
    are dropped. Returns None when the token does not start with '$' or
    when the kept characters do not form a valid float.
    """
    if s.startswith('$'):
        kept = ''.join(ch for ch in s if ch.isnumeric() or ch == '.')
        try:
            return float(kept)
        except ValueError:
            return None
    return None


def parse_non_dollar(s):
    """
    Convert a token that begins with a numeric character into a float.

    Tokens containing '/', '-' or '%' are rejected. Otherwise only numeric
    characters and '.' are kept before conversion. Returns None for empty
    tokens, rejected tokens, or when the kept characters are not a valid
    float.
    """
    if not s:
        return None
    if not s[0].isnumeric():
        return None
    for marker in '/-%':
        if marker in s:
            return None

    kept = ''.join(filter(lambda ch: ch.isnumeric() or ch == '.', s))
    try:
        return float(kept)
    except ValueError:
        return None


# NOTE(review): not referenced by any code visible in this file - confirm
# whether something elsewhere reads it before removing.
alignment_threshold = 2


def score(x, sample, dollar_counts, non_dollar_counts):
    """
    Build a sortable rank for one candidate row; larger tuples rank higher.

    Parameters:
        x: a row of values whose second-to-last element is the token text.
        sample: unused; kept so existing call sites keep working.
        dollar_counts: mapping from '$' token text to its occurrence count.
        non_dollar_counts: the same mapping for bare-number tokens.

    Returns a tuple that is compared lexicographically:
        (2, repeated, has_dot, value) for tokens parsed as '$' amounts,
        (1, repeated, has_dot, value) for tokens parsed as bare numbers,
        (0, 0, -1) for everything else,
    where ``repeated`` is 1 when the token occurs more than once and
    ``has_dot`` is 1 when the token contains '.'.
    """
    # Only the token is needed. Indexing from the end matches how
    # predict_total_amount reads it and avoids seven unused locals.
    token = x[-2]

    result = parse_dollar_amount(token)
    if result is not None:
        # Default to 0: a bare .get() returns None for an unseen token and
        # `None > 1` raises TypeError on Python 3.
        multiple = dollar_counts.get(token, 0) > 1
        return (2, int(multiple), int('.' in token), result)

    result = parse_non_dollar(token)
    if result is not None:
        multiple = non_dollar_counts.get(token, 0) > 1
        return (1, int(multiple), int('.' in token), result)

    return (0, 0, -1)


def argmax(L):
    """
    Return the index of the first element of L equal to max(L).

    Written by hand because the scores are tuples, which cannot be put in
    a numpy array.
    """
    largest = max(L)
    return next(
        (position for position, item in enumerate(L) if item == largest),
        None,
    )


def predict_total_amount(sample):
    """
    Guess which token in ``sample`` is the total amount.

    Every row whose token (the second-to-last column) parses as a '$'
    amount or as a bare number becomes a candidate. Candidates are ranked
    with ``score`` and the token of the first top-ranked row is returned.

    Returns None when no token in the sample parses as a number.
    """
    candidates = []
    dollar_tokens = []
    non_dollar_tokens = []
    # DataFrame.get_values() was deprecated in pandas 0.25 and removed in
    # 1.0; .values yields the same row arrays on old and new versions.
    for row in sample.values:
        token = row[-2]
        if parse_dollar_amount(token) is not None:
            candidates.append(row)
            dollar_tokens.append(token)
        elif parse_non_dollar(token) is not None:
            candidates.append(row)
            non_dollar_tokens.append(token)

    if not candidates:
        # argmax() calls max(), which raises ValueError on an empty list.
        return None

    dollar_counts = Counter(dollar_tokens)
    non_dollar_counts = Counter(non_dollar_tokens)
    scores = [
        score(row, sample, dollar_counts, non_dollar_counts)
        for row in candidates
    ]
    return candidates[argmax(scores)][-2]


if __name__ == '__main__':
    # Evaluate the baseline: predict a total for every slug, print each
    # miss, then print the percentage of exact matches.
    slugs = df.slug.unique()
    match_count = 0
    for slug in slugs:
        sample = get_sample(slug)
        estimate = predict_total_amount(sample)
        true = true_total(sample)
        if estimate == true:
            match_count += 1
        else:
            print(slug)
            print(estimate, true)
            print()

    length = len(slugs)
    print('Accuracy: ', 100 * match_count / length)
	from collections import Counter

	import pandas as pd

	# Load the table stored in 'training.h5' and group its rows by the
	# 'slug' column.
	df = pd.read_hdf('training.h5')
	g = df.groupby('slug')


	def get_sample(slug):
		"""
		Return the rows of ``df`` that belong to ``slug``.

		The row labels come from the groupby ``g``; looking up a slug that
		``g.groups`` does not contain raises ``KeyError``.
		"""
		# The body had lost its indentation, which is restored here.
		# DataFrame.ix was removed in pandas 1.0; g.groups stores index
		# labels, so label-based .loc is the equivalent.
		return df.loc[g.groups[slug]]


	def true_total(sample):
		"""
		Return the ``token`` of the row holding the largest ``gross_amount``.

		``sample`` is a DataFrame with ``gross_amount`` and ``token`` columns.
		"""
		# The body had lost its indentation, which is restored here.
		# idxmax() yields an index label, so the lookup must be label-based.
		# .loc replaces .ix, which was removed in pandas 1.0.
		top_label = sample.gross_amount.idxmax()
		return sample.loc[top_label, 'token']


	def parse_dollar_amount(s):
		"""
		Convert a token that begins with '$' into a float.

		Only numeric characters and '.' are kept, so separators such as ','
		are dropped. Returns None when the token does not start with '$' or
		when the kept characters do not form a valid float.
		"""
		# The nesting of this body had been flattened; it is restored here.
		if not s.startswith('$'):
			return None

		kept = ''.join(c for c in s if c.isnumeric() or c == '.')
		try:
			return float(kept)
		except ValueError:
			return None


	def parse_non_dollar(s):
		"""
		Convert a token that begins with a numeric character into a float.

		Tokens containing '/', '-' or '%' are rejected. Otherwise only
		numeric characters and '.' are kept before conversion. Returns None
		for empty tokens, rejected tokens, or when the kept characters are
		not a valid float.
		"""
		# The nesting of this body had been flattened; it is restored here.
		if not s or not s[0].isnumeric() or any(c in s for c in '/-%'):
			return None

		kept = ''.join(c for c in s if c.isnumeric() or c == '.')
		try:
			return float(kept)
		except ValueError:
			return None


	# NOTE(review): not referenced by any code visible in this file - confirm
	# whether something elsewhere reads it before removing.
	alignment_threshold = 2


	def score(x, sample, dollar_counts, non_dollar_counts):
		"""
		Build a sortable rank for one candidate row; larger tuples rank higher.

		Parameters:
			x: a row of values whose second-to-last element is the token text.
			sample: unused; kept so existing call sites keep working.
			dollar_counts: mapping from '$' token text to its occurrence count.
			non_dollar_counts: the same mapping for bare-number tokens.

		Returns a tuple that is compared lexicographically:
			(2, repeated, has_dot, value) for tokens parsed as '$' amounts,
			(1, repeated, has_dot, value) for tokens parsed as bare numbers,
			(0, 0, -1) for everything else,
		where ``repeated`` is 1 when the token occurs more than once and
		``has_dot`` is 1 when the token contains '.'.
		"""
		# The nesting of this body had been flattened; it is restored here.
		# Only the token is needed, so read it by position from the end.
		token = x[-2]

		result = parse_dollar_amount(token)
		if result is not None:
			# Default to 0: a bare .get() returns None for an unseen token
			# and `None > 1` raises TypeError on Python 3.
			multiple = dollar_counts.get(token, 0) > 1
			return (2, int(multiple), int('.' in token), result)

		result = parse_non_dollar(token)
		if result is not None:
			multiple = non_dollar_counts.get(token, 0) > 1
			return (1, int(multiple), int('.' in token), result)

		return (0, 0, -1)


	def argmax(L):
		"""
		Return the index of the first element of L equal to max(L).

		Written by hand because the scores are tuples, which cannot be put
		in a numpy array.
		"""
		# The nesting of this body had been flattened; it is restored here.
		m = max(L)
		for i, item in enumerate(L):
			if item == m:
				return i


	def predict_total_amount(sample):
		"""
		Guess which token in ``sample`` is the total amount.

		Every row whose token (the second-to-last column) parses as a '$'
		amount or as a bare number becomes a candidate. Candidates are
		ranked with ``score`` and the token of the first top-ranked row is
		returned.

		Returns None when no token in the sample parses as a number.
		"""
		# The nesting of this body had been flattened; it is restored here.
		candidates = []
		dollar_tokens = []
		non_dollar_tokens = []
		# DataFrame.get_values() was removed in pandas 1.0; .values yields
		# the same row arrays on old and new versions.
		for row in sample.values:
			token = row[-2]
			if parse_dollar_amount(token) is not None:
				candidates.append(row)
				dollar_tokens.append(token)
			elif parse_non_dollar(token) is not None:
				candidates.append(row)
				non_dollar_tokens.append(token)

		if not candidates:
			# argmax() calls max(), which raises ValueError on an empty list.
			return None

		dollar_counts = Counter(dollar_tokens)
		non_dollar_counts = Counter(non_dollar_tokens)
		scores = [
			score(row, sample, dollar_counts, non_dollar_counts)
			for row in candidates
		]
		return candidates[argmax(scores)][-2]


	if __name__ == '__main__':
		# The nesting of this block had been flattened; it is restored here.
		# Evaluate the baseline: predict a total for every slug, print each
		# miss, then print the percentage of exact matches.
		match_count = 0
		slugs = df.slug.unique()
		length = len(slugs)
		for slug in slugs:
			sample = get_sample(slug)
			estimate = predict_total_amount(sample)
			true = true_total(sample)
			if estimate == true:
				match_count += 1
			else:
				print(slug)
				print(estimate, true)
				print()

		print('Accuracy: ', 100 * match_count / length)