Created
October 31, 2012 04:47
-
-
Save joelverhagen/3984837 to your computer and use it in GitHub Desktop.
Solve Dan's hangman puzzle. This is kludgy code. Please disregard.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Digraphs (two-letter sequences), sorted by how common they are in
# English text; earlier entries are more likely letter pairs.
digraphs = ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'st', 'or', 'te', 'es', 'is', 'ha', 'ou', 'it', 'to', 'ed', 'ti', 'ng', 'ar', 'se', 'al', 'nt', 'as', 'le', 've', 'of', 'me', 'hi', 'ea', 'ne', 'de', 'co', 'ro', 'll', 'ri', 'li', 'ra', 'io', 'be', 'el', 'ch', 'ic', 'ce', 'ta', 'ma', 'ur', 'om', 'ho', 'et', 'no', 'ut', 'si', 'ca', 'la', 'il', 'fo', 'us', 'pe', 'ot', 'ec', 'lo', 'di', 'ns', 'ge', 'ly', 'ac', 'wi', 'wh', 'tr', 'ee', 'so', 'un', 'rs', 'wa', 'ow', 'id', 'ad', 'ai', 'ss', 'pr', 'ct', 'we', 'mo', 'ol', 'em', 'nc', 'rt', 'sh', 'po', 'ie', 'ul', 'im', 'ts', 'am', 'ir', 'yo', 'fi', 'os', 'pa', 'ni', 'ld', 'sa', 'ay', 'ke', 'mi', 'na', 'oo', 'su', 'do', 'ig', 'ev', 'gh', 'bl', 'if', 'tu', 'av', 'pl', 'wo', 'ry', 'bu', 'iv', 'ab', 'ia', 'vi', 'ex', 'op', 'bo', 'fe', 'ag', 'ci', 'da', 'mp', 'tt', 'sp', 'ck', 'ty', 'fr', 'ei', 'ap', 'rd', 'gr', 'od', 'ef', 'go', 'ba', 'ey', 'cl', 'cr', 'ov', 'ht', 'rn', 'fa', 'ls', 'gi', 'sc', 'up', 'cu', 'ue', 'ep', 'ga', 'ak', 'va', 'ff', 'uc', 'ki', 'by', 'qu', 'ew', 'ug', 'au', 'rr', 'rm', 'ds', 'oc', 'um', 'og', 'pp', 'ru', 'pi', 'rc', 'lu', 'oi', 'tl', 'my', 'ye', 'ua', 'eg', 'mu', 'dr', 'lt', 'ny', 'bi', 'pu', 'br', 'mb', 'ob', 'pt', 'ft', 'ui', 'ys', 'ub', 'ud', 'hr', 'rg', 'du', 'fu', 'rl', 'ok', 'nk', 'ms', 'wn', 'mm', 'eo', 'nu', 'ib', 'rk', 'hu', 'af', 'nl', 'nn', 'vo', 'cc', 'ik', 'tw', 'gu', 'aw', 'xt', 'ph', 'sm', 'ip', 'lf', 'dd', 'kn', 'gs', 'fl', 'iz', 'oa', 'ju', 'ks', 'gl', 'nf', 'ps', 'ze', 'xp', 'sl', 'rv', 'gn', 'sk', 'eq', 'dy', 'tc', 'nv', 'hy', 'sy', 'dl', 'bs', 'je', 'jo', 'ws', 'oe', 'mr', 'gg', 'eb', 'yi', 'sw', 'rp', 'wr', 'cy', 'rf', 'xi', 'ja', 'xa', 'oy', 'tm', 'lv', 'yp', 'dg', 'cs', 'lp', 'lm', 'eu', 'ox', 'eh', 'xc', 'ka', 'yt', 'nm', 'ek', 'ax', 'lk', 'ym', 'sn', 'ae', 'rb', 'uf', 'tp', 'ya', 'ix', 'za', 'dn', 'bj', 'dv', 'gy', 'tf', 'ah', 'hs', 'xe', 'ko', 'py', 'gt', 'az', 'dm', 'rh', 'sd', 'oh', 'bt', 'wl', 'lw', 'hm', 'lc', 'rw', 'hn', 'kl', 'yl', 'lr', 'bb', 'tn',
'zi', 'yb', 'np', 'pm', 'aq', 'hl', 'gm', 'nh', 'xy', 'ln', 'cp', 'fs', 'yc', 'sf', 'fy', 'yn', 'iu', 'dt', 'bc', 'td', 'mn', 'ku', 'sr', 'uo', 'ml', 'tb', 'nj', 'cm', 'ky', 'aj', 'zo', 'db', 'uy', 'ww', 'dw', 'pc', 'ii', 'nw', 'nr', 'oj', 'ao', 'sq', 'sb', 'iq', 'yr', 'mg', 'sg', 'pd', 'dc', 'nb', 'mt', 'cd', 'lg', 'vp', 'df', 'hb', 'yw', 'oz', 'pv', 'ez', 'mc', 'lb', 'hd', 'nq', 'tg', 'wt', 'kh', 'dp', 'tz', 'mv', 'wd', 'zz', 'fg', 'fc', 'zu', 'yd', 'xu', 'cq', 'ej', 'bv', 'vy', 'kg', 'cg', 'md', 'hw', 'mf', 'tv', 'ji', 'uz', 'gc', 'vn', 'wy', 'qi', 'tx', 'dh', 'ih', 'uk', 'kr', 'bm', 'aa', 'wp', 'fn', 'yg', 'kb', 'pg', 'cn', 'xh', 'zy', 'qw', 'wx', 'xx', 'gb', 'fd', 'sz', 'yu', 'xo', 'ux', 'gd', 'hk', 'gf', 'nx', 'bd', 'nz', 'kf', 'wm', 'ij', 'wf', 'jp', 'kw', 'hf', 'xs', 'hp', 'vs', 'sv', 'hc', 'pf', 'wc', 'dj', 'kt', 'dk', 'fh', 'uv', 'uh', 'bh', 'xf', 'yz', 'pk', 'kp', 'zl', 'bn', 'vu', 'bg', 'fp', 'wb', 'wk', 'cf', 'fx', 'fb', 'dx', 'xm', 'xn', 'lh', 'qa', 'vt', 'zh', 'wu', 'cb', 'yh', 'gp', 'jm', 'pb', 'fm', 'pw', 'fw', 'bw', 'vd', 'km', 'kk', 'iy', 'yf', 'xv', 'xb', 'kd', 'mw', 'jb', 'bp', 'rx', 'gw', 'ql', 'rq', 'xd', 'rz', 'xl', 'jl', 'vl', 'js', 'uu', 'tj', 'qq', 'vv', 'jt', 'lq', 'yv', 'hg', 'pn', 'hq', 'tk', 'rj', 'hv', 'cx', 'oq', 'hh', 'mh', 'lx', 'jf', 'gv', 'vr', 'qr', 'cz', 'gk', 'vh', 'sx', 'jc', 'kc', 'cv', 'bk', 'bf', 'qn', 'iw', 'dq', 'zn', 'bx', 'xr', 'vc', 'gz', 'qs', 'zs', 'jr', 'zw', 'zb', 'fk', 'dz', 'gx', 'jd', 'yk', 'vm', 'vb', 'qe', 'vk', 'cw', 'zt', 'fv', 'mx', 'vg', 'lz', 'yy', 'zc', 'zg', 'zm', 'lj', 'px', 'wg', 'sj', 'xq', 'mk', 'uj', 'yj', 'xg', 'zj', 'yx', 'uq', 'pz', 'xw', 'jk', 'cj', 'bz', 'qc', 'zk', 'kv', 'mj', 'tq', 'jh', 'jn', 'fz', 'zd', 'kj', 'wv', 'vx', 'zv', 'fq', 'kq', 'uw', 'pq', 'zx', 'zf', 'vw', 'xk', 'zp', 'xj', 'vj', 'jj', 'qt', 'qz', 'xz', 'pj', 'hj', 'bq', 'mq', 'qd', 'qv', 'jz', 'jq', 'jy', 'jx', 'kx', 'qm', 'vf', 'qo', 'hz', 'zq', 'fj', 'zr', 'jv', 'wz', 'yq', 'wq', 'jg', 'gj', 'wj', 'qb', 'gq', 'jw', 'mz',
'qy', 'kz', 'hx']

# The 26 lowercase ASCII letters ('a' through 'z').
alphabet = set(map(chr, range(97, 97 + 26)))

# The puzzle word as guessed so far; None marks an unsolved blank.
word = [None, 'l', 'e', None, 't', 'o', 'r']

# Letters that were guessed but turned out not to be in the word.
failed_letters = {'c', 's', 'a', 'f'}

# Set logic: work out which letters remain available for the blanks.
success_letters = set(word) - {None}
used_letters = failed_letters | success_letters
unused_letters = alphabet - used_letters
# For each blank position, collect candidate letters suggested by common
# digraphs formed with the neighboring known letters.  Candidates are
# appended in digraph-frequency order, so likelier letters come first.
matches = {}
# Cap on how many candidate letters are kept per blank.
letter_limit = 1000
for i in range(len(word)):
    if word[i] is not None:
        continue  # position already solved
    # Digraph (prev, blank): any digraph starting with the known previous
    # letter suggests its second letter for this blank.
    if i > 0:
        prev = word[i - 1]
        for digraph in digraphs:
            if digraph[0] == prev:
                candidate = digraph[1]
                if candidate in unused_letters:
                    if i not in matches:
                        matches[i] = [candidate]
                    elif len(matches[i]) < letter_limit:
                        matches[i].append(candidate)
    # Digraph (blank, next): any digraph ending with the known following
    # letter suggests its first letter for this blank.  (Renamed from
    # `next` to avoid shadowing the builtin.)
    if i < len(word) - 1:
        following = word[i + 1]
        for digraph in digraphs:
            if digraph[1] == following:
                candidate = digraph[0]
                if candidate in unused_letters:
                    if i not in matches:
                        matches[i] = [candidate]
                    elif len(matches[i]) < letter_limit:
                        matches[i].append(candidate)
import itertools

# Fill every blank with each of its candidate letters (Cartesian product
# over the blank positions) to build the set of candidate words.
# NOTE(review): a blank with no digraph candidates never enters `matches`,
# so its None would survive into ''.join and raise — same as the original.
keys = sorted(matches.keys())
candidates = set()
for blanks in itertools.product(*[matches[key] for key in keys]):
    test_word = list(word)
    for position, letter in zip(keys, blanks):
        test_word[position] = letter
    candidates.add(''.join(test_word))
# Function to see how many hits a given search returns on Amazon.
import requests
import lxml.html
import re


def get_amazon_hit_count(search):
    """Return the number of search results Amazon reports for `search`.

    Scrapes the result-count header from the Amazon search page.
    Returns 0 when the page has no result-count element.
    Raises ValueError when the count text has an unrecognized format.
    """
    response = requests.get('http://www.amazon.com/s/', params={
        'field-keywords': search
    }, headers={
        # Pretend to be a desktop browser so Amazon serves the normal page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4'
    }, timeout=30)  # requests has no default timeout; don't hang forever
    html = lxml.html.fromstring(response.text)
    span = html.xpath('.//h2[@id = "resultCount"]/span')
    if not span:
        return 0
    count_text = span[0].text.strip()
    # Multi-page format: "Showing 1 - 16 of 195 Results"
    m = re.search(r'^Showing (\d+) - (\d+) of (?P<result_count>\d+) Results$', count_text)
    if m is not None:
        return int(m.group('result_count'))
    # Single-page format: "Showing 4 Results" / "Showing 1 Result"
    m = re.search(r'^Showing (?P<result_count>\d+) Results?', count_text)
    if m is not None:
        return int(m.group('result_count'))
    # More specific than the original bare Exception; still caught by
    # any `except Exception` a caller might have.
    raise ValueError('Unexpected result count text: ' + count_text)
# Try every candidate against Amazon; keep those with at least one hit.
hits = []
for candidate in candidates:
    count = get_amazon_hit_count(candidate)
    if count > 0:
        hits.append((candidate, count))

print('Likely hits:')
# Bug fix: sorted()'s keyword is `reverse`, not `reversed` — the original
# raised TypeError here.  Sort most-hits-first.
hits.sort(key=lambda hit: hit[1], reverse=True)
for hit in hits:
    print('- %s, with %d hits.' % hit)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Likely hits: | |
- plextor, with 195 hits. | |
- kleitor, with 10 hits. | |
- blentor, with 1 hits. | |
- ilektor, with 1 hits. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment