@mikeboers
Last active February 9, 2016 07:36
Solving a wordsearch by mining Wikipedia. The gist contains the puzzle grid, a Python 2 solver that loads a word list into a letter trie and scans every cell of the grid in all eight directions, and a second script that mines Wikipedia article text to build a themed word list for the solver.
LAHLERIRAHLEDDIKWKT
CNANOCIESRRAUNAFAOG
WKRGGVREGAASNSESLRB
TETEUAEGNLWOGNTKDET
IUHLUHTCITDSERIAAAA
RECKSUPERHEROEWSRRC
EATCTVNCEANRNETGTKK
NBAOAENNHRFAPMAKHEY
IKGPREDATORTARRLVRR
LSYSKCRVFHNSYYIAATU
BKGYERFFOJTEACASDRB
OTANDROIDENLYTKEEAD
GHXENOMORPHTDYBRRTA
TYOGSOTHOTHTWRIBUSR
YTOBORRRLSTARWARSLB
VAJLBRRULILBORIGKLL
TDRAZIWSAKTDIREWOLF
AROFWOTLESRAASEMINA
FGILEANRLFNATACOREI
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--words', action='append')
parser.add_argument('puzzle')
args = parser.parse_args()
puzzle = [list(line.strip()) for line in open(args.puzzle).read().strip().split()]
rows = len(puzzle)
cols = len(puzzle[0])
print 'Puzzle is', rows, 'by', cols
dictionary = {}
FIN = '*'
def add_word_to_dictionary(word, node=dictionary):
    node = node.setdefault(word[0], {})
    remaining = word[1:]
    if remaining:
        add_word_to_dictionary(remaining, node)
    else:
        node[FIN] = node.get(FIN, 0) + 1
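
# Illustrative example (the words are made up, not from the gist): adding
# 'CAT' and then 'CAR' produces the nested dict
#     {'C': {'A': {'T': {'*': 1}, 'R': {'*': 1}}}}
# where FIN ('*') counts how many words end at that node, so a grid scan can
# stop as soon as the current prefix has no node.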
word_count = 0
for path in args.words or ['/usr/share/dict/words']:
    print 'Loading', path
    for line in open(path):
        line = line.strip()
        if line:
            add_word_to_dictionary(line.upper())
            word_count += 1
print word_count, 'words'
directions = [
    ('N' , 0, -1),
    ('NE', 1, -1),
    ('E' , 1, 0),
    ('SE', 1, 1),
    ('S' , 0, 1),
    ('SW', -1, 1),
    ('W' , -1, 0),
    ('NW', -1, -1),
]
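
# puzzle[y][x] indexes rows from the top of the grid, so dy = -1 steps toward
# the top row; that is why ('N', 0, -1) reads as "north". The loops below try
# every start cell in every one of the eight directions.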
for sx in xrange(cols):
    for sy in xrange(rows):
        for direction, dx, dy in directions:
            solutions = []
            node = dictionary
            x = sx
            y = sy
            word = ''
            # Walk from (sx, sy) in this direction while the growing prefix
            # still exists in the trie and we are still on the grid.
            while (
                node and
                x >= 0 and x < cols and
                y >= 0 and y < rows
            ):
                if False and len(word) > 2:
                    # Disabled debug trace of every prefix tried.
                    print ' %2d,%2d %2s: %s' % (sx, sy, direction, word)
                char = puzzle[y][x]
                word += char
                node = node.get(char)
                if node and FIN in node:
                    solutions.append(word)
                x += dx
                y += dy
            # Report the hits for this start cell and direction, longest first.
            for word in sorted(solutions, key=len, reverse=True):
                if len(word) > 3:
                    print '* %2d,%2d %2s: %s' % (sx, sy, direction, word)
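
The default /usr/share/dict/words list only yields ordinary English words; themed answers hidden in the grid such as STARWARS, XENOMORPH and DIREWOLF need a custom list. The second script below builds one by fetching article wikitext from the Wikipedia API, optionally following [[wikilinks]] to a given depth, and writing every distinct lowercase word to a file that can then be passed to the solver with -w.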
import argparse
import json
import os
import re
import requests
base_url = 'http://en.wikipedia.org/w/api.php'
base_params = dict(
    format='json',
    action='query',
    titles='Main Page',
    prop='revisions',
    rvprop='content',
)
def get_content(title):
    # Cache each page's raw API response under wp_data/ so repeated runs
    # don't hit Wikipedia again.
    path = os.path.join('wp_data', title + '.json')
    if not os.path.exists(path):
        params = base_params.copy()
        params['titles'] = title
        res = requests.get(base_url, params=params)
        with open(path, 'w') as fh:
            fh.write(res.text)
    data = json.load(open(path))
    try:
        return data['query']['pages'].values()[0]['revisions'][0]['*']
    except (KeyError, IndexError) as e:
        # This only happens on "file" pages, which we don't care about.
        return ''
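
# For reference, the (abridged) API response this unpacks looks roughly like
#     {"query": {"pages": {"<pageid>": {"revisions": [{"*": "<wikitext>"}]}}}}
# where '*' holds the raw wikitext of the requested revision.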
parser = argparse.ArgumentParser()
parser.add_argument('title', nargs='+')
parser.add_argument('-d', '--depth', type=int, default=0)
parser.add_argument('-o', '--output', nargs='?')
args = parser.parse_args()
def walk(title, depth=0):
    content = get_content(title)
    yield title, content
    if depth <= 0:
        return
    # Recurse into [[wikilink]] targets, dropping any piped display text.
    for m in re.finditer(r'\[\[(.+?)(?:\||\]\])', content):
        subtitle = m.group(1)
        for x in walk(subtitle, depth - 1):
            yield x
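
# Illustrative matches (examples, not from the gist): the pattern captures
# 'Xenomorph' from '[[Xenomorph]]' and 'Star Wars' from '[[Star Wars|the
# franchise]]', i.e. the link target without its display text.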
output = open(args.output, 'w') if args.output else None
seen = set()
for starting_title in args.title:
    for title, content in walk(starting_title, args.depth):
        print title, len(content)
        if output:
            # Emit each lowercase word once; this becomes the -w list for the solver.
            for word in re.findall(r'\b[a-z]+\b', content):
                if word not in seen:
                    output.write(word + '\n')
                    seen.add(word)
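
A hypothetical end-to-end run (the script and file names are assumed here, they are not part of the gist):

    python fetch_words.py -d 1 -o words.txt 'Star Wars' 'Alien (film)'
    python solve.py -w words.txt puzzle.txt

The first command caches each page's API response under wp_data/ (the directory has to exist already) and writes the word list; the second prints every trie hit longer than three letters. For example, a hit such as PREDATOR running east from column 3, row 8 would print as '*  3, 8  E: PREDATOR'.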