Last active
November 22, 2015 23:07
-
-
Save almost/c785db8056e623e3ba4a to your computer and use it in GitHub Desktop.
Parse words from a string of smushed together words with a single regexp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Solving the same problem as | |
# http://blogs.perl.org/users/ingy_dot_net/2015/11/perl-regular-expression-awesomeness.html | |
# but using a Trie instead of regexps, without backtracking and in | |
# Python | |
from itertools import groupby | |
phrase = "yougotmailmanners" | |
def makeTrie(words, prefix=""): | |
node = {} | |
for firstLetter, wordGroup in groupby(words, lambda w: w[:1]): | |
if firstLetter == "": | |
node["WORD"] = prefix | |
else: | |
node[firstLetter] = makeTrie((w[1:] for w in wordGroup), prefix + firstLetter) | |
return node | |
# Build a Trie of all the words in the dict (excluding short words | |
# otherwise we'll get a lot more outputs!) | |
baseTrie = makeTrie(x.strip().lower() for x in open("/usr/share/dict/words", "r") if len(x.strip()) > 1) | |
def advance(l, (soFar, trie)): | |
if l not in trie: | |
return [] | |
trie = trie[l] | |
if "WORD" in trie: | |
return [(soFar + [trie["WORD"]], baseTrie), (soFar, trie)] | |
else: | |
return [(soFar, trie)] | |
# Maintain a list of pairs of words so far and pointer into the trie | |
# representing the valid interpretations of the input so far | |
pointers = [([], baseTrie)] | |
for letter in phrase: | |
pointers = [newP for oldP in pointers for newP in advance(letter, oldP)] | |
for words, trie in pointers: | |
# If the current trie is the base trie that means we ended on a full word | |
if trie == baseTrie: | |
print " ".join(words) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment