Skip to content

Instantly share code, notes, and snippets.

@dmarx
Last active December 16, 2015 01:49
Show Gist options
  • Save dmarx/5358222 to your computer and use it in GitHub Desktop.
Save dmarx/5358222 to your computer and use it in GitHub Desktop.
import time # just to time the preprocessing step
index = {}
WORDS = set()
map=[' ' #0
,' ' #1
,('a','b','c') #2
,('d','e','f') #3
,('g','h','i') #4
,('j','k','l') #5
,('m','n','o') #6
,('p','q','r','s') #7
,('t','u','v') #8
,('w','x','y','z') #9
]
char_to_num = {}
for ix, chars in enumerate(map):
for c in chars:
char_to_num[c] = str(ix)
def text_to_phone(word):
"""Converts a word into its phone number representation"""
phone = ''
for c in word.lower():
phone += char_to_num[c]
return phone
def preprocess_words(wordlist, minlen=2, maxlen=10):
"""
Adds wordlist elements (within minlen and maxlen threshholds) to a global index
that maps a phone number (in string form) to all the words it corresponds to.
"""
start = time.time()
for w in wordlist:
if len(w) > maxlen or minlen > len(w):
continue
word = w.lower()
phone = text_to_phone(word)
#print word, phone, index.get(phone, [])
matches = index.get(phone, set())
matches.add(word)
index[phone] = matches
global WORDS
WORDS.add(phone)
print "Elapsed time for preprocessing:", time.time() - start
def permutate_seq_substring(text):
"""Returns a set of unique sequential substrings."""
# there's probably a better way to do this with itertools (islice?)
n = len(text)
permutations = set()
for start in range(n):
for end in range(n+1):
permutations.add( text[start:end] )
return permutations
def find_words_in_phone_number(phone):
words = {}
nums = [substr for substr in permutate_seq_substring(phone)]
for num in nums:
try:
words[num] = index[num]
except KeyError:
continue
return words
def interactive():
print "type X to exit"
while True:
phone = raw_input("Enter phone number (no spaces):")
if phone == 'X': break
for k,v in find_words_in_phone_number(phone).iteritems():
print k,'\t',list(v)
if __name__ == '__main__':
# Load words from NLTK
from nltk.corpus import words as words
preprocess_words( words.words() )
interactive()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment