Last active
December 16, 2015 01:49
-
-
Save dmarx/5358222 to your computer and use it in GitHub Desktop.
Implemented an algorithm I described in reference to a reddit post: http://www.reddit.com/r/learnprogramming/comments/1bqlj9/python_efficient_algorithm_for_checking_if_a/c9993nj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time # just to time the preprocessing step | |
index = {} | |
WORDS = set() | |
map=[' ' #0 | |
,' ' #1 | |
,('a','b','c') #2 | |
,('d','e','f') #3 | |
,('g','h','i') #4 | |
,('j','k','l') #5 | |
,('m','n','o') #6 | |
,('p','q','r','s') #7 | |
,('t','u','v') #8 | |
,('w','x','y','z') #9 | |
] | |
char_to_num = {} | |
for ix, chars in enumerate(map): | |
for c in chars: | |
char_to_num[c] = str(ix) | |
def text_to_phone(word): | |
"""Converts a word into its phone number representation""" | |
phone = '' | |
for c in word.lower(): | |
phone += char_to_num[c] | |
return phone | |
def preprocess_words(wordlist, minlen=2, maxlen=10): | |
""" | |
Adds wordlist elements (within minlen and maxlen threshholds) to a global index | |
that maps a phone number (in string form) to all the words it corresponds to. | |
""" | |
start = time.time() | |
for w in wordlist: | |
if len(w) > maxlen or minlen > len(w): | |
continue | |
word = w.lower() | |
phone = text_to_phone(word) | |
#print word, phone, index.get(phone, []) | |
matches = index.get(phone, set()) | |
matches.add(word) | |
index[phone] = matches | |
global WORDS | |
WORDS.add(phone) | |
print "Elapsed time for preprocessing:", time.time() - start | |
def permutate_seq_substring(text): | |
"""Returns a set of unique sequential substrings.""" | |
# there's probably a better way to do this with itertools (islice?) | |
n = len(text) | |
permutations = set() | |
for start in range(n): | |
for end in range(n+1): | |
permutations.add( text[start:end] ) | |
return permutations | |
def find_words_in_phone_number(phone): | |
words = {} | |
nums = [substr for substr in permutate_seq_substring(phone)] | |
for num in nums: | |
try: | |
words[num] = index[num] | |
except KeyError: | |
continue | |
return words | |
def interactive(): | |
print "type X to exit" | |
while True: | |
phone = raw_input("Enter phone number (no spaces):") | |
if phone == 'X': break | |
for k,v in find_words_in_phone_number(phone).iteritems(): | |
print k,'\t',list(v) | |
if __name__ == '__main__': | |
# Load words from NLTK | |
from nltk.corpus import words as words | |
preprocess_words( words.words() ) | |
interactive() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment