dmarx/words_in_phone.py

## words_in_phone.py
import time # just to time the preprocessing step

index = {}
WORDS = set()

map=[' '                #0
    ,' '                #1
    ,('a','b','c')      #2
    ,('d','e','f')      #3
    ,('g','h','i')      #4
    ,('j','k','l')      #5
    ,('m','n','o')      #6
    ,('p','q','r','s')  #7
    ,('t','u','v')      #8
    ,('w','x','y','z')  #9
    ]

char_to_num = {}
for ix, chars in enumerate(map):
    for c in chars:
        char_to_num[c] = str(ix)

def text_to_phone(word):
    """Converts a word into its phone number representation"""
    phone = ''
    for c in word.lower():
        phone += char_to_num[c]
    return phone

def preprocess_words(wordlist, minlen=2, maxlen=10):
    """
    Adds wordlist elements (within minlen and maxlen threshholds) to a global index
    that maps a phone number (in string form) to all the words it corresponds to.
    """
    start = time.time()
    for w in wordlist:
        if len(w) > maxlen or minlen > len(w):
            continue
        word = w.lower()
        phone = text_to_phone(word)
        #print word, phone, index.get(phone, [])
        matches = index.get(phone, set())
        matches.add(word)
        index[phone] = matches

        global WORDS
        WORDS.add(phone)
    print "Elapsed time for preprocessing:", time.time() - start

def permutate_seq_substring(text):
    """Returns a set of unique sequential substrings."""
    # there's probably a better way to do this with itertools (islice?)
    n = len(text)
    permutations = set()
    for start in range(n):
        for end in range(n+1):
            permutations.add( text[start:end] )
    return permutations

def find_words_in_phone_number(phone):
    words = {}
    nums = [substr for substr in permutate_seq_substring(phone)]
    for num in nums:
        try:
            words[num] = index[num]
        except KeyError:
            continue
    return words

def interactive():
    print "type X to exit"
    while True:
        phone = raw_input("Enter phone number (no spaces):")
        if phone == 'X': break
        for k,v in find_words_in_phone_number(phone).iteritems():
            print k,'\t',list(v)

if __name__ == '__main__':
    # Load words from NLTK
    from nltk.corpus import words as words
    preprocess_words( words.words() )
    interactive()
	import time # just to time the preprocessing step

	index = {}
	WORDS = set()

	map=[' ' #0
	,' ' #1
	,('a','b','c') #2
	,('d','e','f') #3
	,('g','h','i') #4
	,('j','k','l') #5
	,('m','n','o') #6
	,('p','q','r','s') #7
	,('t','u','v') #8
	,('w','x','y','z') #9
	]

	char_to_num = {}
	for ix, chars in enumerate(map):
	for c in chars:
	char_to_num[c] = str(ix)

	def text_to_phone(word):
	"""Converts a word into its phone number representation"""
	phone = ''
	for c in word.lower():
	phone += char_to_num[c]
	return phone

	def preprocess_words(wordlist, minlen=2, maxlen=10):
	"""
	Adds wordlist elements (within minlen and maxlen threshholds) to a global index
	that maps a phone number (in string form) to all the words it corresponds to.
	"""
	start = time.time()
	for w in wordlist:
	if len(w) > maxlen or minlen > len(w):
	continue
	word = w.lower()
	phone = text_to_phone(word)
	#print word, phone, index.get(phone, [])
	matches = index.get(phone, set())
	matches.add(word)
	index[phone] = matches

	global WORDS
	WORDS.add(phone)
	print "Elapsed time for preprocessing:", time.time() - start

	def permutate_seq_substring(text):
	"""Returns a set of unique sequential substrings."""
	# there's probably a better way to do this with itertools (islice?)
	n = len(text)
	permutations = set()
	for start in range(n):
	for end in range(n+1):
	permutations.add( text[start:end] )
	return permutations

	def find_words_in_phone_number(phone):
	words = {}
	nums = [substr for substr in permutate_seq_substring(phone)]
	for num in nums:
	try:
	words[num] = index[num]
	except KeyError:
	continue
	return words

	def interactive():
	print "type X to exit"
	while True:
	phone = raw_input("Enter phone number (no spaces):")
	if phone == 'X': break
	for k,v in find_words_in_phone_number(phone).iteritems():
	print k,'\t',list(v)

	if __name__ == '__main__':
	# Load words from NLTK
	from nltk.corpus import words as words
	preprocess_words( words.words() )
	interactive()