Skip to content

Instantly share code, notes, and snippets.

@ybv
Last active August 29, 2015 13:58
Show Gist options
  • Save ybv/10013075 to your computer and use it in GitHub Desktop.
Save ybv/10013075 to your computer and use it in GitHub Desktop.
Reverse Soundex match: shorten each word in a given sentence as far as possible while it still shares the same Soundex code as the original word. Totally naive and needs fixes.
import string
import fuzzy
import itertools
import random
import re
from itertools import groupby
# Identity translation table built with the Python 2 ``string.maketrans``
# function, suitable for the two-argument ``str.translate(table, deletechars)``
# form.
table = string.maketrans("","")
# Soundex encoder from the third-party ``fuzzy`` package; it is called as
# ``soundex(word)`` to obtain a word's code.
# NOTE(review): the argument 4 is presumably the code length - confirm
# against the fuzzy documentation.
soundex = fuzzy.Soundex(4)
# Double Metaphone encoder from ``fuzzy``.
# NOTE(review): not referenced anywhere else in the visible code.
dmetaphone = fuzzy.DMetaphone(4)
# Lowercase vowel letters that are candidates for removal from words.
VOWELS = "aeiou"
# Pattern matching any single character listed in VOWELS.
# NOTE(review): not referenced anywhere else in the visible code.
RE_VOWEL = re.compile("[%s]" % VOWELS)
def split_str(str):
    """Split ``str`` on single space characters and return the pieces.

    The separator is exactly one space (``" "``), so consecutive spaces
    produce empty strings in the result, and other whitespace (tabs,
    newlines) is not treated as a separator.

    The parameter keeps its original name, which shadows the builtin
    ``str`` inside this function, so existing callers are unaffected.

    Returns:
        A list of substrings of ``str``.
    """
    return str.split(" ")
# Set form of string.punctuation for constant-time membership tests.
_PUNCTUATION = frozenset(string.punctuation)


def rem_punc(str):
    """Return ``str`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    Filtering character by character (rather than using the two-argument
    ``translate`` call, which only exists for Python 2 byte strings) keeps
    this working for unicode text and on Python 3 as well.

    The parameter keeps its original name, which shadows the builtin
    ``str`` inside this function, so existing callers are unaffected.

    Returns:
        A new string containing the non-punctuation characters of ``str``
        in their original order.
    """
    return "".join(ch for ch in str if ch not in _PUNCTUATION)
def rem_consec_dupz(str):
    """Collapse each run of identical adjacent characters to one character.

    Only consecutive duplicates are merged; a character that reappears
    later, after a different character, is kept (``"abab"`` is unchanged).

    The parameter keeps its original name, which shadows the builtin
    ``str`` inside this function, so existing callers are unaffected.

    Returns:
        A new string with one character per run of equal characters.
    """
    # groupby yields one (key, group) pair per run of equal characters.
    return "".join(char for char, _run in groupby(str))
def rem_viwels(word, vowels=None):
    """Return the variants of ``word`` that each have one vowel letter removed.

    For every character in ``vowels``, in order, one candidate is produced
    in which all occurrences of that character are deleted from ``word``.
    A vowel that does not occur in ``word`` still contributes a candidate
    (equal to ``word`` itself), so the result always has ``len(vowels)``
    entries. Matching is case-sensitive.

    Only one vowel letter is removed per candidate; combinations of
    several different vowels are not generated.

    Args:
        word: The string to shorten.
        vowels: Characters to try removing. Defaults to the module-level
            ``VOWELS`` constant when omitted.

    Returns:
        A list of candidate strings, one per vowel character.
    """
    if vowels is None:
        vowels = VOWELS
    # str.replace behaves the same for byte and unicode strings, unlike the
    # Python 2-only ``translate(None, chars)`` deletion form.
    return [word.replace(vowel, "") for vowel in vowels]
def rev_sound(s):
    """Return the shortest vowel-stripped form of ``s`` with the same Soundex code.

    Candidates come from ``rem_viwels(s)``. A candidate replaces the current
    best only if it is strictly shorter and the module-level ``soundex``
    encoder gives it the same code as ``s``; on ties the earliest candidate
    wins.

    If no candidate qualifies (for example, ``s`` contains no vowels, or
    every removal changes the code), ``s`` itself is returned. Previously
    this case raised ``UnboundLocalError`` because the result variable was
    never assigned.

    Args:
        s: The word to shorten.

    Returns:
        The shortest qualifying candidate, or ``s`` when there is none.
    """
    target_code = soundex(s)  # hoisted: does not change between candidates
    best = s
    for candidate in rem_viwels(s):
        # Length is checked first so the encoder only runs on candidates
        # that could actually improve on the current best.
        if len(candidate) < len(best) and soundex(candidate) == target_code:
            best = candidate
    return best
# Input text to process; it is split into words on spaces further below.
sentence = "penalize"
# Maps each lowercased word to its processed form; filled in by
# pre_proc_rules and read back when the output is assembled.
# NOTE(review): this name shadows the builtin ``dict`` type for the rest of
# the module.
dict = {}
# Tokens containing an @mention or an http(s):// scheme are stored unchanged.
_PASSTHROUGH_RE = re.compile(r'@[a-z0-9_-]+|https?://', re.IGNORECASE)


def pre_proc_rules(str):
    """Store the processed form of the token ``str`` in the module-level ``dict``.

    The token is lowercased first, and the lowercased text is used as the
    key. It is stored unchanged when any of the following holds:

    * it contains ``@`` followed by at least one letter, digit, ``_`` or ``-``;
    * it contains ``http://`` or ``https://``;
    * it is at most two characters long.

    Otherwise consecutive duplicate characters are collapsed with
    ``rem_consec_dupz`` and the result is shortened with ``rev_sound``.

    The parameter keeps its original name, which shadows the builtin
    ``str`` inside this function, so existing callers are unaffected.

    Returns:
        None. The result is written to ``dict`` as a side effect.
    """
    str = str.lower()
    if len(str) <= 2 or _PASSTHROUGH_RE.search(str):
        dict[str] = str
    else:
        dict[str] = rev_sound(rem_consec_dupz(str))
# Driver: fill the word -> processed-form mapping for every space-separated
# word of ``sentence``.
# NOTE(review): indentation was lost in this copy of the file; the loop
# bodies below are flush-left and must be re-indented before this can run.
for s in split_str(sentence):
pre_proc_rules(s);
# Rebuild the sentence from the stored forms, each followed by one space.
res=""
for s in split_str(sentence):
res += dict[s.lower()] + " "
# NOTE(review): unclear whether this print belongs inside the loop above
# (echoing every word) or after it (echoing only the last word) - confirm.
print s.lower()
# Python 2 print statement: emit the rebuilt sentence.
print res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment