Created
April 17, 2015 02:26
-
-
Save funderburkjim/9527f5d14a006b227ec4 to your computer and use it in GitHub Desktop.
fuzzyalpha program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" fuzzyalpha.py | |
Apr 14, 2015 for VEI - applied to faultfinder
Attempt to get spelling change suggestions for Sanskrit.
Usage: python26 fuzzyalpha.py vei-only-notrxx-page.txt fuzzyalpha.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt
2nd usage:
 python26 fuzzyalpha.py vei-nonverbs1.txt fuzzyalpha1.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt
Command-line arguments, in order:
 input.txt is a list of headwords, one per line, in slp1 transliteration
  Note: This is specialized to vei-only-notrxx-ff.txt
 fuzzyalpha.txt is the output file name
 path to sanhw1.txt (formerly hard-coded, now the 3rd argument)
 path to veihw2.txt (formerly hard-coded, now the 4th argument)
Note: input.txt lines have the form:
 hw:other junk
 (main() splits each line on ':' and expects exactly two fields, e.g. akznA:007-2)
""" | |
import re | |
import sys | |
import levenshtein | |
import string | |
# 'tranfrom' lists the SLP1 letters in the collation order used by this
# script (presumably the traditional Sanskrit alphabetical order -- confirm);
# 'tranto' is an equally long run of strictly ascending ASCII characters.
# Translating a word letter-by-letter therefore yields a key whose plain
# string ordering is the desired SLP1 ordering.
tranfrom="aAiIuUfFxXeEoOMHkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzsh"
tranto = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw"
try:
    _maketrans = string.maketrans  # Python 2
except AttributeError:
    _maketrans = str.maketrans  # Python 3: string.maketrans was removed
trantable = _maketrans(tranfrom,tranto)


def slp_cmp(a,b):
    """Three-way comparison of two SLP1 strings in 'tranfrom' order.

    Both arguments are translated through 'trantable' and the translated
    strings are compared.  Characters that do not occur in 'tranfrom'
    are left unchanged by the translation.

    Returns -1 if a sorts before b, 0 if they are equal, 1 otherwise
    (the same values the Python 2 builtin cmp() produced).
    """
    # Use the str.translate method: the string.translate function is a
    # deprecated string-module function and is gone in Python 3.
    a1 = a.translate(trantable)
    b1 = b.translate(trantable)
    # Explicit three-way compare; the builtin cmp() does not exist in Python 3.
    return (a1 > b1) - (a1 < b1)
class Sanhw(object):
    """One parsed input line of the form 'key:item1,item2,...'.

    Attributes set by the constructor:
      line    - the input line without trailing CR/LF
      n       - None; a subscript to be filled in later by other code
      key     - text before the colon
      dictstr - text after the colon
      dicts   - dictstr split on commas
      len     - length of key (Feb 3, 2015, used by suggest_v3)

    A line that does not contain exactly one ':' raises ValueError
    (tuple-unpacking failure).
    """
    def __init__(self,line):
        text = line.rstrip('\r\n')
        self.line = text
        self.n = None  # subscript. Filled in later
        # Exactly two colon-separated fields are required.
        (head, tail) = text.split(':')
        self.key = head
        self.dictstr = tail
        self.dicts = tail.split(',')
        self.len = len(head)

    def __repr__(self):
        return "Sanhw[%s]" % (self.line,)
class Sanhws(object):
    """All Sanhw records read from one file.

    Attributes:
      sanhws  - list of Sanhw objects, in file order
      sanhwsd - dictionary mapping each record's key to its Sanhw object;
                when a key occurs more than once the last record wins
    """
    def __init__(self,filename='../../../../awork/sanhw1/sanhw1.txt'):
        records = []
        with open(filename,'r') as f:
            for raw in f:
                # lines starting with ':' are skipped (e.g. ':AP90' in sanhw1.txt)
                if raw.startswith(':'):
                    continue
                records.append(Sanhw(raw))
        self.sanhws = records
        self.sanhwsd = dict((rec.key, rec) for rec in records)
class Dicthw(object):
    """One parsed input line of the form 'page:hw:lrange'.

    Attributes set by the constructor:
      line   - the input line without trailing CR/LF
      n      - None; a subscript to be filled in later by other code
      page, hw, lrange - the three colon-separated fields, in that order

    A line that does not have exactly three colon-separated fields raises
    ValueError (tuple-unpacking failure).
    """
    def __init__(self,line):
        text = line.rstrip('\r\n')
        self.line = text
        self.n = None  # subscript. Filled in later
        (page, headword, lrange) = text.split(':')
        self.page = page
        self.hw = headword
        self.lrange = lrange

    def __repr__(self):
        # the record prints as its original (stripped) line
        return self.line
class Dicthws(object):
    """All Dicthw records read from one file.

    Attribute:
      dicthws - list with one Dicthw per line of 'filename', in file order
    """
    def __init__(self,filename):
        records = []
        with open(filename,'r') as f:
            for raw in f:
                records.append(Dicthw(raw))
        self.dicthws = records
def old_suggest_v3(w,sanhws,m=2,skipexact=True):
    """Older variant of suggest_v3, without the word-length screen.

    Assumes the first letter of 'w' is correct, so only records whose
    key starts with that letter are considered.

    Parameters:
      w         - the word to find near matches for
      sanhws    - iterable of records that have a 'key' attribute
      m         - bound passed to levenshtein.levenshtein1
      skipexact - when true, a record whose key equals w is not returned

    Returns the list of records whose distance from w is the smallest
    distance found; an empty list if there is no candidate or w is empty.
    """
    if not w:
        # nothing to match on; w[0] would raise IndexError
        return []
    w0 = w[0]
    nearlist=[] # (distance, record) pairs that levenshtein1 accepted
    for hw in sanhws:
        key = hw.key
        # startswith is also safe for an empty key (key[0] would raise)
        if not key.startswith(w0):
            continue
        if skipexact and (w == key):
            continue
        d=levenshtein.levenshtein1(w,key,m)
        if d == -1:
            # -1 is treated as "no usable distance" (presumably: farther
            # than m -- confirm against the levenshtein module)
            continue
        nearlist.append((d,hw))
    if not nearlist:
        return []
    # keep only the records at the lowest distance
    low = min(pair[0] for pair in nearlist)
    return [pair[1] for pair in nearlist if pair[0] == low]
def suggest_v3(w,sanhws,m=2,skipexact=True):
    """Return the records of 'sanhws' that are nearest to the word 'w'.

    Assumes the first letter of 'w' is correct: for efficiency only
    records whose key starts with that letter are considered, and
    (Feb 3, 2015) only those whose key length differs from len(w) by
    less than m.

    Parameters:
      w         - the word to find near matches for
      sanhws    - iterable of records that have 'key' and 'len' attributes
      m         - bound used for the length screen and passed to
                  levenshtein.levenshtein1
      skipexact - when true, a record whose key equals w is not returned

    Returns the list of records whose distance from w is the smallest
    distance found; an empty list if there is no candidate or w is empty.
    """
    if not w:
        # nothing to match on; w[0] would raise IndexError
        return []
    w0 = w[0]
    lw = len(w)
    nearlist=[] # (distance, record) pairs that levenshtein1 accepted
    for hw in sanhws:
        key = hw.key
        # startswith is also safe for an empty key (key[0] would raise)
        if not key.startswith(w0):
            continue
        # NOTE(review): the original author questioned this test
        # ("< m or > m ?").  A length difference of exactly m is
        # excluded here; whether it should be '<= m' depends on what
        # levenshtein1 accepts -- confirm before changing.
        if not (abs(hw.len - lw) < m):
            continue
        if skipexact and (w == key):
            continue
        d=levenshtein.levenshtein1(w,key,m)
        if d == -1:
            # -1 is treated as "no usable distance" (presumably: farther
            # than m -- confirm against the levenshtein module)
            continue
        nearlist.append((d,hw))
    if not nearlist:
        return []
    # keep only the records at the lowest distance
    low = min(pair[0] for pair in nearlist)
    return [pair[1] for pair in nearlist if pair[0] == low]
def process_alphaerr(i,dicthws,sanhws,sanhwsd,fout,n,pagecol):
    """Write a block of spelling suggestions for headword dicthws[i].

    Suggestions come from suggest_v3(hw, sanhws, 6).  Each suggestion is
    classified by whether it would sort (per slp_cmp) strictly between
    the previous headword and the next headword of dicthws.

    Written to 'fout' (every numbered line is prefixed with n as %03d):
      a line of 72 dashes
      the previous headword
      'hw -> <suggestions in order> (<suggestions not in order>)'
      the next headword
      'headword <link> --- page <link>'   (4th line requested by Dhaval)
      a blank line

    Parameters:
      i        - index into dicthws of the headword to process
      dicthws  - list of records with an 'hw' attribute
      sanhws   - records passed to suggest_v3
      sanhwsd  - not used here; kept so existing callers keep working
      fout     - writable file object
      n        - case number used as the line prefix
      pagecol  - page identifier passed to page_link

    Always returns 0.
    """
    f = fout
    hw = dicthws[i].hw
    f.write('%s\n' % ('-'*72))
    suggestions = suggest_v3(hw,sanhws,6)
    print("dbg: hw %s has %s suggestions" %(hw,len(suggestions)))
    # Neighbouring headwords.  At the ends of the list there is no
    # neighbour: dicthws[i-1] with i == 0 would silently wrap around to
    # the last headword, and dicthws[i+1] at the last index would raise
    # IndexError.  None means "no bound on that side".
    hw0 = dicthws[i-1].hw if i > 0 else None
    hw1 = dicthws[i+1].hw if (i+1) < len(dicthws) else None
    ordered=[[],[]] # yes/no
    for suggestion in suggestions:
        hwsuggest = suggestion.key
        after_prev = (hw0 is None) or (slp_cmp(hw0,hwsuggest) < 0)
        before_next = (hw1 is None) or (slp_cmp(hwsuggest,hw1) < 0)
        if after_prev and before_next:
            ordered[0].append(hwsuggest)
        else:
            ordered[1].append(hwsuggest)
    # a missing neighbour is written as an empty headword
    f.write('%03d %s\n'%(n,'' if hw0 is None else hw0))
    out=[', '.join(ordered[0]),','.join(ordered[1])] #Apr 14, 2015 comma-space
    f.write('%03d %s -> %s (%s)\n' %(n,hw,out[0],out[1]))
    f.write('%03d %s\n'%(n,'' if hw1 is None else hw1))
    pagelink = page_link(pagecol) # 4th line requested by Dhaval
    hwlink = headword_link(hw)
    out = '%03d headword %s --- page %s' % (n,hwlink,pagelink)
    f.write('%s\n' % out)
    f.write('\n') # extra blank line
    return 0
def page_link(volpage):
    """Return an HTML anchor string linking to the VEI servepdf.php
    page for 'volpage'.

    'volpage' is used unchanged both as the 'page' query parameter and
    as the visible link text.  The anchor uses the fixed named target
    '_VEIpage'.
    """
    site = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
    script = "%s/%sScan/%s/web/webtc/servepdf.php" %(site,"VEI","2014")
    query = "page=%s" % volpage
    target = "%s?%s" % (script,query)
    return "<a target='_VEIpage' href='%s'>%s</a>" %(target,volpage)
def headword_link(hw):
    """Return an HTML anchor string linking to the VEI indexcaller.php
    display for headword 'hw'.

    The query asks for input=slp1 and output=deva, with 'hw' as the key;
    'hw' is also the visible link text.  The anchor uses the fixed named
    target '_VEIword' (original note: GitHub accepts this form, and the
    link then always opens in the same tab).
    """
    site = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
    script = "%s/%sScan/%s/web/webtc/indexcaller.php" %(site,"VEI","2014")
    query = "input=slp1&output=deva&key=%s" % hw
    target = "%s?%s" % (script,query)
    return "<a target='_VEIword' href='%s'>%s</a>" %(target,hw)
def main():
    """Command-line driver.

    sys.argv[1] - input file, one 'hw:page' entry per line (e.g. akznA:007-2)
    sys.argv[2] - output file
    sys.argv[3] - path to sanhw1.txt (read by Sanhws)
    sys.argv[4] - path to the dictionary headword file (read by Dicthws)

    For every input headword that occurs in the dictionary headword file,
    process_alphaerr is called once per occurrence and writes its report
    block to the output file.  Headwords that do not occur are reported
    on standard output and skipped.
    """
    filename = sys.argv[1]
    fileout = sys.argv[2]
    file_sanhw1 = sys.argv[3] #'../../../../awork/sanhw1/sanhw1.txt'
    file_dicthw2 = sys.argv[4] # '../dicthw2.txt'
    # initialize from sanhw1.txt
    c = Sanhws(file_sanhw1)
    sanhws = c.sanhws
    sanhwsd = c.sanhwsd
    # initialize from dicthw2.txt
    dicthws = Dicthws(file_dicthw2).dicthws
    # dictionary for dicthws:
    # associate to each headword the list of indexes at which it occurs
    dicthwsd = {}
    for i, dicthw in enumerate(dicthws):
        dicthwsd.setdefault(dicthw.hw, []).append(i)
    print("%s records from %s" %(len(dicthws),file_dicthw2))
    print("%s records from %s" %(len(sanhws),file_sanhw1))
    nline = 0
    # mline = 10 # for debug
    mline = 1000000 # for production.
    # Nested 'with' blocks (one statement with two managers needs 2.7+)
    # so both files are closed even if processing raises.
    with open(filename,'r') as f:
        with open(fileout,'w') as fout:
            for line in f:
                line = line.strip()
                if not line:
                    # a blank line has no 'hw:page' pair to unpack
                    continue
                # akznA:007-2
                (hw,volpage) = line.split(':') # Apr 14, 2015
                nline = nline + 1
                if nline > mline:
                    print("exiting after  %s" % nline)
                    break
                if hw not in dicthwsd:
                    print("word %s not a headword" % hw)
                    continue
                for idict in dicthwsd[hw]:
                    process_alphaerr(idict,dicthws,sanhws,sanhwsd,fout,nline,volpage)


if __name__=="__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment