funderburkjim/vei-fuzzyalpha.py

## vei-fuzzyalpha.py
""" fuzzyalpha.py
  Apr 14, 2015 for VEI - applied to faultfinder
  Attempt to get spelling change suggestions for Sanskrit.
  Usage: python26 fuzzyalpha.py vei-only-notrxx-page.txt fuzzyalpha.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt
  2nd usage

   python26 fuzzyalpha.py vei-nonverbs1.txt fuzzyalpha1.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt

   input.txt is a list of headwords, one per line, in slp1 transliteration
    Note: This is specialized to vei-only-notrxx-ff.txt
   fuzzyalpha.txt is the output file name
    Hard-coded inputs:
     path to sanhw1.txt
     path to veihw2.txt
 Note: input.txt lines can also have the form:
 hw:other junk
"""
import re
import sys
import levenshtein
import string
tranfrom="aAiIuUfFxXeEoOMHkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzsh"
tranto = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw"
trantable = string.maketrans(tranfrom,tranto)

def slp_cmp(a,b):
 a1 = string.translate(a,trantable)
 b1 = string.translate(b,trantable)
 return cmp(a1,b1)

class Sanhw(object):
    def __init__(self,line):
        line = line.rstrip('\r\n')
        self.line =line
        self.n = None # subscript . Filled in later
        (self.key,self.dictstr)=re.split(r':',line)
        self.dicts = re.split(',',self.dictstr)
        self.len = len(self.key) # Feb 3, 2015, used by suggest_v3
    def __repr__(self):
        return "Sanhw[%s]"%self.line
class Sanhws(object):
 def __init__(self,filename='../../../../awork/sanhw1/sanhw1.txt'):
  with open(filename,'r') as f:
   # the 'if (not...) clause is to skip ':AP90' in sanhw1.txt
   self.sanhws = [Sanhw(x) for x in f if (not x.startswith(':'))]
  self.sanhwsd = {}
  for sanhw in self.sanhws:
   self.sanhwsd[sanhw.key] = sanhw

class Dicthw(object):
    def __init__(self,line):
        line = line.rstrip('\r\n')
        self.line =line
        self.n = None # subscript . Filled in later
        (self.page,self.hw,self.lrange)=re.split(r':',line)
    def __repr__(self):
        return self.line
class Dicthws(object):
 def __init__(self,filename):
  with open(filename,'r') as f:
   self.dicthws = [Dicthw(x) for x in f]

def old_suggest_v3(w,sanhws,m=2,skipexact=True):
    # Assume first letter of 'w' is correct.
    # For efficiency, consider only sanhws that start with same letter
    # Do not return exact match
    w0 = w[0]
    hws=[x for x in sanhws if x.key[0] == w0]
    #print "%s headwords start with %s" %(len(hws),w0)
    nearlist=[] # list of hws whose levenshtein distance from w is <=
    low = 99
    for hw in hws:
        if (w == hw.key) and skipexact:
            continue
        d=levenshtein.levenshtein1(w,hw.key,m)
        if d == -1:
            continue
        nearlist.append((d,hw))
        if (d < low): # update low distance
            low = d
    # include ones only = low
    ans = [x[1] for x in nearlist if x[0] == low]

    #s = sorted(nearlist,key=lambda(x):x[0]) # sort by d
    return ans

def suggest_v3(w,sanhws,m=2,skipexact=True):
    # modified to screen further by length of word
    # Assume first letter of 'w' is correct.
    # For efficiency, consider only sanhws that start with same letter
    # Do not return exact match
    w0 = w[0]
    hws=[x for x in sanhws if x.key[0] == w0]
    # Feb 3, 2015
    lw = len(w)
    hws = [x for x in hws if (abs(x.len - lw) < m)] #? < m or > m ?
    #print "%s headwords start with %s" %(len(hws),w0)
    nearlist=[] # list of hws whose levenshtein distance from w is <=
    low = 99
    for hw in hws:
        if (w == hw.key) and skipexact:
            continue
        d=levenshtein.levenshtein1(w,hw.key,m)
        if d == -1:
            continue
        nearlist.append((d,hw))
        if (d < low): # update low distance
            low = d
    # include ones only = low
    ans = [x[1] for x in nearlist if x[0] == low]

    #s = sorted(nearlist,key=lambda(x):x[0]) # sort by d
    return ans

def process_alphaerr(i,dicthws,sanhws,sanhwsd,fout,n,pagecol):
 f = fout
 dashes=('-'*72)
 hw = dicthws[i].hw
 case='a'
 f.write('%s\n'%dashes)
 #f.write('%03d %s %s !< %s\n' %(n,case,dicthws[i-1].line,dicthws[i].line))
 suggestions = suggest_v3(hw,sanhws,6)
 print "dbg: hw %s has %s suggestions" %(hw,len(suggestions))
 #print i,len(dicthws),hw
 hw0 = dicthws[i-1].hw
 hw1 = dicthws[i+1].hw
 ordered=[[],[]]  # yes/no
 for suggestion in suggestions:
  hwsuggest = suggestion.key
  if (slp_cmp(hw0,hwsuggest) < 0) and (slp_cmp(hwsuggest,hw1) < 0):
   ordered[0].append(hwsuggest)
  else:
   ordered[1].append(hwsuggest)
 f.write('%03d %s\n'%(n,hw0))
 out=[', '.join(ordered[0]),','.join(ordered[1])]  #Apr 14, 2015 comma-space
 f.write('%03d %s -> %s (%s)\n' %(n,hw,out[0],out[1]))
 f.write('%03d %s\n'%(n,hw1))
 pagelink = page_link(pagecol) # 4th line requested by Dhaval
 hwlink = headword_link(hw)
 out =  '%03d headword %s ---  page %s' % (n,hwlink,pagelink)
 f.write('%s\n' % out)
 f.write('\n') # extra blank line
 return 0

def page_link(volpage):
 """ return 'href' string for link to scanned image for VEI for page 'page'
 """
 d = "VEI"
 y = "2014"
 base = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
 url = "%s/%sScan/%s/web/webtc/servepdf.php" %(base,d,y)
 #(page,col) = re.split('-',volpage) #
 #pageparm = page
 pageparm = volpage
 parms = "page=%s" % pageparm
 href = "%s?%s" % (url,parms)
 ans = "<a target='_VEIpage' href='%s'>%s</a>" %(href,volpage)
 return ans

def headword_link(hw):
 """ return 'href' string for link to basic  display for pwg for headword hw
     Use this form, which GitHub accepts, so that link opens in same
     tab always
 """
 d = "VEI"
 y = "2014"
 base = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
 url = "%s/%sScan/%s/web/webtc/indexcaller.php" %(base,d,y)
 parms = "input=slp1&output=deva&key=%s" % hw
 href = "%s?%s" % (url,parms)
 ans = "<a target='_VEIword' href='%s'>%s</a>" %(href,hw)
 return ans

def main():
 filename = sys.argv[1]
 fileout = sys.argv[2]

 file_sanhw1 = sys.argv[3] #'../../../../awork/sanhw1/sanhw1.txt'
 file_dicthw2 = sys.argv[4] # '../dicthw2.txt'
 # initialize from sanhw1.txt
 c = Sanhws(file_sanhw1)
 sanhws = c.sanhws
 sanhwsd = c.sanhwsd
 # initialize from dicthw2.txt
 c = Dicthws(file_dicthw2)
 dicthws = c.dicthws
 # dictionary for dicthws:
 #  Associate to each dicthws.hw
 # info message
 dicthwsd = {}
 ndicthws = len(dicthws)
 for i in xrange(0,ndicthws):
  dicthw = dicthws[i]
  hw = dicthw.hw
  if hw not in dicthwsd:
   dicthwsd[hw] = []
  dicthwsd[hw].append(i)
 print "%s records from %s" %(len(dicthws),file_dicthw2)
 print "%s records from %s" %(len(sanhws),file_sanhw1)
 f = open(filename,'r')
 fout = open(fileout,'w')
 # process dicthws line by line, looking for alphabetical misorderings
 notordered=0

 nline = 0
 mline = 10 # for debug
 mline = 1000000 # for production.
 for line in f:
  line = line.strip()
  # akznA:007-2
  parts = re.split(r':',line)
  (hw,volpage) = parts  # Apr 14, 2015
  #hw = parts[0]
  #hw = parts[1]  # chksort1.txt, angya
  nline = nline + 1
  if nline > mline:
   print "exiting after ",nline
   break
  if hw not in dicthwsd:
   print "word %s not a headword" % hw
   continue
  idicts = dicthwsd[hw]
  for idict in idicts:
   dicthw = dicthws[idict]
   icase=process_alphaerr(idict,dicthws,sanhws,sanhwsd,fout,nline,volpage)

 f.close()
 fout.close()
if __name__=="__main__":
 main()
	""" fuzzyalpha.py
	Apr 14, 2015 for VEI - applied to faultfinder
	Attempt to get spelling change suggestions for Sanskrit.
	Usage: python26 fuzzyalpha.py vei-only-notrxx-page.txt fuzzyalpha.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt
	2nd usage

	python26 fuzzyalpha.py vei-nonverbs1.txt fuzzyalpha1.txt ../../../../../awork/sanhw1/sanhw1.txt ../../veihw2.txt

	input.txt is a list of headwords, one per line, in slp1 transliteration
	Note: This is specialized to vei-only-notrxx-ff.txt
	fuzzyalpha.txt is the output file name
	Hard-coded inputs:
	path to sanhw1.txt
	path to veihw2.txt
	Note: input.txt lines can also have the form:
	hw:other junk
	"""
	import re
	import sys
	import levenshtein
	import string
	tranfrom="aAiIuUfFxXeEoOMHkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzsh"
	tranto = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw"
	trantable = string.maketrans(tranfrom,tranto)

	def slp_cmp(a,b):
	a1 = string.translate(a,trantable)
	b1 = string.translate(b,trantable)
	return cmp(a1,b1)

	class Sanhw(object):
	def __init__(self,line):
	line = line.rstrip('\r\n')
	self.line =line
	self.n = None # subscript . Filled in later
	(self.key,self.dictstr)=re.split(r':',line)
	self.dicts = re.split(',',self.dictstr)
	self.len = len(self.key) # Feb 3, 2015, used by suggest_v3
	def __repr__(self):
	return "Sanhw[%s]"%self.line
	class Sanhws(object):
	def __init__(self,filename='../../../../awork/sanhw1/sanhw1.txt'):
	with open(filename,'r') as f:
	# the 'if (not...) clause is to skip ':AP90' in sanhw1.txt
	self.sanhws = [Sanhw(x) for x in f if (not x.startswith(':'))]
	self.sanhwsd = {}
	for sanhw in self.sanhws:
	self.sanhwsd[sanhw.key] = sanhw

	class Dicthw(object):
	def __init__(self,line):
	line = line.rstrip('\r\n')
	self.line =line
	self.n = None # subscript . Filled in later
	(self.page,self.hw,self.lrange)=re.split(r':',line)
	def __repr__(self):
	return self.line
	class Dicthws(object):
	def __init__(self,filename):
	with open(filename,'r') as f:
	self.dicthws = [Dicthw(x) for x in f]

	def old_suggest_v3(w,sanhws,m=2,skipexact=True):
	# Assume first letter of 'w' is correct.
	# For efficiency, consider only sanhws that start with same letter
	# Do not return exact match
	w0 = w[0]
	hws=[x for x in sanhws if x.key[0] == w0]
	#print "%s headwords start with %s" %(len(hws),w0)
	nearlist=[] # list of hws whose levenshtein distance from w is <=
	low = 99
	for hw in hws:
	if (w == hw.key) and skipexact:
	continue
	d=levenshtein.levenshtein1(w,hw.key,m)
	if d == -1:
	continue
	nearlist.append((d,hw))
	if (d < low): # update low distance
	low = d
	# include ones only = low
	ans = [x[1] for x in nearlist if x[0] == low]

	#s = sorted(nearlist,key=lambda(x):x[0]) # sort by d
	return ans

	def suggest_v3(w,sanhws,m=2,skipexact=True):
	# modified to screen further by length of word
	# Assume first letter of 'w' is correct.
	# For efficiency, consider only sanhws that start with same letter
	# Do not return exact match
	w0 = w[0]
	hws=[x for x in sanhws if x.key[0] == w0]
	# Feb 3, 2015
	lw = len(w)
	hws = [x for x in hws if (abs(x.len - lw) < m)] #? < m or > m ?
	#print "%s headwords start with %s" %(len(hws),w0)
	nearlist=[] # list of hws whose levenshtein distance from w is <=
	low = 99
	for hw in hws:
	if (w == hw.key) and skipexact:
	continue
	d=levenshtein.levenshtein1(w,hw.key,m)
	if d == -1:
	continue
	nearlist.append((d,hw))
	if (d < low): # update low distance
	low = d
	# include ones only = low
	ans = [x[1] for x in nearlist if x[0] == low]

	#s = sorted(nearlist,key=lambda(x):x[0]) # sort by d
	return ans

	def process_alphaerr(i,dicthws,sanhws,sanhwsd,fout,n,pagecol):
	f = fout
	dashes=('-'*72)
	hw = dicthws[i].hw
	case='a'
	f.write('%s\n'%dashes)
	#f.write('%03d %s %s !< %s\n' %(n,case,dicthws[i-1].line,dicthws[i].line))
	suggestions = suggest_v3(hw,sanhws,6)
	print "dbg: hw %s has %s suggestions" %(hw,len(suggestions))
	#print i,len(dicthws),hw
	hw0 = dicthws[i-1].hw
	hw1 = dicthws[i+1].hw
	ordered=[[],[]] # yes/no
	for suggestion in suggestions:
	hwsuggest = suggestion.key
	if (slp_cmp(hw0,hwsuggest) < 0) and (slp_cmp(hwsuggest,hw1) < 0):
	ordered[0].append(hwsuggest)
	else:
	ordered[1].append(hwsuggest)
	f.write('%03d %s\n'%(n,hw0))
	out=[', '.join(ordered[0]),','.join(ordered[1])] #Apr 14, 2015 comma-space
	f.write('%03d %s -> %s (%s)\n' %(n,hw,out[0],out[1]))
	f.write('%03d %s\n'%(n,hw1))
	pagelink = page_link(pagecol) # 4th line requested by Dhaval
	hwlink = headword_link(hw)
	out = '%03d headword %s --- page %s' % (n,hwlink,pagelink)
	f.write('%s\n' % out)
	f.write('\n') # extra blank line
	return 0

	def page_link(volpage):
	""" return 'href' string for link to scanned image for VEI for page 'page'
	"""
	d = "VEI"
	y = "2014"
	base = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
	url = "%s/%sScan/%s/web/webtc/servepdf.php" %(base,d,y)
	#(page,col) = re.split('-',volpage) #
	#pageparm = page
	pageparm = volpage
	parms = "page=%s" % pageparm
	href = "%s?%s" % (url,parms)
	ans = "<a target='_VEIpage' href='%s'>%s</a>" %(href,volpage)
	return ans

	def headword_link(hw):
	""" return 'href' string for link to basic display for pwg for headword hw
	Use this form, which GitHub accepts, so that link opens in same
	tab always
	"""
	d = "VEI"
	y = "2014"
	base = "http://www.sanskrit-lexicon.uni-koeln.de/scans"
	url = "%s/%sScan/%s/web/webtc/indexcaller.php" %(base,d,y)
	parms = "input=slp1&output=deva&key=%s" % hw
	href = "%s?%s" % (url,parms)
	ans = "<a target='_VEIword' href='%s'>%s</a>" %(href,hw)
	return ans

	def main():
	filename = sys.argv[1]
	fileout = sys.argv[2]

	file_sanhw1 = sys.argv[3] #'../../../../awork/sanhw1/sanhw1.txt'
	file_dicthw2 = sys.argv[4] # '../dicthw2.txt'
	# initialize from sanhw1.txt
	c = Sanhws(file_sanhw1)
	sanhws = c.sanhws
	sanhwsd = c.sanhwsd
	# initialize from dicthw2.txt
	c = Dicthws(file_dicthw2)
	dicthws = c.dicthws
	# dictionary for dicthws:
	# Associate to each dicthws.hw
	# info message
	dicthwsd = {}
	ndicthws = len(dicthws)
	for i in xrange(0,ndicthws):
	dicthw = dicthws[i]
	hw = dicthw.hw
	if hw not in dicthwsd:
	dicthwsd[hw] = []
	dicthwsd[hw].append(i)
	print "%s records from %s" %(len(dicthws),file_dicthw2)
	print "%s records from %s" %(len(sanhws),file_sanhw1)
	f = open(filename,'r')
	fout = open(fileout,'w')
	# process dicthws line by line, looking for alphabetical misorderings
	notordered=0

	nline = 0
	mline = 10 # for debug
	mline = 1000000 # for production.
	for line in f:
	line = line.strip()
	# akznA:007-2
	parts = re.split(r':',line)
	(hw,volpage) = parts # Apr 14, 2015
	#hw = parts[0]
	#hw = parts[1] # chksort1.txt, angya
	nline = nline + 1
	if nline > mline:
	print "exiting after ",nline
	break
	if hw not in dicthwsd:
	print "word %s not a headword" % hw
	continue
	idicts = dicthwsd[hw]
	for idict in idicts:
	dicthw = dicthws[idict]
	icase=process_alphaerr(idict,dicthws,sanhws,sanhwsd,fout,nline,volpage)

	f.close()
	fout.close()
	if __name__=="__main__":
	main()