# aso2101/check-by-sorting.py

## check-by-sorting.py
# Usage:
#
# python3 check.py INPUT_FILE
#
#   where INPUT_FILE is just the name of the file to check.
#
#   This script expects that the name of INPUT_FILE will contain one of
#   the following three strings:
#      kittel = Kittel's Kannada-English dictionary
#      ghatage = Ghatage's Prakrit-English dictionary
#      dasa = Dasa's Hindi-Hindi dictionary
#
#   It also consults a list of exceptions in a separate file
#   called:
#      exceptions-kittel.txt
#      exceptions-dasa.txt
#
# Comments on alphabetical order:
#  (1) Dasa sorts (e.g.) aṣṭāṃgī before aṣṭākapāla. Hence ṁ is considered before ALL other consonants.
#  (2) Dasa seems to include vowels WITH anusvāra or candrabindu before vowels WITHOUT anusvāra or
#        candrabindu. This results in some false negatives, but I am not clever enough to sort them
#        out.
#  (3) Kittel will sort words etymologically containing ṟ after words etymologically containing r
#        even if they are written with the same letter in the Kannada script due to sandhi
#        (e.g., akkarte [< akkaṟ] comes after akkaṟe). I don't know how to resolve this easily.
#
# Other comments:
#   The transliteration library (sanscript.py) has been modified because SLP1 does
#   not support the letters used for Hindi and Kannada. Please use the accompanying
#   version of sanscript.py rather than the centrally-distributed one.

import sys, re, sanscript
from os.path import exists

def load_exceptions(dictionary):
    """Return the headwords exempted from the ordering check.

    The list is read from ``exceptions-<dictionary>.txt`` (relative to the
    current working directory), one headword per line. Surrounding
    whitespace is stripped from every line.

    Args:
        dictionary: Short dictionary name (e.g. ``"kittel"`` or ``"dasa"``)
            used to build the file name.

    Returns:
        A list with one stripped string per line of the file, or an empty
        list when the file does not exist.
    """
    # Open the hyphenated file name directly instead of pre-checking a
    # separately built path: a guard that omits the hyphen never matches the
    # file that is actually opened, and the exceptions are silently ignored.
    try:
        # Headwords are in Indic scripts; read as UTF-8 instead of relying on
        # the locale's default encoding (assumes UTF-8 files -- TODO confirm).
        with open("exceptions-"+dictionary+".txt","r",encoding="utf-8") as i:
            return [w.strip() for w in i.read().splitlines()]
    except FileNotFoundError:
        return []

def normalize(word,dictionary):
    """Reduce a headword to the SLP1 letters used to build its sort key.

    The word is transliterated to SLP1 with ``sanscript``. Characters in the
    stop list, and characters that do not occur in the module-level
    ``alphabet`` string, are dropped. For ``"ghatage"`` and ``"kittel"`` an
    ``M`` that directly precedes a stop is then rewritten as ``N``, ``Y``,
    ``R``, ``n`` or ``m`` depending on which stop follows.

    Args:
        word: Headword in the dictionary's source script (Kannada for
            ``"kittel"``, Devanagari otherwise).
        dictionary: Dictionary name; ``"kittel"`` selects the Kannada source
            script, and ``"ghatage"``/``"kittel"`` enable the ``M`` rewrite.

    Returns:
        The filtered (and, where applicable, rewritten) SLP1 string.

    Note:
        Reads the global ``alphabet``, which the script's entry point assigns
        before any call to this function.
    """
    stoplist = "-.()?"
    source_script = "kannada" if dictionary == "kittel" else "devanagari"
    slp1 = sanscript.transliterate(word,source_script,"slp1")
    new = "".join(c for c in slp1 if c not in stoplist and c in alphabet)
    if dictionary in ("ghatage", "kittel"):
        new = re.sub(r"M([kKgG]+)",r"N\1",new)
        new = re.sub(r"M([cCjJ]+)",r"Y\1",new)
        new = re.sub(r"M([wWqQ]+)",r"R\1",new)
        new = re.sub(r"M([tTdD]+)",r"n\1",new)
        # The class must list all four labial stops; with "b" written twice
        # and "B" missing, M before B was left unchanged and sorted as M.
        new = re.sub(r"M([pPbB]+)",r"m\1",new)
    return new

if __name__ == "__main__":
    # Scan a dictionary file and report headwords that are out of alphabetical
    # order relative to their neighbours. Everything stays at module level on
    # purpose: normalize() reads the global `alphabet` assigned below.
    if len(sys.argv) > 1:
        inputfile = sys.argv[1]
        # Read explicitly as UTF-8 rather than the locale's default encoding
        # (assumes the dictionary files are UTF-8 -- TODO confirm).
        with open(inputfile,"r",encoding="utf-8") as textfile:
            text = textfile.read()
        page = ""
        col = ""
        thisentry = ""
        preventry = ""
        prevpreventry = ""
        # Defaults; the input file name selects the dictionary-specific
        # headword pattern, source script and alphabet below.
        hw = r'.*<hw0>(.*)</hw0>.*'
        dictionary = "ghatage"
        source_script = "devanagari"
        alphabet = "aAiIuUfFxXeEoOMkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzshH"
        if "kittel" in inputfile:
            dictionary = "kittel"
            alphabet = "aAiIuUfFxX1eE2oOMkKgGNcCjJYw3W4qQRtTdDnpPbBmyrVlvSzshHLZ"
            source_script = "kannada"
        elif "dasa" in inputfile:
            dictionary = "dasa"
            alphabet = "aAiIuUfFxXeEoOM~kKgGNcCjJYwWq3Q4RtTdDnpPbBmyrlvSzshH"
        else: # otherwise the dictionary is ghatage
            hw = r'.*<hw>(.*)</hw>.*'
        # Membership is tested once per headword, so keep the exceptions in a set.
        exceptions = set(load_exceptions(dictionary))

        # Compile the line patterns once instead of on every line.
        page_break = re.compile(r'.*<pb n="(.*?)"/>.*')
        column_break = re.compile(r'.*<col n="(.*?)"/>.*')
        headword = re.compile(hw)

        def sort_key(word):
            # Position of each normalized letter in the dictionary's alphabet.
            return [alphabet.index(c) for c in normalize(word,dictionary)]

        for line in text.splitlines():
            pb = page_break.match(line)
            cb = column_break.match(line)
            hwre = headword.match(line)
            if pb:
                page = int(pb.group(1))
            if cb:
                col = int(cb.group(1))
            if not hwre:
                continue
            thisentry = hwre.group(1)
            if thisentry in exceptions:
                # Excepted headwords are skipped entirely: they are neither
                # checked nor used as a neighbour for later entries.
                continue
            if preventry != "":
                if prevpreventry != "":
                    # Only judge the middle entry when its two neighbours are
                    # themselves in order with respect to each other.
                    frame = [prevpreventry, thisentry]
                    if frame == sorted(frame, key=sort_key):
                        prior = [prevpreventry, preventry, thisentry]
                        if prior != sorted(prior, key=sort_key):
                            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
                            previous = sanscript.transliterate(prevpreventry,source_script,"iso")
                            following = sanscript.transliterate(thisentry,source_script,"iso")
                            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order (previous is "+previous+", following is "+following+").")
                prevpreventry = preventry
            preventry = thisentry
        # The final headword has no successor, so compare it with its
        # predecessor only.
        lastset = [prevpreventry, preventry]
        if lastset != sorted(lastset, key=sort_key):
            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order.")
    else:
        print("No input file specified. Usage: check.py INPUTFILE.")
	# Usage:
	#
	# python3 check.py INPUT_FILE
	#
	# where INPUT_FILE is just the name of the file to check.
	#
	# This script expects that the name of INPUT_FILE will contain one of
	# the following three strings:
	# kittel = Kittel's Kannada-English dictionary
	# ghatage = Ghatage's Prakrit-English dictionary
	# dasa = Dasa's Hindi-Hindi dictionary
	#
	# It also consults a list of exceptions in a separate file
	# called:
	# exceptions-kittel.txt
	# exceptions-dasa.txt
	#
	# Comments on alphabetical order:
	# (1) Dasa sorts (e.g.) aṣṭāṃgī before aṣṭākapāla. Hence ṁ is considered before ALL other consonants.
	# (2) Dasa seems to include vowels WITH anusvāra or candrabindu before vowels WITHOUT anusvāra or
	# candrabindu. This results in some false negatives, but I am not clever enough to sort them
	# out.
	# (3) Kittel will sort words etymologically containing ṟ after words etymologically containing r
	# even if they are written with the same letter in the Kannada script due to sandhi
	# (e.g., akkarte [< akkaṟ] comes after akkaṟe). I don't know how to resolve this easily.
	#
	# Other comments:
	# The transliteration library (sanscript.py) has been modified because SLP1 does
	# not support the letters used for Hindi and Kannada. Please use the accompanying
	# version of sanscript.py rather than the centrally-distributed one.

	import sys, re, sanscript
	from os.path import exists

	def load_exceptions(dictionary):
	    """Return the headwords exempted from the ordering check.

	    The list is read from ``exceptions-<dictionary>.txt`` (relative to the
	    current working directory), one headword per line. Surrounding
	    whitespace is stripped from every line.

	    Args:
	        dictionary: Short dictionary name (e.g. ``"kittel"`` or ``"dasa"``)
	            used to build the file name.

	    Returns:
	        A list with one stripped string per line of the file, or an empty
	        list when the file does not exist.
	    """
	    # Open the hyphenated file name directly instead of pre-checking a
	    # separately built path: a guard that omits the hyphen never matches
	    # the file that is actually opened.
	    try:
	        # Read as UTF-8 instead of relying on the locale's default
	        # encoding (assumes UTF-8 files -- TODO confirm).
	        with open("exceptions-"+dictionary+".txt","r",encoding="utf-8") as i:
	            return [w.strip() for w in i.read().splitlines()]
	    except FileNotFoundError:
	        return []

	def normalize(word,dictionary):
	    """Reduce a headword to the SLP1 letters used to build its sort key.

	    The word is transliterated to SLP1 with ``sanscript``. Characters in
	    the stop list, and characters that do not occur in the ``alphabet``
	    string defined outside this function, are dropped. For ``"ghatage"``
	    and ``"kittel"`` an ``M`` that directly precedes a stop is then
	    rewritten as ``N``, ``Y``, ``R``, ``n`` or ``m`` depending on which
	    stop follows.

	    Args:
	        word: Headword in the dictionary's source script (Kannada for
	            ``"kittel"``, Devanagari otherwise).
	        dictionary: Dictionary name; ``"kittel"`` selects the Kannada
	            source script, and ``"ghatage"``/``"kittel"`` enable the
	            ``M`` rewrite.

	    Returns:
	        The filtered (and, where applicable, rewritten) SLP1 string.
	    """
	    stoplist = "-.()?"
	    source_script = "kannada" if dictionary == "kittel" else "devanagari"
	    slp1 = sanscript.transliterate(word,source_script,"slp1")
	    new = "".join(c for c in slp1 if c not in stoplist and c in alphabet)
	    if dictionary in ("ghatage", "kittel"):
	        new = re.sub(r"M([kKgG]+)",r"N\1",new)
	        new = re.sub(r"M([cCjJ]+)",r"Y\1",new)
	        new = re.sub(r"M([wWqQ]+)",r"R\1",new)
	        new = re.sub(r"M([tTdD]+)",r"n\1",new)
	        # The class must list all four labial stops; with "b" written
	        # twice and "B" missing, M before B was left unchanged.
	        new = re.sub(r"M([pPbB]+)",r"m\1",new)
	    return new

	if __name__ == "__main__":
	    # Scan a dictionary file and report headwords that are out of
	    # alphabetical order relative to their neighbours. normalize() reads
	    # the `alphabet` name assigned below, so it is not wrapped in a
	    # function of its own.
	    if len(sys.argv) > 1:
	        inputfile = sys.argv[1]
	        # Read explicitly as UTF-8 rather than the locale's default
	        # encoding (assumes UTF-8 dictionary files -- TODO confirm).
	        with open(inputfile,"r",encoding="utf-8") as textfile:
	            text = textfile.read()
	        page = ""
	        col = ""
	        thisentry = ""
	        preventry = ""
	        prevpreventry = ""
	        # The patterns need their `*` quantifiers: without them only a
	        # one-character headword preceded by exactly one character matches.
	        hw = r'.*<hw0>(.*)</hw0>.*'
	        dictionary = "ghatage"
	        source_script = "devanagari"
	        alphabet = "aAiIuUfFxXeEoOMkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzshH"
	        if "kittel" in inputfile:
	            dictionary = "kittel"
	            alphabet = "aAiIuUfFxX1eE2oOMkKgGNcCjJYw3W4qQRtTdDnpPbBmyrVlvSzshHLZ"
	            source_script = "kannada"
	        elif "dasa" in inputfile:
	            dictionary = "dasa"
	            alphabet = "aAiIuUfFxXeEoOM~kKgGNcCjJYwWq3Q4RtTdDnpPbBmyrlvSzshH"
	        else: # otherwise the dictionary is ghatage
	            hw = r'.*<hw>(.*)</hw>.*'
	        # Membership is tested once per headword, so keep a set.
	        exceptions = set(load_exceptions(dictionary))

	        # Compile the line patterns once instead of on every line.
	        page_break = re.compile(r'.*<pb n="(.*?)"/>.*')
	        column_break = re.compile(r'.*<col n="(.*?)"/>.*')
	        headword = re.compile(hw)

	        def sort_key(word):
	            # Position of each normalized letter in the alphabet string.
	            return [alphabet.index(c) for c in normalize(word,dictionary)]

	        for line in text.splitlines():
	            pb = page_break.match(line)
	            cb = column_break.match(line)
	            hwre = headword.match(line)
	            if pb:
	                page = int(pb.group(1))
	            if cb:
	                col = int(cb.group(1))
	            if not hwre:
	                continue
	            thisentry = hwre.group(1)
	            if thisentry in exceptions:
	                # Excepted headwords are neither checked nor used as a
	                # neighbour for later entries.
	                continue
	            if preventry != "":
	                if prevpreventry != "":
	                    # Only judge the middle entry when its two neighbours
	                    # are themselves in order with respect to each other.
	                    frame = [prevpreventry, thisentry]
	                    if frame == sorted(frame, key=sort_key):
	                        prior = [prevpreventry, preventry, thisentry]
	                        if prior != sorted(prior, key=sort_key):
	                            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
	                            previous = sanscript.transliterate(prevpreventry,source_script,"iso")
	                            following = sanscript.transliterate(thisentry,source_script,"iso")
	                            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order (previous is "+previous+", following is "+following+").")
	                prevpreventry = preventry
	            preventry = thisentry
	        # The final headword has no successor, so compare it with its
	        # predecessor only.
	        lastset = [prevpreventry, preventry]
	        if lastset != sorted(lastset, key=sort_key):
	            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
	            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order.")
	    else:
	        print("No input file specified. Usage: check.py INPUTFILE.")