Skip to content

Instantly share code, notes, and snippets.

@aso2101
Last active October 29, 2021 01:48
Show Gist options
  • Save aso2101/e0d26dea631e73b8e662f95cb38c74c3 to your computer and use it in GitHub Desktop.
Save aso2101/e0d26dea631e73b8e662f95cb38c74c3 to your computer and use it in GitHub Desktop.
Script for checking headwords by alphabetical order
# Usage:
#
# python3 check.py INPUT_FILE
#
# where INPUT_FILE is just the name of the file to check.
#
# This script expects that the name of INPUT_FILE will contain one of
# the following three strings:
# kittel = Kittel's Kannada-English dictionary
# ghatage = Ghatage's Prakrit-English dictionary
# dasa = Dasa's Hindi-Hindi dictionary
#
# It also consults a list of exceptions in a separate file
# called:
# exceptions-kittel.txt
# exceptions-dasa.txt
#
# Comments on alphabetical order:
# (1) Dasa sorts (e.g.) aṣṭāṃgī before aṣṭākapāla. Hence ṁ is considered before ALL other consonants.
# (2) Dasa seems to include vowels WITH anusvāra or candrabindu before vowels WITHOUT anusvāra or
# candrabindu. This results in some false negatives, but I am not clever enough to sort them
# out.
# (3) Kittel will sort words etymologically containing ṟ after words etymologically containing r
# even if they are written with the same letter in the Kannada script due to sandhi
# (e.g., akkarte [< akkaṟ] comes after akkaṟe). I don't know how to resolve this easily.
#
# Other comments:
# The transliteration library (sanscript.py) has been modified because SLP1 does
# not support the letters used for Hindi and Kannada. Please use the accompanying
# version of sanscript.py rather than the centrally-distributed one.
import sys, re, sanscript
from os.path import exists
def load_exceptions(dictionary):
    """Return the headwords that are exempt from the ordering check.

    The exceptions are read from ``exceptions-<dictionary>.txt`` in the
    current working directory, one headword per line, with surrounding
    whitespace stripped from each line.

    Args:
        dictionary: short dictionary name used in the file name
            (the caller passes "kittel", "dasa" or "ghatage").

    Returns:
        A list of headword strings; an empty list when the exceptions
        file does not exist.
    """
    # Build the path once so that the file we test for is the file we open.
    path = "exceptions-" + dictionary + ".txt"
    try:
        # Headwords are in Indic scripts, so do not rely on the locale's
        # default encoding when decoding the file.
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle.read().splitlines()]
    except FileNotFoundError:
        # An exceptions file is optional.
        return []
def normalize(word,dictionary):
    """Return the SLP1 form of *word* reduced to the letters used for sorting.

    The word is transliterated to SLP1 with ``sanscript.transliterate`` and
    every character that is in the stoplist or is not in the module-level
    ``alphabet`` string is dropped, so each remaining character can be
    looked up with ``alphabet.index``.

    NOTE: ``alphabet`` is a module global that is assigned by the
    ``__main__`` section of this script; it must be set before this
    function is called.

    Args:
        word: headword in the dictionary's source script.
        dictionary: "kittel" selects Kannada as the source script; any
            other value selects Devanagari.

    Returns:
        The filtered SLP1 string.  For "ghatage" and "kittel" an ``M``
        directly followed by a stop is replaced by the nasal of that
        stop's class.
    """
    source_script = "kannada" if dictionary == "kittel" else "devanagari"
    stoplist = "-.()?"
    new = "".join(
        c
        for c in sanscript.transliterate(word, source_script, "slp1")
        if c not in stoplist and c in alphabet
    )
    if dictionary in ("ghatage", "kittel"):
        # These two dictionaries order M + stop as the class nasal + stop.
        # Dasa is left alone (see the header comment: Dasa sorts the
        # anusvara before all consonants).
        new = re.sub(r"M([kKgG]+)", r"N\1", new)
        new = re.sub(r"M([cCjJ]+)", r"Y\1", new)
        new = re.sub(r"M([wWqQ]+)", r"R\1", new)
        new = re.sub(r"M([tTdD]+)", r"n\1", new)
        # The class must include B as well as b, like every class above.
        new = re.sub(r"M([pPbB]+)", r"m\1", new)
    return new
if __name__ == "__main__":
    # Command-line driver: read the dictionary file named on the command
    # line and report every headword that breaks alphabetical order.
    if len(sys.argv) > 1:
        inputfile = sys.argv[1]
        # The dictionaries are Indic-script text; decode explicitly as UTF-8
        # instead of relying on the locale's default encoding.
        with open(inputfile, "r", encoding="utf-8") as textfile:
            text = textfile.read()

        # Per-dictionary settings, selected by a substring of the file name.
        # `alphabet` is read as a global by normalize(); its character order
        # defines the collation order.
        hw = r'.*<hw0>(.*)</hw0>.*'
        dictionary = "ghatage"
        source_script = "devanagari"
        alphabet = "aAiIuUfFxXeEoOMkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzshH"
        if "kittel" in inputfile:
            dictionary = "kittel"
            alphabet = "aAiIuUfFxX1eE2oOMkKgGNcCjJYw3W4qQRtTdDnpPbBmyrVlvSzshHLZ"
            source_script = "kannada"
        elif "dasa" in inputfile:
            dictionary = "dasa"
            alphabet = "aAiIuUfFxXeEoOM~kKgGNcCjJYwWq3Q4RtTdDnpPbBmyrlvSzshH"
        else: # otherwise the dictionary is ghatage
            hw = r'.*<hw>(.*)</hw>.*'

        # A set gives constant-time membership tests in the loop below.
        exceptions = set(load_exceptions(dictionary))

        # Compile the line patterns once instead of once per line.
        page_re = re.compile(r'.*<pb n="(.*?)"/>.*')
        col_re = re.compile(r'.*<col n="(.*?)"/>.*')
        hw_re = re.compile(hw)

        def _sort_key(word):
            # Collation key: position of each normalized letter in `alphabet`.
            return [alphabet.index(c) for c in normalize(word, dictionary)]

        def _in_order(words):
            # True when `words` is already sorted by the collation key.
            return sorted(words, key=_sort_key) == words

        def _to_iso(word):
            # Romanized form of a headword for the report.
            return sanscript.transliterate(word, source_script, "iso")

        page = ""
        col = ""
        preventry = ""
        prevpreventry = ""
        # Page/column that were current when `preventry` was read, so that a
        # report about `preventry` points at its own location even if a page
        # or column break lies between it and the following headword.
        prevpage = page
        prevcol = col

        for line in text.splitlines():
            pb = page_re.match(line)
            cb = col_re.match(line)
            hwre = hw_re.match(line)
            if pb:
                page = int(pb.group(1))
            if cb:
                col = int(cb.group(1))
            if not hwre:
                continue
            thisentry = hwre.group(1)
            # Known exceptions are left out of the comparison window entirely.
            if thisentry in exceptions:
                continue
            # Blame the middle headword only when its two neighbours are in
            # order with each other but the three together are not.
            if (preventry != "" and prevpreventry != ""
                    and _in_order([prevpreventry, thisentry])
                    and not _in_order([prevpreventry, preventry, thisentry])):
                error = preventry+" ("+_to_iso(preventry)+")"
                previous = _to_iso(prevpreventry)
                following = _to_iso(thisentry)
                print("Page "+str(prevpage)+", column "+str(prevcol)+": "+error+" is in the wrong alphabetical order (previous is "+previous+", following is "+following+").")
            prevpreventry = preventry
            preventry = thisentry
            prevpage = page
            prevcol = col

        # The final headword is never the middle of a window, so compare it
        # with its predecessor directly.
        if not _in_order([prevpreventry, preventry]):
            error = preventry+" ("+_to_iso(preventry)+")"
            print("Page "+str(prevpage)+", column "+str(prevcol)+": "+error+" is in the wrong alphabetical order.")
    else:
        print("No input file specified. Usage: check.py INPUTFILE.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment