Last active
October 29, 2021 01:48
-
-
Save aso2101/e0d26dea631e73b8e662f95cb38c74c3 to your computer and use it in GitHub Desktop.
Script for checking headwords by alphabetical order
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
#
#     python3 check.py INPUT_FILE
#
# where INPUT_FILE is just the name of the file to check.
#
# This script expects the name of INPUT_FILE to contain one of
# the following three strings:
#     kittel = Kittel's Kannada-English dictionary
#     ghatage = Ghatage's Prakrit-English dictionary
#     dasa = Dasa's Hindi-Hindi dictionary
# (if the name contains neither kittel nor dasa, ghatage is assumed).
#
# It also consults a list of exceptions in a separate file
# called:
#     exceptions-kittel.txt
#     exceptions-dasa.txt
#
# Comments on alphabetical order:
# (1) Dasa sorts (e.g.) aṣṭāṃgī before aṣṭākapāla. Hence ṁ is considered before ALL other consonants.
# (2) Dasa seems to include vowels WITH anusvāra or candrabindu before vowels WITHOUT anusvāra or
#     candrabindu. This results in some false negatives, but I am not clever enough to sort them
#     out.
# (3) Kittel will sort words etymologically containing ṟ after words etymologically containing r
#     even if they are written with the same letter in the Kannada script due to sandhi
#     (e.g., akkarte [< akkaṟ] comes after akkaṟe). I don't know how to resolve this easily.
#
# Other comments:
# The transliteration library (sanscript.py) has been modified because SLP1 does
# not support the letters used for Hindi and Kannada. Please use the accompanying
# version of sanscript.py rather than the centrally-distributed one.
import sys, re, sanscript | |
from os.path import exists | |
def load_exceptions(dictionary):
    """Return the headwords that are exempt from the ordering check.

    Reads ``exceptions-<dictionary>.txt`` from the current working
    directory, one entry per line, stripping surrounding whitespace
    from each line.

    Args:
        dictionary: Dictionary name used to build the file name
            (e.g. ``"kittel"`` or ``"dasa"``).

    Returns:
        A list with one stripped string per line of the file, or an
        empty list when the file does not exist.
    """
    path = "exceptions-" + dictionary + ".txt"
    # Open the file directly rather than testing for it first, so the
    # name that is checked and the name that is opened cannot differ.
    # The entries are non-ASCII headwords, so decode explicitly as UTF-8
    # instead of depending on the platform default encoding.
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle.read().splitlines()]
    except FileNotFoundError:
        return []
def normalize(word, dictionary):
    """Reduce a headword to the SLP1 letters used for comparing order.

    The word is transliterated to SLP1 with ``sanscript`` (from Kannada
    for ``"kittel"``, from Devanagari otherwise). Characters in the stop
    list, and any character not present in the module-level ``alphabet``
    string, are dropped.

    For ``"ghatage"`` and ``"kittel"``, an ``M`` that precedes a stop is
    rewritten as ``N``, ``Y``, ``R``, ``n`` or ``m`` according to the
    class of the following consonant, so that it sorts with that nasal
    (presumably the SLP1 class nasals -- confirm against sanscript.py).

    Args:
        word: Headword in the dictionary's source script.
        dictionary: One of ``"kittel"``, ``"ghatage"`` or ``"dasa"``.

    Returns:
        The filtered SLP1 string.
    """
    source_script = "kannada" if dictionary == "kittel" else "devanagari"
    stoplist = "-.()?"
    slp1 = sanscript.transliterate(word, source_script, "slp1")
    new = "".join(c for c in slp1 if c not in stoplist and c in alphabet)
    if dictionary in ("ghatage", "kittel"):
        nasal_rules = (
            (r"M([kKgG]+)", r"N\1"),
            (r"M([cCjJ]+)", r"Y\1"),
            (r"M([wWqQ]+)", r"R\1"),
            (r"M([tTdD]+)", r"n\1"),
            # The labial class must include "B" as well as "b"; with only
            # "b" listed, M before B was left unconverted.
            (r"M([pPbB]+)", r"m\1"),
        )
        for pattern, replacement in nasal_rules:
            new = re.sub(pattern, replacement, new)
    return new
if __name__ == "__main__":
    # Walk the headwords of the input file in order and report every
    # entry that breaks the dictionary's alphabetical order relative to
    # its two neighbours.
    if len(sys.argv) > 1:
        inputfile = sys.argv[1]
        # The dictionaries contain Devanagari/Kannada text; decode as
        # UTF-8 explicitly rather than using the platform default.
        with open(inputfile, "r", encoding="utf-8") as textfile:
            text = textfile.read()
        page = ""
        col = ""
        thisentry = ""
        preventry = ""
        prevpreventry = ""
        hw = r'.*<hw0>(.*)</hw0>.*'
        dictionary = "ghatage"
        source_script = "devanagari"
        # ``alphabet`` stays a module-level name: normalize() reads it.
        alphabet = "aAiIuUfFxXeEoOMkKgGNcCjJYwWqQRtTdDnpPbBmyrlvSzshH"
        if "kittel" in inputfile:
            dictionary = "kittel"
            alphabet = "aAiIuUfFxX1eE2oOMkKgGNcCjJYw3W4qQRtTdDnpPbBmyrVlvSzshHLZ"
            source_script = "kannada"
        elif "dasa" in inputfile:
            dictionary = "dasa"
            alphabet = "aAiIuUfFxXeEoOM~kKgGNcCjJYwWq3Q4RtTdDnpPbBmyrlvSzshH"
        else:  # otherwise the dictionary is ghatage
            hw = r'.*<hw>(.*)</hw>.*'
        # A set gives constant-time membership tests in the loop below.
        exceptions = set(load_exceptions(dictionary))

        # Compile the line patterns once instead of on every line.
        page_break = re.compile(r'.*<pb n="(.*?)"/>.*')
        col_break = re.compile(r'.*<col n="(.*?)"/>.*')
        headword = re.compile(hw)

        def collation_key(word):
            """Map a headword to the list of its letters' positions in ``alphabet``."""
            return [alphabet.index(c) for c in normalize(word, dictionary)]

        def in_order(words):
            """Return True when *words* are in non-decreasing collation order.

            Equivalent to comparing the list with its stable ``sorted``
            copy under ``collation_key``.
            """
            keys = [collation_key(w) for w in words]
            return all(a <= b for a, b in zip(keys, keys[1:]))

        for line in text.splitlines():
            pb = page_break.match(line)
            cb = col_break.match(line)
            hwre = headword.match(line)
            if pb:
                page = int(pb.group(1))
            if cb:
                col = int(cb.group(1))
            if hwre:
                thisentry = hwre.group(1)
                if thisentry not in exceptions:
                    if preventry != "" and prevpreventry != "":
                        # Only blame the middle entry when its two
                        # neighbours are themselves in order.
                        if in_order([prevpreventry, thisentry]) and not in_order(
                            [prevpreventry, preventry, thisentry]
                        ):
                            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
                            previous = sanscript.transliterate(prevpreventry,source_script,"iso")
                            following = sanscript.transliterate(thisentry,source_script,"iso")
                            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order (previous is "+previous+", following is "+following+").")
                    # NOTE(review): indentation was lost in the copy under
                    # review; the window is assumed to advance only for
                    # entries that are not listed as exceptions -- confirm.
                    prevpreventry = preventry
                    preventry = thisentry

        # The last entry has no following neighbour, so compare it with
        # its predecessor only.
        # NOTE(review): assumed to run once after the loop -- confirm.
        if not in_order([prevpreventry, preventry]):
            error = preventry+" ("+sanscript.transliterate(preventry,source_script,"iso")+")"
            print("Page "+str(page)+", column "+str(col)+": "+error+" is in the wrong alphabetical order.")
    else:
        print("No input file specified. Usage: check.py INPUTFILE.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment