Last active
June 14, 2019 20:01
-
-
Save aso2101/d427a48ee10d207c78a498640537f30c to your computer and use it in GitHub Desktop.
A Python script for verifying that a Prakrit gāthā is metrically correct.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
""" Usage: python3 verify_gatha.py FILENAME """
""" Results in BASENAME.log (metrical data, with any errors listed inline)
and in BASENAME-syllables-line1.csv / BASENAME-syllables-line2.csv
(syllable counts), where BASENAME is FILENAME without its extension. """
""" Take a file in UTF-8 encoding, in the ISO-15919 transliteration
scheme, and try to scan its verses and match them against the
canonical pattern of the Prakrit gāthā. If there are any errors,
record them in the log file. """
""" I have explicitly not done anything about transliteration.
sanscript.py works wonderfully, so I will start on the assumption
that the file uses the ISO-15919 transliteration scheme. As a
consequence, this script uses a short subroutine to replace aspirates
with the capitalized versions thereof (e.g., ḍh with Ḍ) for ease of
processing. """
""" For the NUMBERING, the following convention is followed:
FIRST LINE
SECOND LINE - NUMBER
Thus the script looks for ARABIC NUMERALS and interprets them
as the verse number, and interprets the line in which they occur
as the second line in a two-line verse. """
import sys | |
import os | |
import re | |
import collections | |
import csv | |
# Fail with a readable usage message instead of an IndexError traceback
# when the input file name is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: python3 verify_gatha.py FILENAME')

# The input is documented as UTF-8 and both files carry ISO-15919
# diacritics, so the encoding is stated explicitly rather than left to the
# platform's locale default.
f = open(sys.argv[1],'r',encoding='utf-8')
logFile = open(os.path.splitext(sys.argv[1])[0] + '.log','w',encoding='utf-8')

# Text accumulated for the log file; written out in one go at the end.
log = ''
# Running totals updated by writeError() and scan().
errorCount = 0
totalsyllables = 0

# Arabic numerals mark the verse number (and hence a verse's second line).
digits = re.compile(r'(\d+)')
# Any vowel, and the subset that is inherently long.
vowels = re.compile(r'[aiuïüāĩũãīūeoēō]')
longvowels = re.compile(r'[āīūēō]')

# Histograms: number of syllables (as a string) -> how many first/second
# lines had that many syllables.
firstsylls = dict()
secondsylls = dict()

# Maps the verse number to a two-element list holding the verse's two lines.
# OrderedDict is used so that verses are processed in the order in which
# they were read.
verses = collections.OrderedDict()
def replaceAspirates(string):
    """Collapse each two-letter aspirate into a single capital letter.

    The aspirated stops are written with two characters (stop + "h") in the
    transliteration; replacing each with the capitalised plain stop lets the
    scanner treat it as one consonant.  An explicit table is used because
    upper() does not work on ḍ, ṭ, and ṇ.

    The aspirated nasals ("ṇh", "nh", "mh") are deliberately left alone,
    since they will often make a syllable long.
    """
    digraph_to_capital = (
        ("kh", "K"), ("gh", "G"), ("ch", "C"), ("jh", "J"),
        ("ṭh", "Ṭ"), ("ḍh", "Ḍ"), ("th", "T"), ("dh", "D"),
        ("ph", "P"), ("bh", "B"),
    )
    converted = string
    for digraph, capital in digraph_to_capital:
        converted = converted.replace(digraph, capital)
    return converted
def restoreAspirates(string):
    """Expand the capital-letter aspirate codes back to stop + "h".

    This is the inverse of replaceAspirates() and is used when text is
    written to the log, so that the log shows the original transliteration.
    Every capital must map back to the digraph it replaced; "J" and "T"
    therefore restore to "jh" and "th", not to the unaspirated "j" and "t".
    """
    aspirates = { "K":"kh", "G":"gh", "C":"ch", "J":"jh", "Ṭ":"ṭh", "Ḍ":"ḍh", "T":"th", "D":"dh", "P":"ph", "B":"bh" }
    output = string
    for k, v in aspirates.items():
        output = output.replace(k,v)
    return output
def writeError(verse,line,ganano,gana):
    """Record one metrical error in the global log and bump the error count.

    All four arguments are strings: the verse number, the line number within
    the verse, the number of the offending gaṇa, and the pattern of Gs and Ls
    that was found there.
    """
    global errorCount
    global log
    errorCount += 1
    report = ''.join([
        '\nERROR NUMBER ', str(errorCount), ':\n',
        'Gaṇa number ', ganano,
        ' in line ', line,
        ' of verse ', verse,
        ' has the incorrect form ', gana, '.\n\n',
    ])
    log = log + report
def _gana_rule(position, line):
    """Return (moras needed, permitted patterns) for a gaṇa.

    position is the 1-based number of the gaṇa within the line and line is
    1 or 2.  A gaṇa is judged as soon as it has collected the number of
    moras given here.
    """
    if position > 8:
        # A line has exactly eight gaṇas; any further syllable is an error.
        return 1, ()
    if position % 2 == 1:
        # Odd gaṇas may not be amphibrachs (LGL).
        return 4, ('GG', 'GLL', 'LLG', 'LLLL')
    if position == 6 and line == 2:
        # The sixth gaṇa of the second line is a single light syllable.
        return 1, ('L',)
    if position == 6 and line == 1:
        return 4, ('LGL', 'LLLL')
    if position == 8:
        # The last gaṇa of either line is a single heavy syllable.
        return 2, ('G',)
    return 4, ('LGL', 'LLLL', 'GG', 'LLG', 'GLL')


def _weigh_syllable(string, index):
    """Weigh the syllable whose vowel sits at string[index].

    Returns (weight, text, absorbed): weight is 'G' (heavy) or 'L' (light),
    text is what should be shown for this syllable in the log, and absorbed
    is True when text already includes the letter that follows the vowel.
    """
    vowel = string[index]
    if longvowels.match(vowel):
        # An inherently long vowel always makes the syllable heavy.
        return 'G', vowel, False
    following = string[index+1:index+3]
    if not following:
        # The last syllable of a line counts as heavy.
        return 'G', vowel, False
    nextletter = following[0]
    if vowels.match(nextletter):
        # V V = L
        return 'L', vowel, False
    if nextletter == 'ṁ':
        # V ṁ = G
        return 'G', vowel+'ṁ', True
    if len(following) == 2 and vowels.match(following[1]):
        # V CV = L
        return 'L', vowel, False
    # V CC, or V C at the end of the line = G.  The closing consonant is
    # shown in its original spelling, not as the internal capital code.
    return 'G', vowel+restoreAspirates(nextletter), True


def scan(string,line,reference,original):
    """Scan one line of a verse and check it against the gāthā pattern.

    string    -- the line with aspirates collapsed by replaceAspirates()
                 and with spaces and hyphens removed
    line      -- 1 or 2: which line of the verse this is
    reference -- the verse number, as a string
    original  -- the line as it appeared in the file, for the log

    The line is split into gaṇas; each one is checked as soon as it is
    complete.  The first bad gaṇa is reported through writeError() and ends
    the scan of this line.  A line that runs out before its eighth gaṇa is
    complete, or that carries syllables beyond the eighth gaṇa, is reported
    as well.  The gaṇa-by-gaṇa text and pattern are appended to the global
    log, and the line's syllable count is added to the global tallies.
    Returns None.
    """
    global log
    global totalsyllables
    ganas = [ ]         # the patterns of the gaṇas completed so far
    ganapattern = ''    # the pattern of the current gaṇa (in Gs and Ls)
    ganatext = ''       # the text of the current gaṇa
    linepattern = ''    # the pattern of the whole line, gaṇa by gaṇa
    linebyganas = ''    # the text of the whole line, gaṇa by gaṇa
    nosyllables = 0
    absorbed = -1       # index of a letter already shown with its vowel
    failed = False      # whether an error has been reported for this line
    log = log + 'Verse '+reference+', line '+str(line)+': '+original+'\n'
    for index, thisletter in enumerate(string):
        if not vowels.match(thisletter):
            # Consonants never add to the pattern; they only decide whether
            # the neighbouring syllable is light or heavy.  Only a consonant
            # that opens a syllable is added to the text here, and not if it
            # was already shown as the close of the previous syllable
            # (otherwise an anusvāra before a vowel would appear twice).
            if index+1 == len(string):
                break
            if vowels.match(string[index+1]) and index != absorbed:
                ganatext = ganatext+restoreAspirates(thisletter)
            continue
        nosyllables += 1
        weight, text, closed = _weigh_syllable(string, index)
        ganapattern = ganapattern+weight
        ganatext = ganatext+text
        if closed:
            absorbed = index+1
        # now check to see if the gaṇa is complete
        position = len(ganas)+1
        needed, allowed = _gana_rule(position, line)
        moras = sum(2 if syllable == 'G' else 1 for syllable in ganapattern)
        if moras < needed:
            continue
        if ganapattern not in allowed:
            writeError(reference,str(line),str(position),ganapattern)
            failed = True
            break
        ganas.append(ganapattern)
        linebyganas = linebyganas+'['+ganatext+']'
        linepattern = linepattern+'['+ganapattern+']'
        ganapattern = ''
        ganatext = ''
    if not failed and len(ganas) < 8:
        # The line ended in the middle of the pattern: report the gaṇa that
        # was left unfinished (its pattern may be empty).
        writeError(reference,str(line),str(len(ganas)+1),ganapattern)
    log = log+' '+linebyganas+'\n'
    log = log+' '+linepattern+'\n'
    log = log+' number of syllables: '+str(nosyllables)+'\n'
    key = str(nosyllables)
    if line == 1:
        firstsylls[key] = firstsylls.get(key, 0) + 1
    elif line == 2:
        secondsylls[key] = secondsylls.get(key, 0) + 1
    totalsyllables += nosyllables
def _write_syllable_csv(path, counts):
    """Write a syllable-count histogram to path as a two-column CSV file."""
    with open(path,'w',encoding='utf-8') as csvoutput:
        csvoutput.write("No. of syllables,Instances\n")
        # The keys are numbers stored as strings; sort them numerically so
        # that, e.g., 9 comes before 10.
        for key, value in sorted(counts.items(), key=lambda item: int(item[0])):
            csvoutput.write(key+","+str(value)+"\n")

# Collect the verses.  A line containing Arabic numerals is the second line
# of a verse, and the line immediately before it is the first.  Every line is
# examined in turn (rather than reading the file in fixed pairs), so that a
# blank or stray line does not throw all the following verses out of step.
with f:
    prevLine = ''
    for currentLine in f:
        match = digits.search(currentLine)
        if match:
            firstline = prevLine.lower().strip()
            secondline = digits.sub('',currentLine).lower().strip()
            secondline = secondline.replace(' -','')
            verses[match.group(0)] = [ firstline, secondline ]
            # A numbered line is never the first line of the next verse.
            prevLine = ''
        else:
            prevLine = currentLine

for number, verse in verses.items():
    firstline = replaceAspirates(verse[0]).replace(' ','').replace('-','')
    secondline = replaceAspirates(verse[1]).replace(' ','').replace('-','')
    # scan takes four parameters: the string to scan (preferably with spaces and hyphens removed),
    # the line number (1 or 2), the number of the verse, and the original text of the verse
    # (with hyphens and spaces still present)
    scan(firstline,1,number,verse[0])
    scan(secondline,2,number,verse[1])

# errorCount is an integer, so it must be compared with 0, not with '0'.
if errorCount == 0:
    logFile.write('No metrical errors detected.\n')
    print('No metrical errors detected.')
else:
    logFile.write(str(errorCount)+' errors detected. See ERROR lines below for details.\n')
    print(str(errorCount)+' errors detected. See the log file for details.')
logFile.write(str(totalsyllables)+' syllables scanned.\n\n')
logFile.write(log)
logFile.close()

print(firstsylls.keys())
print(firstsylls)
_write_syllable_csv(os.path.splitext(sys.argv[1])[0] + '-syllables-line1.csv', firstsylls)
_write_syllable_csv(os.path.splitext(sys.argv[1])[0] + '-syllables-line2.csv', secondsylls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment