Skip to content

Instantly share code, notes, and snippets.

@aso2101

aso2101/verify_gatha.py

Last active Jun 14, 2019
Embed
What would you like to do?
A Python script for verifying that a Prakrit gāthā is metrically correct.
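# -*- coding: utf-8 -*-
""" Usage: python3 verify_gatha.py FILENAME """
""" Results in FILENAME.log (metrical data, with any errors listed inline)
and FILENAME-syllables-line1.csv / FILENAME-syllables-line2.csv
(syllable-count tallies), where FILENAME is the input name without its extension """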
""" Take a file in UTF-8 encoding, in the ISO-15919 transliteration
scheme, and try to scan its verses and match them against the
canonical pattern of the Prakrit gāthā. If there are any errors,
output them to an error file. """
""" I have explicitly not done anything about transliteration.
sanscript.py works wonderfully, so I will start on the assumption
that the file uses the ISO-15919 transliteration scheme. As a
consequence, this script uses a short subroutine to replace aspirates
with the capitalized versions thereof (e.g., ḍh with Ḍ) for ease of
processing."""
""" For the NUMBERING, the following convention is followed:
FIRST LINE
SECOND LINE - NUMBER
Thus the script looks for ARABIC NUMERALS and interprets them
as the verse number, and interprets the line in which they occur
as the second line in a two-line verse. """
import sys
import os
import re
import collections
import csv
# Refuse to run without an input file instead of dying with a bare IndexError.
if len(sys.argv) < 2:
    sys.exit('Usage: python3 verify_gatha.py FILENAME')

# The input is documented as UTF-8 and the log contains non-ASCII text
# (e.g. 'Gaṇa', 'ṁ'), so the encoding is stated explicitly rather than left
# to the platform default.
f = open(sys.argv[1],'r',encoding='utf-8')
logFile = open(os.path.splitext(sys.argv[1])[0] + '.log','w',encoding='utf-8')

# Accumulated log text; it is written to logFile in one go at the end.
log = ''
# Number of metrical errors reported through writeError.
errorCount = 0
# Total number of syllables counted by scan over all lines.
totalsyllables = 0

# A run of Arabic numerals (the verse number).
digits = re.compile(r'(\d+)')
# Any vowel, and the inherently long vowels.
vowels = re.compile(r'[aiuïüāĩũãīūeoēō]')
longvowels = re.compile(r'[āīūēō]')

# Tallies of syllables-per-line: key is the syllable count (as a string),
# value is the number of first (resp. second) lines with that count.
firstsylls = dict()
secondsylls = dict()

verses = collections.OrderedDict()
# The key is the verse number,
# and the value is a list consisting of the
# two lines.
# OrderedDict is used so that the verses are processed in the order
# in which they were added, whatever the Python version.
def replaceAspirates(string):
    """Return `string` with every two-letter aspirate collapsed to one capital.

    Each aspirate digraph (e.g. "ḍh") becomes the capitalized form of its
    non-aspirate (e.g. "Ḍ"), so that later processing can treat every
    consonant as a single character. An explicit table is used because
    upper() does not work on ḍ, ṭ, and ṇ.
    """
    # The nasal digraphs "ṇh", "nh" and "mh" are deliberately left alone,
    # since they will often make a syllable long.
    capitals = {
        "kh": "K", "gh": "G", "ch": "C", "jh": "J", "ṭh": "Ṭ",
        "ḍh": "Ḍ", "th": "T", "dh": "D", "ph": "P", "bh": "B",
    }
    # One left-to-right pass: no digraph is a prefix of another and the
    # replacements are capitals, so this equals replacing them one by one.
    digraphs = '|'.join(capitals)
    return re.sub(digraphs, lambda found: capitals[found.group(0)], string)
def restoreAspirates(string):
    """Return `string` with each capital aspirate marker expanded again.

    This undoes the single-capital encoding of the aspirates (e.g. "Ḍ"
    becomes "ḍh") so that text written to the log reads like the original
    transliteration.
    """
    # Every capital maps back to consonant + "h". "J" and "T" previously
    # mapped to plain "j" and "t", which silently dropped the aspiration.
    aspirates = {
        "K": "kh", "G": "gh", "C": "ch", "J": "jh", "Ṭ": "ṭh",
        "Ḍ": "ḍh", "T": "th", "D": "dh", "P": "ph", "B": "bh",
    }
    output = string
    for capital, digraph in aspirates.items():
        output = output.replace(capital, digraph)
    return output
def writeError(verse,line,ganano,gana):
    """Record one metrical error in the global log and bump the error count.

    All four arguments must already be strings: the verse number, the line
    number within the verse, the number of the offending gaṇa, and the
    pattern that gaṇa was found to have.
    """
    global errorCount
    global log
    errorCount += 1
    heading = ''.join(['\nERROR NUMBER ', str(errorCount), ':\n'])
    details = ''.join([
        'Gaṇa number ', ganano,
        ' in line ', line,
        ' of verse ', verse,
        ' has the incorrect form ', gana,
        '.\n\n',
    ])
    log = log + heading + details
def _ganaRule(ganano, line):
    """Return (moras needed, permitted patterns) for gaṇa `ganano` of `line`."""
    if ganano > 8:
        # A line has eight gaṇas; any further syllable is an error.
        return 1, ()
    if ganano % 2 == 1:
        # odd gaṇas
        return 4, ('GG', 'GLL', 'LLG', 'LLLL')
    if ganano == 6 and line == 2:
        # the sixth gaṇa in the second line is a single light syllable
        return 1, ('L',)
    if ganano == 6 and line == 1:
        # the sixth gaṇa in the first line
        return 4, ('LGL', 'LLLL')
    if ganano == 8:
        # the last gaṇa in either line is a single heavy syllable
        return 2, ('G',)
    # the remaining even gaṇas
    return 4, ('LGL', 'LLLL', 'GG', 'LLG', 'GLL')


def scan(string,line,reference,original):
    """Scan one line of a verse into gaṇas and check it against the gāthā.

    Parameters:
        string:    the line to scan, with aspirates already collapsed to
                   single capitals (preferably with spaces and hyphens
                   removed).
        line:      1 for the first line of the verse, 2 for the second.
        reference: the verse number, as a string (used in the log).
        original:  the original text of the line, written to the log.

    Nothing is returned. The scansion is appended to the global `log`, any
    metrical error is reported through writeError (scanning of the line
    stops at the first error), the syllable count of the line is tallied in
    `firstsylls` or `secondsylls`, and `totalsyllables` is increased.

    Besides malformed gaṇas, a line that ends in the middle of a gaṇa, or
    that has fewer or more than eight gaṇas, is reported as an error.
    """
    global log
    global totalsyllables

    ganas = []            # the patterns of the completed gaṇas
    ganapattern = ''      # the pattern of the current gaṇa (in Gs and Ls)
    ganatext = ''         # the text of the current gaṇa
    linepattern = ''      # the pattern of the whole line, gaṇa by gaṇa
    linebyganas = ''      # the text of the whole line, gaṇa by gaṇa
    nosyllables = 0
    failed = False        # True once an error has been reported for this line
    anusvaraLogged = False  # True right after a vowel wrote its 'ṁ' to ganatext
    length = len(string)

    log = log + 'Verse '+reference+', line '+str(line)+': '+original+'\n'

    for index, thisletter in enumerate(string):
        nextletter = string[index+1] if index + 1 < length else None

        # PATTERN is only written when the current letter is a vowel.
        # The consonants before and after it simply help us determine
        # whether the syllable is light or heavy.
        if not vowels.match(thisletter):
            if nextletter is None:
                # a consonant at the very end of the line: nothing left to do
                break
            # A consonant is read into the text only as the onset of the
            # following vowel. An 'ṁ' that the preceding vowel has already
            # written must not be written a second time.
            if vowels.match(nextletter) and not anusvaraLogged:
                ganatext = ganatext + restoreAspirates(thisletter)
            anusvaraLogged = False
            # The gaṇa pattern is unchanged, so there is nothing to check.
            continue

        nosyllables += 1
        anusvaraLogged = False
        if longvowels.match(thisletter):
            # an inherently long vowel makes the syllable heavy
            ganatext = ganatext + thisletter
            ganapattern = ganapattern + 'G'
        elif nextletter is None:
            # no following letter: end of the line, counted as heavy
            ganatext = ganatext + thisletter
            ganapattern = ganapattern + 'G'
        elif vowels.match(nextletter):
            # V V = L
            ganatext = ganatext + thisletter
            ganapattern = ganapattern + 'L'
        elif nextletter == 'ṁ':
            # V ṁ = G
            ganatext = ganatext + thisletter + 'ṁ'
            ganapattern = ganapattern + 'G'
            anusvaraLogged = True
        else:
            nextnextletter = string[index+2] if index + 2 < length else None
            if nextnextletter is not None and vowels.match(nextnextletter):
                # V CV = L
                ganatext = ganatext + thisletter
                ganapattern = ganapattern + 'L'
            else:
                # V CC, or V C at the end of the line = G
                ganatext = ganatext + thisletter + nextletter
                ganapattern = ganapattern + 'G'

        # now check to see if the gaṇa is complete
        moras = sum(2 if syllable == 'G' else 1 for syllable in ganapattern)
        ganano = len(ganas) + 1
        needed, permitted = _ganaRule(ganano, line)
        if moras < needed:
            continue
        if ganapattern not in permitted:
            writeError(reference,str(line),str(ganano),ganapattern)
            failed = True
            break
        ganas.append(ganapattern)
        linebyganas = linebyganas+'['+ganatext+']'
        linepattern = linepattern+'['+ganapattern+']'
        ganapattern = ''
        ganatext = ''

    # A line that ran out before its eighth gaṇa was completed is not a
    # well-formed line either, even if every completed gaṇa was fine.
    if not failed and len(ganas) < 8:
        if ganapattern:
            leftover = ganapattern + ' (incomplete)'
        else:
            leftover = '(missing)'
        writeError(reference,str(line),str(len(ganas) + 1),leftover)

    log = log+' '+linebyganas+'\n'
    log = log+' '+linepattern+'\n'
    log = log+' number of syllables: '+str(nosyllables)+'\n'

    # tally how many lines have this many syllables
    if line == 1:
        tally = firstsylls
    elif line == 2:
        tally = secondsylls
    else:
        tally = None
    if tally is not None:
        key = str(nosyllables)
        tally[key] = tally.get(key, 0) + 1
    totalsyllables += nosyllables
# Collect the verses. A line containing Arabic numerals is taken to be the
# second line of a verse, and the nearest preceding non-blank line its first
# line. Walking the file line by line (instead of reading it in rigid pairs)
# keeps the pairing correct when blank or stray lines occur between verses.
previousLine = None
for rawLine in f:
    if not rawLine.strip():
        continue
    match = digits.search(rawLine)
    if match and previousLine is not None:
        firstline = previousLine.lower().strip()
        secondline = digits.sub('',rawLine).lower().strip()
        secondline = secondline.replace(' -','')
        verses[match.group(0)] = [ firstline, secondline ]
        # a line that has been used must not become the first line of the next verse
        previousLine = None
    else:
        previousLine = rawLine
f.close()

for number, verse in verses.items():
    firstline = replaceAspirates(verse[0]).replace(' ','').replace('-','')
    secondline = replaceAspirates(verse[1]).replace(' ','').replace('-','')
    # scan takes four parameters: the string to scan (preferably with spaces and hyphens removed),
    # the line number (1 or 2), the number of the verse, and the original text of the verse
    # (with hyphens and spaces still present)
    scan(firstline,1,number,verse[0])
    scan(secondline,2,number,verse[1])

# errorCount is an integer; comparing it with the string '0' was never true,
# so the "no errors" message could not be produced.
if errorCount == 0:
    logFile.write('No metrical errors detected.\n')
    print('No metrical errors detected.')
else:
    logFile.write(str(errorCount)+' errors detected. See ERROR lines below for details.\n')
    print(str(errorCount)+' errors detected. See the log file for details.')
logFile.write(str(totalsyllables)+' syllables scanned.\n\n')
logFile.write(log)
logFile.close()

print(firstsylls.keys())
print(firstsylls)

# Write the syllable-count tallies, one CSV file per line of the verse.
# Both files get the same header, and the rows are ordered by the numeric
# value of the syllable count (the keys are strings, so a plain sort would
# put "10" before "9").
csvBase = os.path.splitext(sys.argv[1])[0]
for csvSuffix, tally in (('-syllables-line1.csv', firstsylls),
                         ('-syllables-line2.csv', secondsylls)):
    with open(csvBase + csvSuffix,'w',encoding='utf-8',newline='') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        writer.writerow(['No. of syllables', 'Instances'])
        for key, value in sorted(tally.items(), key=lambda item: int(item[0])):
            writer.writerow([key, value])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.