# aso2101/verify_gatha.py

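## verify_gatha.py
# -*- coding: utf-8 -*-

""" Usage: python3 verify_gatha.py FILENAME """
""" Results in FILENAME.log (metrical data, with any errors reported inline)
            and FILENAME-syllables-line1.csv / FILENAME-syllables-line2.csv
            (syllable-count tallies), where FILENAME is taken without its
            extension """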

""" Take a file in UTF-8 encoding, in the ISO-15919 transliteration
    scheme, and try to scan its verses and match them against the
    canonical pattern of the Prakrit gāthā. If there are any errors,
    record them in the log file. """

""" I have explicitly not done anything about transliteration.
    sanscript.py works wonderfully, so I will start on the assumption
    that the file uses the ISO-15919 transliteration scheme. As a
    consequence, this script uses a short subroutine to replace aspirates
    with the capitalized versions thereof (e.g., ḍh with Ḍ) for ease of
    processing."""

""" For the NUMBERING, the following convention is followed:
    FIRST LINE
    SECOND LINE - NUMBER
    Thus the script looks for ARABIC NUMERALS and interprets them
    as the verse number, and interprets the line in which they occur
    as the second line in a two-line verse. """

import sys
import os
import re
import collections
import csv

# --- Input and output handles ----------------------------------------------
# The input is documented as UTF-8 and the log echoes the verse text together
# with non-ASCII labels, so both files are opened with an explicit encoding
# rather than whatever the platform default happens to be.
f = open(sys.argv[1], 'r', encoding='utf-8')
logFile = open(os.path.splitext(sys.argv[1])[0] + '.log', 'w', encoding='utf-8')

# --- Running state shared by the functions below ---------------------------
log = ''            # text accumulated for the .log file, written at the end
errorCount = 0      # number of metrical errors reported so far
totalsyllables = 0  # syllables counted over every scanned line

# --- Character classes -----------------------------------------------------
digits = re.compile(r'(\d+)')                  # a run of Arabic numerals
vowels = re.compile(r'[aiuïüāĩũãīūeoēō]')      # every character treated as a vowel
longvowels = re.compile(r'[āīūēō]')            # vowels that are heavy by themselves

# Tallies of "syllables per line" -> "number of lines", kept separately for
# the first and the second line of the verses.  Keys are stored as strings.
firstsylls = dict()
secondsylls = dict()

# Maps a verse number (string) to a two-element list [first line, second line].
# OrderedDict keeps the verses in the order they were read on any Python 3
# release (plain dicts only guarantee insertion order from 3.7 onwards).
verses = collections.OrderedDict()

def replaceAspirates(string):
    """Collapse every two-letter aspirate into one capital letter.

    The transliteration spells aspirated stops as digraphs ("kh", "ḍh", ...).
    Rewriting each digraph as a single capital lets later processing treat
    it as one consonant.  An explicit table is used because, as the original
    author notes, upper() does not work on ḍ, ṭ, and ṇ.

    The nasal digraphs "ṇh", "nh" and "mh" are deliberately not in the
    table, since they will often make a syllable long.

    Returns the rewritten copy of *string*.
    """
    capital_for = {
        "kh": "K", "gh": "G", "ch": "C", "jh": "J", "ṭh": "Ṭ",
        "ḍh": "Ḍ", "th": "T", "dh": "D", "ph": "P", "bh": "B",
    }
    # Every digraph ends in "h" and none starts with it, so matches can never
    # overlap: one left-to-right pass replaces exactly what successive
    # str.replace calls would.
    any_aspirate = "|".join(capital_for)
    return re.sub(any_aspirate, lambda found: capital_for[found.group(0)], string)

def restoreAspirates(string):
    """Expand the capital placeholders back into two-letter aspirates.

    This is the inverse of the digraph-to-capital substitution: "K" becomes
    "kh", "Ḍ" becomes "ḍh", and so on.  Characters that are not placeholders
    are returned unchanged.

    The table previously mapped "J" to "j" and "T" to "t", which silently
    dropped the aspiration of "jh" and "th"; both now round-trip correctly.
    """
    aspirates = { "K":"kh", "G":"gh", "C":"ch", "J":"jh", "Ṭ":"ṭh", "Ḍ":"ḍh", "T":"th", "D":"dh", "P":"ph", "B":"bh" }
    output = string
    # The replacements are all lowercase, so an earlier substitution can never
    # create a new placeholder for a later one to pick up.
    for placeholder, digraph in aspirates.items():
        output = output.replace(placeholder, digraph)
    return output

def writeError(verse,line,ganano,gana):
    """Append one numbered error entry to the global log text.

    All four arguments are concatenated directly into the message, so they
    must already be strings: the verse reference, the line number within
    the verse, the position of the offending gaṇa and its light/heavy
    pattern.

    Increments the global ``errorCount`` and extends the global ``log``;
    returns nothing.
    """
    global errorCount, log
    errorCount += 1
    heading = '\nERROR NUMBER '+str(errorCount)+':\n'
    detail = 'Gaṇa number '+ganano+' in line '+line+' of verse '+verse+' has the incorrect form '+gana+'.\n\n'
    log += heading + detail

def _gana_rule(position, line):
    """Return ``(threshold, allowed)`` for the gaṇa at 1-based *position*.

    *threshold* is the mora count at which the gaṇa being collected counts
    as complete; *allowed* lists the light/heavy patterns accepted in that
    slot.  *line* (1 or 2) only matters for the sixth gaṇa.
    """
    if position % 2 == 1:
        # odd slots: any four-mora pattern except LGL
        return 4, ('GG', 'GLL', 'LLG', 'LLLL')
    if position == 6 and line == 2:
        # sixth slot of the second line: a single light syllable
        return 1, ('L',)
    if position == 6 and line == 1:
        # sixth slot of the first line: LGL or four lights
        return 4, ('LGL', 'LLLL')
    if position == 8:
        # eighth slot of either line: a single heavy syllable
        return 2, ('G',)
    # every other even slot: any four-mora pattern, LGL included
    return 4, ('LGL', 'LLLL', 'GG', 'LLG', 'GLL')


def scan(string,line,reference,original):
    """Scan one line of a verse, split it into gaṇas and log the outcome.

    Parameters
      string    -- the line to scan, with aspirates already collapsed to
                   single capitals and (preferably) spaces and hyphens removed
      line      -- 1 for the first line of the verse, 2 for the second
      reference -- the verse number, as a string (it is concatenated into
                   the log text)
      original  -- the unprocessed text of the line, echoed into the log

    Syllable weight: a long vowel is heavy; a short vowel is heavy when it
    is followed by ṁ, by two non-vowel characters, or by the end of the
    line (with or without one trailing character); otherwise it is light.
    A light syllable counts one mora, a heavy one two.

    Side effects: appends to the global ``log``; on the first gaṇa whose
    pattern is not accepted, calls writeError and stops scanning the line;
    bumps the tally for this line's syllable count in ``firstsylls`` or
    ``secondsylls``; adds the count to the global ``totalsyllables``.
    Returns nothing.
    """
    global log
    global totalsyllables
    ganas = [ ]        # patterns of the gaṇas completed so far
    ganapattern = ''   # pattern (Gs and Ls) of the gaṇa being collected
    ganatext = ''      # text of the gaṇa being collected
    linepattern = ''   # bracketed patterns of all completed gaṇas
    linebyganas = ''   # bracketed text of all completed gaṇas
    nosyllables = 0
    length = len(string)
    index = 0
    log = log + 'Verse '+reference+', line '+str(line)+': '+original+'\n'
    while index < length:
        thisletter = string[index]
        nextletter = string[index+1] if index + 1 < length else None
        # Only vowels contribute to the pattern; the characters around them
        # merely decide whether the syllable is light or heavy.
        if not vowels.match(thisletter):
            if nextletter is None:
                # a trailing non-vowel ends the scan
                break
            if vowels.match(nextletter):
                # syllable onset: show it in the text with its aspiration
                ganatext = ganatext + restoreAspirates(thisletter)
        else:
            nosyllables += 1
            if longvowels.match(thisletter) or nextletter is None:
                # inherently long, or the very last character of the line
                ganatext = ganatext + thisletter
                ganapattern = ganapattern + 'G'
            elif vowels.match(nextletter):
                # V V = L
                ganatext = ganatext + thisletter
                ganapattern = ganapattern + 'L'
            elif nextletter == 'ṁ':
                # V ṁ = G
                ganatext = ganatext + thisletter + 'ṁ'
                ganapattern = ganapattern + 'G'
            elif index + 2 < length and vowels.match(string[index+2]):
                # V CV = L
                ganatext = ganatext + thisletter
                ganapattern = ganapattern + 'L'
            else:
                # V CC, or V C at the end of the line = G
                ganatext = ganatext + thisletter + nextletter
                ganapattern = ganapattern + 'G'
        # Now check whether the gaṇa being collected is complete.
        position = len(ganas) + 1
        threshold, allowed = _gana_rule(position, line)
        moras = ganapattern.count('L') + 2 * ganapattern.count('G')
        if moras >= threshold:
            if ganapattern not in allowed:
                writeError(reference, str(line), str(position), ganapattern)
                break
            ganas.append(ganapattern)
            linebyganas = linebyganas + '[' + ganatext + ']'
            linepattern = linepattern + '[' + ganapattern + ']'
            ganapattern = ''
            ganatext = ''
        index += 1
    log = log+'     '+linebyganas+'\n'
    log = log+'     '+linepattern+'\n'
    log = log+'     number of syllables: '+str(nosyllables)+'\n'
    # Tally how many lines have this syllable count (keys are strings).
    if line == 1:
        tally = firstsylls
    elif line == 2:
        tally = secondsylls
    else:
        tally = None
    if tally is not None:
        key = str(nosyllables)
        tally[key] = tally.get(key, 0) + 1
    totalsyllables += nosyllables

# Collect the verses.  A line containing Arabic numerals is the second line
# of a verse and the numerals are the verse number; the nearest preceding
# non-blank line is its first line.  Lines are paired by this rule rather
# than by reading the file two lines at a time, so a blank separator or a
# stray line no longer shifts the pairing of every verse after it.
prevLine = ''
for currentLine in f:
    if not currentLine.strip():
        # blank lines carry no text and never belong to a verse
        continue
    match = digits.search(currentLine)
    if match and prevLine:
        firstline = prevLine.lower().strip()
        # drop the verse number and the " -" that separates it from the text
        secondline = digits.sub('',currentLine).lower().strip()
        secondline = secondline.replace(' -','')
        verses[match.group(0)] = [ firstline, secondline ]
        # a numbered line closes its verse; it is never reused as a first line
        prevLine = ''
    else:
        prevLine = currentLine
# nothing reads the input after this point
f.close()

# Scan both lines of every verse, in the order the verses were read.
# scan() is given the compacted text (aspirates collapsed, spaces and hyphens
# removed), the line number within the verse (1 or 2), the verse number, and
# the original text of the line with spaces and hyphens still present.
for number in verses:
    verse = verses[number]
    firstline = replaceAspirates(verse[0]).replace('-','').replace(' ','')
    scan(firstline, 1, number, verse[0])
    secondline = replaceAspirates(verse[1]).replace('-','').replace(' ','')
    scan(secondline, 2, number, verse[1])

# Summary header of the log file.  errorCount is an integer, so it has to be
# compared with 0; comparing it with the string '0' was never true and made
# the "no errors" branch unreachable.
if errorCount == 0:
    logFile.write('No metrical errors detected.\n')
    print('No metrical errors detected.')
else:
    logFile.write(str(errorCount)+' errors detected. See ERROR lines below for details.\n')
    print(str(errorCount)+' errors detected. See the log file for details.')
# the syllable total is reported whether or not errors were found
logFile.write(str(totalsyllables)+' syllables scanned.\n\n')

# Body of the log: everything accumulated while scanning.
logFile.write(log)
logFile.close()
print(firstsylls.keys())
print(firstsylls)

# Write one tally file per verse line: "syllables in the line" against
# "number of lines with that many syllables".  The tally keys are strings,
# so they are ordered by their integer value (a plain sort would put "10"
# before "9"), and both files get the same header row.
with open(os.path.splitext(sys.argv[1])[0] + '-syllables-line1.csv','w') as csvoutput:
    firstline = collections.OrderedDict(sorted(firstsylls.items(), key=lambda item: int(item[0])))
    csvoutput.write("No. of syllables,Instances\n")
    for key, value in firstline.items():
        csvoutput.write(key+","+str(value)+"\n")
with open(os.path.splitext(sys.argv[1])[0] + '-syllables-line2.csv','w') as csvoutput:
    secondline = collections.OrderedDict(sorted(secondsylls.items(), key=lambda item: int(item[0])))
    csvoutput.write("No. of syllables,Instances\n")
    for key, value in secondline.items():
        csvoutput.write(key+","+str(value)+"\n")
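	# -*- coding: utf-8 -*-

	""" Usage: python3 verify_gatha.py FILENAME """
	""" Results in FILENAME.log (metrical data, with any errors reported inline)
	and FILENAME-syllables-line1.csv / FILENAME-syllables-line2.csv
	(syllable-count tallies), where FILENAME is taken without its
	extension """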

	""" Take a file in UTF-8 encoding, in the ISO-15919 transliteration
	scheme, and try to scan its verses and match them against the
	canonical pattern of the Prakrit gāthā. If there are any errors,
	record them in the log file. """

	""" I have explicitly not done anything about transliteration.
	sanscript.py works wonderfully, so I will start on the assumption
	that the file uses the ISO-15919 transliteration scheme. As a
	consequence, this script uses a short subroutine to replace aspirates
	with the capitalized versions thereof (e.g., ḍh with Ḍ) for ease of
	processing."""

	""" For the NUMBERING, the following convention is followed:
	FIRST LINE
	SECOND LINE - NUMBER
	Thus the script looks for ARABIC NUMERALS and interprets them
	as the verse number, and interprets the line in which they occur
	as the second line in a two-line verse. """

	import sys
	import os
	import re
	import collections
	import csv

	# --- Input and output handles ----------------------------------------------
	# The input is documented as UTF-8 and the log echoes the verse text together
	# with non-ASCII labels, so both files are opened with an explicit encoding
	# rather than whatever the platform default happens to be.
	f = open(sys.argv[1], 'r', encoding='utf-8')
	logFile = open(os.path.splitext(sys.argv[1])[0] + '.log', 'w', encoding='utf-8')

	# --- Running state shared by the functions below ---------------------------
	log = ''            # text accumulated for the .log file, written at the end
	errorCount = 0      # number of metrical errors reported so far
	totalsyllables = 0  # syllables counted over every scanned line

	# --- Character classes -----------------------------------------------------
	digits = re.compile(r'(\d+)')                  # a run of Arabic numerals
	vowels = re.compile(r'[aiuïüāĩũãīūeoēō]')      # every character treated as a vowel
	longvowels = re.compile(r'[āīūēō]')            # vowels that are heavy by themselves

	# Tallies of "syllables per line" -> "number of lines", kept separately for
	# the first and the second line of the verses.  Keys are stored as strings.
	firstsylls = dict()
	secondsylls = dict()

	# Maps a verse number (string) to a two-element list [first line, second line].
	# OrderedDict keeps the verses in the order they were read on any Python 3
	# release (plain dicts only guarantee insertion order from 3.7 onwards).
	verses = collections.OrderedDict()

	def replaceAspirates(string):
	    """Collapse every two-letter aspirate into one capital letter.

	    The transliteration spells aspirated stops as digraphs ("kh", "ḍh", ...).
	    Rewriting each digraph as a single capital lets later processing treat
	    it as one consonant.  An explicit table is used because, as the original
	    author notes, upper() does not work on ḍ, ṭ, and ṇ.

	    The nasal digraphs "ṇh", "nh" and "mh" are deliberately not in the
	    table, since they will often make a syllable long.

	    Returns the rewritten copy of *string*.
	    """
	    capital_for = {
	        "kh": "K", "gh": "G", "ch": "C", "jh": "J", "ṭh": "Ṭ",
	        "ḍh": "Ḍ", "th": "T", "dh": "D", "ph": "P", "bh": "B",
	    }
	    # Every digraph ends in "h" and none starts with it, so matches can never
	    # overlap: one left-to-right pass replaces every occurrence.
	    any_aspirate = "|".join(capital_for)
	    return re.sub(any_aspirate, lambda found: capital_for[found.group(0)], string)

	def restoreAspirates(string):
	    """Expand the capital placeholders back into two-letter aspirates.

	    This is the inverse of the digraph-to-capital substitution: "K" becomes
	    "kh", "Ḍ" becomes "ḍh", and so on.  Characters that are not placeholders
	    are returned unchanged.

	    The table previously mapped "J" to "j" and "T" to "t", which silently
	    dropped the aspiration of "jh" and "th"; both now round-trip correctly.
	    """
	    aspirates = { "K":"kh", "G":"gh", "C":"ch", "J":"jh", "Ṭ":"ṭh", "Ḍ":"ḍh", "T":"th", "D":"dh", "P":"ph", "B":"bh" }
	    output = string
	    # The replacements are all lowercase, so an earlier substitution can never
	    # create a new placeholder for a later one to pick up.
	    for placeholder, digraph in aspirates.items():
	        output = output.replace(placeholder, digraph)
	    return output

	def writeError(verse,line,ganano,gana):
	    """Append one numbered error entry to the global log text.

	    All four arguments are concatenated directly into the message, so they
	    must already be strings: the verse reference, the line number within
	    the verse, the position of the offending gaṇa and its light/heavy
	    pattern.

	    Increments the global ``errorCount`` and extends the global ``log``;
	    returns nothing.
	    """
	    global errorCount
	    global log
	    errorCount += 1
	    errors = '\nERROR NUMBER '+str(errorCount)+':\n'
	    errors = errors + 'Gaṇa number '+ganano+' in line '+line+' of verse '+verse+' has the incorrect form '+gana+'.\n\n'
	    log = log + errors

	def _gana_rule(position, line):
	    """Return ``(threshold, allowed)`` for the gaṇa at 1-based *position*.

	    *threshold* is the mora count at which the gaṇa being collected counts
	    as complete; *allowed* lists the light/heavy patterns accepted in that
	    slot.  *line* (1 or 2) only matters for the sixth gaṇa.
	    """
	    if position % 2 == 1:
	        # odd slots: any four-mora pattern except LGL
	        return 4, ('GG', 'GLL', 'LLG', 'LLLL')
	    if position == 6 and line == 2:
	        # sixth slot of the second line: a single light syllable
	        return 1, ('L',)
	    if position == 6 and line == 1:
	        # sixth slot of the first line: LGL or four lights
	        return 4, ('LGL', 'LLLL')
	    if position == 8:
	        # eighth slot of either line: a single heavy syllable
	        return 2, ('G',)
	    # every other even slot: any four-mora pattern, LGL included
	    return 4, ('LGL', 'LLLL', 'GG', 'LLG', 'GLL')

	def scan(string,line,reference,original):
	    """Scan one line of a verse, split it into gaṇas and log the outcome.

	    Parameters
	      string    -- the line to scan, with aspirates already collapsed to
	                   single capitals and (preferably) spaces and hyphens removed
	      line      -- 1 for the first line of the verse, 2 for the second
	      reference -- the verse number, as a string (it is concatenated into
	                   the log text)
	      original  -- the unprocessed text of the line, echoed into the log

	    Syllable weight: a long vowel is heavy; a short vowel is heavy when it
	    is followed by ṁ, by two non-vowel characters, or by the end of the
	    line (with or without one trailing character); otherwise it is light.
	    A light syllable counts one mora, a heavy one two.

	    Side effects: appends to the global ``log``; on the first gaṇa whose
	    pattern is not accepted, calls writeError and stops scanning the line;
	    bumps the tally for this line's syllable count in ``firstsylls`` or
	    ``secondsylls``; adds the count to the global ``totalsyllables``.
	    Returns nothing.
	    """
	    global log
	    global totalsyllables
	    ganas = [ ]        # patterns of the gaṇas completed so far
	    ganapattern = ''   # pattern (Gs and Ls) of the gaṇa being collected
	    ganatext = ''      # text of the gaṇa being collected
	    linepattern = ''   # bracketed patterns of all completed gaṇas
	    linebyganas = ''   # bracketed text of all completed gaṇas
	    nosyllables = 0
	    length = len(string)
	    index = 0
	    log = log + 'Verse '+reference+', line '+str(line)+': '+original+'\n'
	    while index < length:
	        thisletter = string[index]
	        nextletter = string[index+1] if index + 1 < length else None
	        # Only vowels contribute to the pattern; the characters around them
	        # merely decide whether the syllable is light or heavy.
	        if not vowels.match(thisletter):
	            if nextletter is None:
	                # a trailing non-vowel ends the scan
	                break
	            if vowels.match(nextletter):
	                # syllable onset: show it in the text with its aspiration
	                ganatext = ganatext + restoreAspirates(thisletter)
	        else:
	            nosyllables += 1
	            if longvowels.match(thisletter) or nextletter is None:
	                # inherently long, or the very last character of the line
	                ganatext = ganatext + thisletter
	                ganapattern = ganapattern + 'G'
	            elif vowels.match(nextletter):
	                # V V = L
	                ganatext = ganatext + thisletter
	                ganapattern = ganapattern + 'L'
	            elif nextletter == 'ṁ':
	                # V ṁ = G
	                ganatext = ganatext + thisletter + 'ṁ'
	                ganapattern = ganapattern + 'G'
	            elif index + 2 < length and vowels.match(string[index+2]):
	                # V CV = L
	                ganatext = ganatext + thisletter
	                ganapattern = ganapattern + 'L'
	            else:
	                # V CC, or V C at the end of the line = G
	                ganatext = ganatext + thisletter + nextletter
	                ganapattern = ganapattern + 'G'
	        # Now check whether the gaṇa being collected is complete.
	        position = len(ganas) + 1
	        threshold, allowed = _gana_rule(position, line)
	        moras = ganapattern.count('L') + 2 * ganapattern.count('G')
	        if moras >= threshold:
	            if ganapattern not in allowed:
	                writeError(reference, str(line), str(position), ganapattern)
	                break
	            ganas.append(ganapattern)
	            linebyganas = linebyganas + '[' + ganatext + ']'
	            linepattern = linepattern + '[' + ganapattern + ']'
	            ganapattern = ''
	            ganatext = ''
	        index += 1
	    log = log+'     '+linebyganas+'\n'
	    log = log+'     '+linepattern+'\n'
	    log = log+'     number of syllables: '+str(nosyllables)+'\n'
	    # Tally how many lines have this syllable count (keys are strings).
	    if line == 1:
	        tally = firstsylls
	    elif line == 2:
	        tally = secondsylls
	    else:
	        tally = None
	    if tally is not None:
	        key = str(nosyllables)
	        tally[key] = tally.get(key, 0) + 1
	    totalsyllables += nosyllables

	# Collect the verses.  A line containing Arabic numerals is the second line
	# of a verse and the numerals are the verse number; the nearest preceding
	# non-blank line is its first line.  Lines are paired by this rule rather
	# than by reading the file two lines at a time, so a blank separator or a
	# stray line no longer shifts the pairing of every verse after it.
	prevLine = ''
	for currentLine in f:
	    if not currentLine.strip():
	        # blank lines carry no text and never belong to a verse
	        continue
	    match = digits.search(currentLine)
	    if match and prevLine:
	        firstline = prevLine.lower().strip()
	        # drop the verse number and the " -" that separates it from the text
	        secondline = digits.sub('',currentLine).lower().strip()
	        secondline = secondline.replace(' -','')
	        verses[match.group(0)] = [ firstline, secondline ]
	        # a numbered line closes its verse; it is never reused as a first line
	        prevLine = ''
	    else:
	        prevLine = currentLine
	# nothing reads the input after this point
	f.close()

	# Scan both lines of every verse, in the order the verses were read.
	for number, verse in verses.items():
	    firstline = replaceAspirates(verse[0]).replace(' ','').replace('-','')
	    secondline = replaceAspirates(verse[1]).replace(' ','').replace('-','')
	    # scan takes four parameters: the string to scan (preferably with spaces and hyphens removed),
	    # the line number (1 or 2), the number of the verse, and the original text of the verse
	    # (with hyphens and spaces still present)
	    scan(firstline,1,number,verse[0])
	    scan(secondline,2,number,verse[1])

	# Summary header of the log file.  errorCount is an integer, so it has to be
	# compared with 0; comparing it with the string '0' was never true and made
	# the "no errors" branch unreachable.
	if errorCount == 0:
	    logFile.write('No metrical errors detected.\n')
	    print('No metrical errors detected.')
	else:
	    logFile.write(str(errorCount)+' errors detected. See ERROR lines below for details.\n')
	    print(str(errorCount)+' errors detected. See the log file for details.')
	# the syllable total is reported whether or not errors were found
	logFile.write(str(totalsyllables)+' syllables scanned.\n\n')

	# Body of the log: everything accumulated while scanning.
	logFile.write(log)
	logFile.close()
	print(firstsylls.keys())
	print(firstsylls)

	# Write one tally file per verse line: "syllables in the line" against
	# "number of lines with that many syllables".  The tally keys are strings,
	# so they are ordered by their integer value (a plain sort would put "10"
	# before "9"), and both files get the same header row.
	with open(os.path.splitext(sys.argv[1])[0] + '-syllables-line1.csv','w') as csvoutput:
	    firstline = collections.OrderedDict(sorted(firstsylls.items(), key=lambda item: int(item[0])))
	    csvoutput.write("No. of syllables,Instances\n")
	    for key, value in firstline.items():
	        csvoutput.write(key+","+str(value)+"\n")
	with open(os.path.splitext(sys.argv[1])[0] + '-syllables-line2.csv','w') as csvoutput:
	    secondline = collections.OrderedDict(sorted(secondsylls.items(), key=lambda item: int(item[0])))
	    csvoutput.write("No. of syllables,Instances\n")
	    for key, value in secondline.items():
	        csvoutput.write(key+","+str(value)+"\n")