Skip to content

Instantly share code, notes, and snippets.

@aso2101
Last active April 6, 2019 21:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aso2101/83d418147908bcfa6a295e6c913b6ad5 to your computer and use it in GitHub Desktop.
Save aso2101/83d418147908bcfa6a295e6c913b6ad5 to your computer and use it in GitHub Desktop.
A python script for parsing Tamil verse into metrical units
# -*- coding: utf-8 -*-
""" Usage: python3 versify.py FILENAME """
""" Results in FILENAME.log (errors and statistics)
and FILENAME.json (a json file of metrically parsed text) """
""" This program expects the text to be in the format
represented by the GRETIL Kuṟuntokai
(http://gretil.sub.uni-goettingen.de/gretil/4_drav/tamil/pm/pm110__u.htm)
namely: the
poem comes on a line such as:
103. neytal - talaivi kūṟṟu
then it is followed by a space, then each of the lines
of the poem, and then
-vāyilāṉ tēvaṉār.
which gives the author. """
import sys
import os
import re
import json
import collections
# Input text and the two output files, all named after the first command-line
# argument. The encoding is pinned to UTF-8 because the text is full of
# diacritics and the JSON is written with ensure_ascii=False; relying on the
# platform default encoding fails on non-UTF-8 locales.
f = open(sys.argv[1], 'r', encoding='utf-8')
logFile = open(os.path.splitext(sys.argv[1])[0] + '.log', 'w', encoding='utf-8')
jsonFile = open(os.path.splitext(sys.argv[1])[0] + '.json', 'w', encoding='utf-8')
# Accumulated text of every error message; errorMessage appends to it and the
# main script writes it to the .log file.
log = ''
# Running counters updated by scanLine. Note that scanLine adds
# len(syllables)+1 per cīr to these, not len(syllables).
totalSyllables = 0
discardedSyllables = 0
# Number of errors reported so far; incremented by errorMessage.
errorCount = 0
# A line starting with exactly one hyphen, e.g. "-vāyilāṉ tēvaṉār."
authorline = re.compile(r'^-{1}([^-].*)') # group 1 is the author
# A heading such as "103. neytal - talaivi kūṟṟu"
titleline = re.compile(r'(\d+)\. ([^\s]+) - (.*)') # group 1 is the number, 2 is the tiṇai, and 3 is stored as "direction"
# Single characters that syllabize treats as vowels. Y and W are the
# one-character placeholders for "ai" and "au" (see replaceDigraphs).
vowels = re.compile(r'[aiuāĩũãīūeoēōYW]')
longvowels = re.compile(r'[āīūēō]')
# A syllable ending in a short vowel (or the ai/au placeholder); scanLine
# gives such syllables the quantity "L".
light = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueoYW])$')
# for the first syllable in a cīr, count ai and au as heavy (not otherwise)
initiallight = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueo])$')
# A syllable ending in short u; scanLine requires this shape for the final
# syllable of a nērpu or niraipu.
cu = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*(u)$')
# Parsed poems, filled by convertTxtToJson and extended by scanLines.
poems = []
def replaceDigraphs(string):
    """Return *string* with "ai" and "au" collapsed to the placeholders Y and W.

    Using one character per vowel lets the rest of the parser treat the
    diphthongs as single letters.
    """
    return string.replace("ai", "Y").replace("au", "W")
def restoreDigraphs(string):
    """Return *string* with the placeholders Y and W expanded to "ai" and "au"."""
    expansions = str.maketrans({"Y": "ai", "W": "au"})
    return string.translate(expansions)
def convertTxtToJson(textfile):
    """Read poems from *textfile* and append them to the global ``poems`` list.

    A poem starts at a heading matched by ``titleline`` and ends at the
    author line matched by ``authorline``. Every non-blank line in between is
    stored, stripped and with full stops removed, as one line of the poem.
    Each appended poem is an OrderedDict with the keys ``"lines"`` (list of
    strings) and ``"metadata"`` (number, landscape, direction, author).

    textfile -- an iterable of text lines, e.g. an open file.
    """
    metadata = collections.OrderedDict()
    lines = []
    for currentLine in textfile:
        numbermatch = titleline.search(currentLine)
        authormatch = authorline.search(currentLine)
        if numbermatch:  # heading: start collecting metadata
            metadata["number"] = numbermatch.group(1)
            metadata["landscape"] = numbermatch.group(2)
            metadata["direction"] = numbermatch.group(3)
        # An author line closes the current poem. A hyphen-initial line seen
        # before any heading does not belong to a poem, so it is ignored
        # instead of producing an empty record without a number.
        if authormatch and "number" in metadata:
            metadata["author"] = authormatch.group(1).replace('.', '')
            poem = collections.OrderedDict()
            poem["lines"] = lines
            poem["metadata"] = metadata
            poems.append(poem)
            metadata = collections.OrderedDict()
            lines = []
        if authormatch is None and numbermatch is None:
            # Verse text only counts once a heading has been seen.
            text = currentLine.strip()
            if text and "number" in metadata:
                lines.append(text.replace('.', ''))
def errorMessage(cir,cirnumber,poemnumber,linenumber):
    """Record one unparseable cīr in the global error log.

    Increments ``errorCount`` and appends a one-line message to ``log``.
    All three position arguments are expected to be strings already; *cir*
    is the placeholder-encoded text of the cīr.
    """
    global log, errorCount
    errorCount += 1
    entry = ''.join([
        'Error ', str(errorCount),
        ': Poem no. ', poemnumber,
        ', line ', linenumber,
        ', cīr no. ', cirnumber,
        ': ', restoreDigraphs(cir),
        '\n',
    ])
    log = log + entry
# A cīr-final consonant followed by a vowel-initial cīr: the consonant is
# moved across the space so that it becomes the onset of the following vowel.
_sandhi = re.compile('([kṅcñṭṇtnpmyrlvḷḻṟṉ]) ([aāiīuūeēoōYW])')


def _acai(syllables, acaitype, quantity):
    """Build one acai record from placeholder-encoded *syllables*."""
    return {
        "syllables": [restoreDigraphs(syllable) for syllable in syllables],
        "type": acaitype,
        "quantity": quantity,
    }


def _scanTwoSyllables(syllables):
    """Scan a two-syllable cīr; it must be nēr-nēr."""
    first = "L" if initiallight.search(syllables[0]) else "G"
    second = "L" if light.search(syllables[1]) else "G"
    return [
        _acai(syllables[:1], "nēr", first),
        _acai(syllables[1:], "nēr", second),
    ], True


def _scanThreeSyllables(syllables):
    """Scan a three-syllable cīr: nirai-nēr, nēr-nirai or nēr-nērpu.

    nērpu-nēr is identical to nēr-nirai, so only nēr-nirai is entertained.
    """
    secondlight = light.search(syllables[1])
    thirdlight = light.search(syllables[2])
    # if the first is light, then the first acai must be nirai
    # and the third syllable makes up a nēr
    if initiallight.search(syllables[0]):
        return [
            _acai(syllables[:2], "nirai", "LL" if secondlight else "LG"),
            _acai(syllables[2:], "nēr", "L" if thirdlight else "G"),
        ], True
    # a heavy first syllable is taken to be a nēr
    first = _acai(syllables[:1], "nēr", "G")
    # after a nēracai, only a nērpu or a nirai can follow
    if secondlight:
        return [first, _acai(syllables[1:], "nirai", "LL" if thirdlight else "LG")], True
    if cu.search(syllables[2]):
        return [first, _acai(syllables[1:], "nērpu", "Gu")], True
    return [first, {}], False


def _scanFourSyllables(syllables):
    """Scan a four-syllable cīr.

    The possibilities are:
    - nērpu-nirai (GL-LX)
    - nēr-niraipu (G-LXL)
    - nirai-nirai (LX-LX)
    - nirai-nērpu (LX-GL)
    - niraipu-nēr (LXL-G), which is treated as nirai-nirai here
    """
    secondlight = light.search(syllables[1])
    thirdlight = light.search(syllables[2])
    fourthlight = light.search(syllables[3])
    if initiallight.search(syllables[0]):
        # a light first syllable opens a nirai
        first = _acai(syllables[:2], "nirai", "LL" if secondlight else "LG")
        # from an initial nirai, the second acai is nirai if the third
        # syllable is light, and otherwise ought to be nērpu
        if thirdlight:
            return [first, _acai(syllables[2:], "nirai", "LL" if fourthlight else "LG")], True
        if cu.search(syllables[3]):
            return [first, _acai(syllables[2:], "nērpu", "Gu")], True
        return [first, {}], False
    # a heavy first syllable is either nēr (followed by niraipu) or the start
    # of a nērpu (followed by nirai); both need a light second syllable
    if not secondlight:
        return [{}, {}], False
    # a light fourth syllable in Cu: take it to be nēr-niraipu
    if fourthlight and cu.search(syllables[3]):
        return [
            _acai(syllables[:1], "nēr", "G"),
            _acai(syllables[1:], "niraipu", "LLu" if thirdlight else "LGu"),
        ], True
    # otherwise it must be nērpu-nirai, so the second syllable must be Cu
    if not cu.search(syllables[1]):
        return [{}, {}], False
    first = _acai(syllables[:2], "nērpu", "Gu")
    if not thirdlight:
        return [first, {}], False
    return [first, _acai(syllables[2:], "nirai", "LL" if fourthlight else "LG")], True


def _scanFiveSyllables(syllables):
    """Scan a five-syllable cīr: either niraipu-nirai or nirai-niraipu."""
    # if the first syllable is heavy, then we have a problem...
    if not initiallight.search(syllables[0]):
        return [{}, {}], False
    secondlight = light.search(syllables[1])
    thirdlight = light.search(syllables[2])
    fourthlight = light.search(syllables[3])
    fifthlight = light.search(syllables[4])
    if not fifthlight:
        # a heavy fifth syllable means niraipu-nirai; make sure the third
        # syllable is Cu and the fourth is light
        if not cu.search(syllables[2]):
            return [{}, {}], False
        first = _acai(syllables[:3], "niraipu", "LLu" if secondlight else "LGu")
        if not fourthlight:
            return [first, {}], False
        return [first, _acai(syllables[3:], "nirai", "LG")], True
    # the fifth syllable is light: prefer nirai-niraipu if it is Cu
    if cu.search(syllables[4]):
        if not thirdlight:
            return [{}, {}], False
        return [
            _acai(syllables[:2], "nirai", "LL" if secondlight else "LG"),
            _acai(syllables[2:], "niraipu", "LLu" if fourthlight else "LGu"),
        ], True
    # the final syllable is not a candidate for niraipu, so the third must be
    if cu.search(syllables[2]):
        return [
            _acai(syllables[:3], "niraipu", "LLu" if secondlight else "LGu"),
            _acai(syllables[3:], "nirai", "LL" if fifthlight else "LG"),
        ], True
    return [{}, {}], False


# Scanner for each syllable count that can be divided into two acais.
_scanners = {
    2: _scanTwoSyllables,
    3: _scanThreeSyllables,
    4: _scanFourSyllables,
    5: _scanFiveSyllables,
}


def scanLine(line,poemnumber,linenumber):
    """Scan one line of verse into cīrs and acais.

    Returns a list of cīrs. Each cīr is a two-item list of acais, and each
    acai is a dictionary with the syllables in the text ("syllables"), the
    prosodic values L and G ("quantity") and the technical term ("type").
    An acai that could not be determined is left as an empty dictionary.

    line       -- text of the line, cīrs separated by single spaces
    poemnumber -- poem number as a string (used in error messages)
    linenumber -- line number as a string (used in error messages)

    Side effects: updates the global counters ``totalSyllables`` and
    ``discardedSyllables`` and reports unparseable cīrs via errorMessage.
    Cīrs with fewer than two syllables are skipped silently; cīrs with six
    or more syllables are reported as errors. Neither kind appears in the
    returned list.
    """
    global totalSyllables
    global discardedSyllables
    # Resyllabify across EVERY consonant+space+vowel junction in the line,
    # not only the first kind of junction found.
    line = _sandhi.sub(r' \1\2', replaceDigraphs(line))
    newcirs = []
    for index, cir in enumerate(line.split(' ')):
        # this will ONLY work if there are a maximum of two acais per cīr
        syllables = syllabize(cir)
        count = len(syllables)
        # NOTE(review): both counters add one extra per cīr on top of the
        # syllable count; kept as in the original, confirm whether intended.
        totalSyllables += count + 1
        scanner = _scanners.get(count)
        if scanner is None:
            if count >= 6:
                errorMessage(cir,str(index+1),poemnumber,linenumber)
                discardedSyllables += count + 1
            continue
        acais, parsed = scanner(syllables)
        if not parsed:
            errorMessage(cir,str(index+1),poemnumber,linenumber)
            discardedSyllables += count + 1
        newcirs.append(acais)
    return newcirs
def syllabize(string):
    """Return the list of syllables in *string* (one cīr, placeholder-encoded).

    Every vowel is the nucleus of one syllable. The syllable takes at most
    one preceding consonant as its onset (in Tamil we only need to check for
    ONE onset) and the following consonants as its coda, except that a
    consonant directly before the next vowel is left to be that vowel's
    onset. At most two coda consonants are taken. Any character not matched
    by ``vowels`` is treated as a consonant.
    """
    syllables = []
    for index, thisletter in enumerate(string):
        if not vowels.match(thisletter):
            continue
        # Onset: the preceding letter, if there is one and it is a consonant.
        # The index check matters: string[-1] would silently wrap around to
        # the END of the word and attach its final consonant to a
        # word-initial vowel.
        onset = ''
        if index > 0 and not vowels.match(string[index-1]):
            onset = string[index-1]
        # Coda: look at up to three following letters and collect the run of
        # consonants directly after the vowel.
        following = string[index+1:index+4]
        consonants = ''
        for letter in following:
            if vowels.match(letter):
                break
            consonants += letter
        if len(consonants) < len(following):
            # a vowel follows the run: C*V-CV or C*VC-CV, the last consonant
            # belongs to the next syllable
            coda = consonants[:-1]
        else:
            # end of the word, or a third consonant: C*V, C*VC or C*VCC
            coda = consonants[:2]
        syllables.append(onset + thisletter + coda)
    return syllables
def scanLines(poem,index):
    """Scan every line of *poem* and store the result under poem["scansion"].

    poem  -- a poem record with "lines" and "metadata" (needs "number")
    index -- position of *poem* in the global ``poems`` list

    The scanned poem is written back to ``poems[index]``.
    """
    scansion = []
    # The loop variable must not be called ``index``: that would shadow the
    # parameter and make the write-back below overwrite the wrong poem.
    for lineIndex, line in enumerate(poem["lines"]):
        scannedLine = scanLine(line,poem["metadata"]["number"],str(lineIndex+1))
        scansion.append(scannedLine)
    poem["scansion"] = scansion
    poems[index] = poem
def _newTally():
    """Return a zeroed counter of quantities per acai type."""
    return {
        "nēr": { "G": 0, "L": 0 },
        "nirai": { "LG": 0, "LL": 0 },
        "nērpu": { "Gu": 0 },
        "niraipu": { "LGu": 0, "LLu": 0 }
    }


def _percentage(part, whole):
    """Return part/whole as a percentage string rounded to 3 places.

    An empty *whole* yields '0.0' instead of raising ZeroDivisionError.
    """
    if not whole:
        return str(0.0)
    return str(round((part/whole)*100,3))


def _splitReport(heading, tally):
    """Return the report lines for one position (strong or weak)."""
    ner = [
        ("G", tally["nēr"]["G"]),
        ("L", tally["nēr"]["L"]),
        ("Gu", tally["nērpu"]["Gu"]),
    ]
    nirai = [
        ("LL", tally["nirai"]["LL"]),
        ("LG", tally["nirai"]["LG"]),
        ("LLu", tally["niraipu"]["LLu"]),
        ("LGu", tally["niraipu"]["LGu"]),
    ]
    totalner = sum(count for _, count in ner)
    totalnirai = sum(count for _, count in nirai)
    total = totalner + totalnirai
    rows = [heading]
    # Every sub-item is a share of its OWN group total in the SAME position.
    for title, group, grouptotal in (
        ("Nēr (including nērpu): ", ner, totalner),
        ("Nirai (including niraipu): ", nirai, totalnirai),
    ):
        rows.append(title+str(grouptotal)+' or '+_percentage(grouptotal, total)+'%')
        for label, count in group:
            rows.append('- '+label+' ('+str(count)+' or '+_percentage(count, grouptotal)+'%)')
    return rows


def statistics():
    """Return a text report on the scansion of all poems in ``poems``.

    The report covers the distribution of acai types and quantities in the
    strong (first) and weak (second) position of the cīr, the number of
    light nēr-acais in strong position for each of the first five cīrs of a
    line, and a table counting each pair of first and second quantity.
    Acais that could not be determined (empty dictionaries) are not counted.
    Every poem must already have a "scansion" entry.
    """
    strong = _newTally()
    weak = _newTally()
    quantities = ("G", "L", "Gu", "LL", "LG", "LLu", "LGu")
    cirdata = {
        first: { second: 0 for second in quantities }
        for first in quantities
    }
    lstrong = { "1": 0, "2": 0, "3": 0, "4": 0, "5": 0 }
    for poem in poems:
        for line in poem["scansion"]:
            for cirno, cir in enumerate(line):
                first = cir[0] if len(cir) > 0 else {}
                second = cir[1] if len(cir) > 1 else {}
                firstknown = "type" in first and "quantity" in first
                secondknown = "type" in second and "quantity" in second
                if firstknown:
                    strong[first["type"]][first["quantity"]] += 1
                    position = str(cirno+1)
                    # only the first five positions are reported; later
                    # ones are ignored instead of raising KeyError
                    if first["quantity"] == "L" and position in lstrong:
                        lstrong[position] += 1
                if secondknown:
                    weak[second["type"]][second["quantity"]] += 1
                    if firstknown:
                        cirdata[first["quantity"]][second["quantity"]] += 1
    report = ['']  # the report starts with a newline
    report.extend(_splitReport('Split in strong positions:', strong))
    report.extend(_splitReport('Split in weak positions:', weak))
    report.append('Number of nēr-acai with a L in strong position throughout the line:')
    for ordinal, position in (
        ("First", "1"), ("Second", "2"), ("Third", "3"),
        ("Fourth", "4"), ("Fifth", "5"),
    ):
        report.append('- '+ordinal+' cīr: '+str(lstrong[position]))
    report.append('Some conditional probabilities for a cīr:')
    report.append(json.dumps(cirdata,indent=4,sort_keys=True,ensure_ascii=False))
    return '\n'.join(report)
# Main script: parse the input, scan every poem, then write the log (summary,
# error messages, statistics) and the JSON dump of the scanned poems.
try:
    convertTxtToJson(f)
    for index, poem in enumerate(poems):
        scanLines(poem,index)
    logFile.write("Scanned "+str(len(poems))+" poems, with "+str(totalSyllables)+" syllables.\n")
    if discardedSyllables > 1:
        percent = round((discardedSyllables/totalSyllables)*100,3)
        logFile.write('''I was unable to parse '''+str(discardedSyllables)+''' syllables ('''+str(percent)+'''%).
Possible reasons for failure:
- The text is incorrect.
- CVR should be counted as light (the parser counts it as heavy).\n\n''')
    # log is empty when no error was reported
    logFile.write(log)
    logFile.write(statistics())
    jsonFile.write(json.dumps(poems, indent=4, sort_keys=True, ensure_ascii=False))
finally:
    # Close all three handles, including the input and JSON files, so that
    # the output is flushed even if a step above fails.
    f.close()
    logFile.close()
    jsonFile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment