aso2101/versify.py

## versify.py
# -*- coding: utf-8 -*-

""" Usage: python3 versify.py FILENAME """
""" Results in FILENAME.log (errors and statistics)
            and FILENAME.json (a json file of metrically parsed text) """

""" This program expects the text to be in the format
    used by the GRETIL Kuṟuntokai
      (http://gretil.sub.uni-goettingen.de/gretil/4_drav/tamil/pm/pm110__u.htm)
    namely: each poem begins with a title line such as:
    103. neytal - talaivi kūṟṟu
    which is followed by a blank line, then by each of the lines
    of the poem, and finally by a line such as:
    -vāyilāṉ tēvaṉār.
    which gives the author. """


import sys
import os
import re
import json
import collections

# Input text and the two output files; the output names are the input path
# with its extension swapped for .log and .json.  The encoding is pinned to
# UTF-8: the transliterated text and the JSON (dumped with ensure_ascii=False)
# contain non-ASCII letters that a platform-default codec may not handle.
f = open(sys.argv[1],'r',encoding='utf-8')
logFile = open(os.path.splitext(sys.argv[1])[0] + '.log','w',encoding='utf-8')
jsonFile = open(os.path.splitext(sys.argv[1])[0] + '.json','w',encoding='utf-8')
# Error messages accumulated by errorMessage(); written to the log at the end.
log = ''
# Running counters updated by scanLine() and errorMessage().
totalSyllables = 0
discardedSyllables = 0
errorCount = 0

# A line that starts with exactly one hyphen gives the author of the poem.
authorline = re.compile(r'^-{1}([^-].*)') # group 1 is the author
# Title line of a poem, e.g. "103. neytal - talaivi kūṟṟu".
titleline = re.compile(r'(\d+)\. ([^\s]+) - (.*)') # group 1 is the number, 2 is the tiṇai, and 3 is what convertTxtToJson stores as the "direction"
# Any single vowel letter; Y and W are the placeholders that replaceDigraphs
# substitutes for the digraphs ai and au.
vowels = re.compile(r'[aiuāĩũãīūeoēōYW]')
# Long vowels only (not referenced by any function in the part of the file seen here).
longvowels = re.compile(r'[āīūēō]')
# A light syllable is one that ends in a short vowel (ai and au included).
light = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueoYW])$')
# for the first syllable in a cīr, count ai and au as heavy (not otherwise)
initiallight = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueo])$')
# A syllable ending in short u: the shape scanLine requires for the last
# syllable of a nērpu or niraipu.
cu = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*(u)$')

# Filled by convertTxtToJson(); scanLines() later adds a "scansion" entry to each poem.
poems = []

def replaceDigraphs(string):
    """Return *string* with the digraphs ai and au collapsed to the single
    placeholder letters Y and W, so that each occupies one character."""
    return string.replace("ai","Y").replace("au","W")

def restoreDigraphs(string):
    """Return *string* with the placeholder letters Y and W expanded back
    to the digraphs ai and au (the inverse of replaceDigraphs)."""
    return string.replace("Y","ai").replace("W","au")

def convertTxtToJson(textfile):
    """Read poems from *textfile* and append them to the global ``poems`` list.

    *textfile* is any object with a ``readline()`` method that returns '' at
    end of file.  A line matching ``titleline`` opens a poem and supplies its
    "number", "landscape" and "direction"; a line matching ``authorline``
    supplies the "author" and closes the poem.  Every non-blank line between
    the two is stored, stripped and with full stops removed, in the poem's
    "lines".  Each poem is an OrderedDict with the keys "lines" and
    "metadata".  Nothing is returned.
    """
    global poems
    metadata = collections.OrderedDict()
    lines = []
    # readline() returns '' only at EOF (a blank line is '\n')
    for currentLine in iter(textfile.readline, ''):
        numbermatch = titleline.search(currentLine)
        authormatch = authorline.search(currentLine)
        if numbermatch: # start of a poem
            metadata["number"] = numbermatch.group(1)
            metadata["landscape"] = numbermatch.group(2)
            metadata["direction"] = numbermatch.group(3)
        if authormatch: # end of a poem
            # An author-like line that was never preceded by a title line
            # (e.g. in the file header) is not a poem: storing it would leave
            # an entry without a "number", which scanLines cannot process.
            if "number" in metadata:
                metadata["author"] = authormatch.group(1).replace('.','')
                poem = collections.OrderedDict()
                poem["lines"] = lines
                poem["metadata"] = metadata
                poems.append(poem)
            metadata = collections.OrderedDict()
            lines = []
        elif numbermatch is None and "number" in metadata:
            # a line of verse inside the current poem; blank lines are skipped
            stripped = currentLine.strip()
            if stripped:
                lines.append(stripped.replace('.',''))

def errorMessage(cir,cirnumber,poemnumber,linenumber):
    """Append one numbered error line for an unparseable cīr to the global
    ``log`` and increment the global ``errorCount``.

    The three position arguments must already be strings; *cir* is shown
    with its digraphs restored.
    """
    global log, errorCount
    errorCount += 1
    entry = ''.join([
        'Error ', str(errorCount),
        ': Poem no. ', poemnumber,
        ', line ', linenumber,
        ', cīr no. ', cirnumber,
        ': ', restoreDigraphs(cir),
        '\n',
    ])
    log += entry

# A word-final consonant followed by a vowel-initial word: the consonant is
# moved across the space so that it becomes the onset of the next cīr.
_sandhi = re.compile('([kṅcñṭṇtnpmyrlvḷḻṟṉ]) ([aāiīuūeēoōYW])')

def _makeAcai(kind,quantity,*syllables):
    """Build the record for one acai: its syllables (digraphs restored),
    its technical name and its quantity pattern."""
    return {
        "syllables": [ restoreDigraphs(syllable) for syllable in syllables ],
        "type": kind,
        "quantity": quantity
    }

def _classifyCir(syllables):
    """Divide a cīr of two to five syllables into its two acai.

    Returns ``(acais, ok)``.  ``acais`` is always a two-element list; an
    element that could not be determined is left as an empty dict.  ``ok`` is
    False when the syllables fit none of the patterns below.
    """
    # Weight of every syllable (True = light).  ai and au count as heavy only
    # in the first syllable of a cīr, hence the separate pattern for it.
    isLight = [ bool(initiallight.search(syllables[0])) ]
    isLight += [ bool(light.search(syllable)) for syllable in syllables[1:] ]
    # True where the syllable ends in short u, as the last syllable of a
    # nērpu or niraipu must.  Such a syllable is necessarily light as well.
    endsInU = [ bool(cu.search(syllable)) for syllable in syllables ]

    def weight(position):
        return "L" if isLight[position] else "G"

    # Builders for an acai starting at syllable number `start`.
    def ner(start):
        return _makeAcai("nēr",weight(start),syllables[start])

    def nirai(start):
        return _makeAcai("nirai","L"+weight(start+1),
                         syllables[start],syllables[start+1])

    def nerpu(start):
        return _makeAcai("nērpu","Gu",syllables[start],syllables[start+1])

    def niraipu(start):
        return _makeAcai("niraipu","L"+weight(start+1)+"u",
                         syllables[start],syllables[start+1],syllables[start+2])

    count = len(syllables)
    acais = [ {}, {} ]
    ok = True
    if count == 2:
        # two syllables can only be nēr-nēr
        acais = [ ner(0), ner(1) ]
    elif count == 3:
        # nirai-nēr, nēr-nirai or nēr-nērpu.  nērpu-nēr is identical to
        # nēr-nirai and is always read as the latter.
        if isLight[0]:
            acais = [ nirai(0), ner(2) ]
        else:
            acais[0] = ner(0)
            if isLight[1]:
                acais[1] = nirai(1)
            elif endsInU[2]:
                acais[1] = nerpu(1)
            else:
                ok = False
    elif count == 4:
        # nirai-nirai (LX-LX), nirai-nērpu (LX-Gu), nēr-niraipu (G-LXu) or
        # nērpu-nirai (Gu-LX).  niraipu-nēr is treated as nirai-nirai, so no
        # four-syllable cīr is ever reported as starting with niraipu.
        if isLight[0]:
            acais[0] = nirai(0)
            if isLight[2]:
                acais[1] = nirai(2)
            elif endsInU[3]:
                acais[1] = nerpu(2)
            else:
                ok = False
        elif not isLight[1]:
            # after a heavy first syllable the second must be light
            ok = False
        elif endsInU[3]:
            acais = [ ner(0), niraipu(1) ]
        elif endsInU[1]:
            # the last syllable is heavy, or light but not Cu: nērpu-nirai
            acais[0] = nerpu(0)
            if isLight[2]:
                acais[1] = nirai(2)
            else:
                ok = False
        else:
            ok = False
    elif count == 5:
        # nirai-niraipu or niraipu-nirai; both start with a light syllable
        if not isLight[0]:
            ok = False
        elif endsInU[4]:
            if isLight[2]:
                acais = [ nirai(0), niraipu(2) ]
            else:
                ok = False
        elif endsInU[2]:
            acais[0] = niraipu(0)
            # a nirai must open with a light syllable whatever the weight of
            # the syllable that closes it
            if isLight[3]:
                acais[1] = nirai(3)
            else:
                ok = False
        else:
            ok = False
    return acais, ok

def scanLine(line,poemnumber,linenumber):
    """Scan one line of verse.

    Returns a list of cīrs.  Each cīr is a list of two acai, and each acai is
    a dict holding the "syllables" of the text, the "type" (nēr, nirai, nērpu
    or niraipu) and the "quantity" pattern (L = light, G = heavy, u = final
    Cu syllable).  An acai that could not be determined is an empty dict.

    *poemnumber* and *linenumber* are strings used only in error messages.
    A cīr that fits no pattern is reported through errorMessage() and added
    to the global ``discardedSyllables``; every cīr is added to the global
    ``totalSyllables``.  Cīrs of fewer than two or more than five syllables
    are left out of the result; this works only as long as a cīr has at most
    two acai.
    """
    global totalSyllables
    global discardedSyllables
    line = replaceDigraphs(line)
    # resyllabify across every word boundary in the line, not just the first
    line = _sandhi.sub(r' \1\2',line)
    newcirs = []
    for index, cir in enumerate(line.split(' ')):
        syllables = syllabize(cir)
        count = len(syllables)
        # NOTE(review): both counters grow by count+1 per cīr, so they exceed
        # the true number of syllables by one per cīr -- confirm the intent.
        totalSyllables += count+1
        if count < 2:
            # nothing to divide (this includes the empty string produced by
            # consecutive spaces); skipped without an error message
            continue
        if count > 5:
            # too long for two acai: most likely a mistake in the text
            errorMessage(cir,str(index+1),poemnumber,linenumber)
            discardedSyllables += count+1
            continue
        acais, ok = _classifyCir(syllables)
        if not ok:
            errorMessage(cir,str(index+1),poemnumber,linenumber)
            discardedSyllables += count+1
        newcirs.append(acais)
    return newcirs

def syllabize(string):
    """Return the list of syllables of *string*, one per vowel letter.

    Every character that does not match ``vowels`` is treated as a consonant.
    A syllable takes the single consonant immediately before its vowel as its
    onset (in Tamil there is never more than one) and, as its coda, the
    consonants after the vowel that do not belong to the next syllable:
    V-CV, VC-CV and VCC-C(V) inside the word, VC or VCC at its end.
    """
    syllables = []
    for index, letter in enumerate(string):
        if not vowels.match(letter):
            continue
        # Onset.  The position must be checked explicitly: string[-1] is the
        # LAST letter of the word, so indexing blindly would give a
        # vowel-initial word its own final consonant as an onset.
        onset = ''
        if index > 0 and not vowels.match(string[index-1]):
            onset = string[index-1]
        # Coda.  Look at most three letters ahead and count the consonants
        # that come before the next vowel.
        following = string[index+1:index+4]
        cluster = 0
        while cluster < len(following) and not vowels.match(following[cluster]):
            cluster += 1
        if cluster == len(following):
            # no vowel within reach: up to two consonants close this syllable
            coda = following[:2]
        elif cluster > 0:
            # the last consonant before the next vowel is that vowel's onset
            coda = following[:cluster-1]
        else:
            # the next letter is itself a vowel
            coda = ''
        syllables.append(onset+letter+coda)
    return syllables

def scanLines(poem,index):
    """Scan every line of *poem* and store the result under poem["scansion"].

    *index* is the position of the poem in the global ``poems`` list; the
    scanned poem is written back to that position.  Lines are numbered from 1
    in error messages.
    """
    global poems
    scansion = []
    # The line counter must not be called `index`: it would overwrite the
    # parameter, and the poem would then be stored in the slot of its last
    # line number, on top of some other poem.
    for linenumber, line in enumerate(poem["lines"],1):
        scannedLine = scanLine(line,poem["metadata"]["number"],str(linenumber))
        scansion.append(scannedLine)
    poem["scansion"] = scansion
    poems[index] = poem

def statistics():
    """Return a text report on the scansion of every poem in ``poems``.

    The first acai of a cīr is its strong position and the second its weak
    position.  The report gives, for each position, the number and share of
    nēr-type and nirai-type acai and of each quantity pattern within them;
    the number of light nēr in strong position for the first five cīrs of a
    line; and, as JSON, how often each second-acai quantity follows each
    first-acai quantity.  Only acai that have both a "type" and a "quantity"
    are counted, and the weak position is counted only when the strong one is.
    """
    def emptyTally():
        return {
            "nēr": { "G": 0, "L": 0 },
            "nirai": { "LG": 0, "LL": 0 },
            "nērpu": { "Gu": 0 },
            "niraipu": { "LGu": 0, "LLu": 0 }
        }

    quantities = ( "G", "L", "Gu", "LL", "LG", "LLu", "LGu" )
    strong = emptyTally()
    weak = emptyTally()
    # cirdata[first][second]: cīrs whose acai have these two quantities
    cirdata = { first: { second: 0 for second in quantities } for first in quantities }
    lstrong = { "1": 0, "2": 0, "3": 0, "4": 0, "5": 0 }
    for poem in poems:
        for line in poem["scansion"]:
            for cirno, cir in enumerate(line):
                if len(cir) < 2:
                    # scanLine always produces two acai; anything else is reported
                    print('indexerror for line '+str(line))
                    continue
                first, second = cir[0], cir[1]
                if not ("type" in first and "quantity" in first):
                    continue
                strong[first["type"]][first["quantity"]] += 1
                # only the first five positions are reported; a longer line
                # must not raise a KeyError
                position = str(cirno+1)
                if first["quantity"] == "L" and position in lstrong:
                    lstrong[position] += 1
                if "type" in second and "quantity" in second:
                    weak[second["type"]][second["quantity"]] += 1
                    cirdata[first["quantity"]][second["quantity"]] += 1

    def share(part,whole):
        # percentage as text; an empty category is 0.0% rather than a
        # division by zero
        if not whole:
            return '0.0'
        return str(round((part/whole)*100,3))

    def detail(label,part,whole):
        return '   - '+label+' ('+str(part)+' or '+share(part,whole)+'%)'

    def report(heading,tally):
        # every percentage of a position is relative to that position's own totals
        ner = tally["nēr"]["L"] + tally["nēr"]["G"] + tally["nērpu"]["Gu"]
        nirai = tally["nirai"]["LL"] + tally["nirai"]["LG"] + tally["niraipu"]["LLu"] + tally["niraipu"]["LGu"]
        total = ner + nirai
        return [
            heading,
            '  Nēr (including nērpu): '+str(ner)+' or '+share(ner,total)+'%',
            detail('G',tally["nēr"]["G"],ner),
            detail('L',tally["nēr"]["L"],ner),
            detail('Gu',tally["nērpu"]["Gu"],ner),
            '  Nirai (including niraipu): '+str(nirai)+' or '+share(nirai,total)+'%',
            detail('LL',tally["nirai"]["LL"],nirai),
            detail('LG',tally["nirai"]["LG"],nirai),
            detail('LLu',tally["niraipu"]["LLu"],nirai),
            detail('LGu',tally["niraipu"]["LGu"],nirai)
        ]

    lines = ['']
    lines += report('Split in strong positions:',strong)
    lines += report('Split in weak positions:',weak)
    lines += [
        '',
        'Number of nēr-acai with a L in strong position throughout the line:',
        '  - First cīr: '+str(lstrong["1"]),
        '  - Second cīr: '+str(lstrong["2"]),
        '  - Third cīr: '+str(lstrong["3"]),
        '  - Fourth cīr: '+str(lstrong["4"]),
        '  - Fifth cīr: '+str(lstrong["5"]),
        '',
        'Some conditional probabilities for a cīr:',
        json.dumps(cirdata,indent=4,sort_keys=True,ensure_ascii=False)
    ]
    return '\n'.join(lines)

# Parse the input into the global poems list; the file is not needed afterwards.
convertTxtToJson(f)
f.close()

# Add a "scansion" entry to every poem (this also fills the error log and the counters).
for index, poem in enumerate(poems):
    scanLines(poem,index)


# The log file gets a summary, then the individual errors, then the statistics.
logFile.write("Scanned "+str(len(poems))+" poems, with "+str(totalSyllables)+" syllables.\n")
if discardedSyllables > 1:
    percent = round((discardedSyllables/totalSyllables)*100,3)
    logFile.write('''I was unable to parse '''+str(discardedSyllables)+''' syllables ('''+str(percent)+'''%).
    Possible reasons for failure:
      - The text is incorrect.
      - CVR should be counted as light (the parser counts it as heavy).\n\n''')

logFile.write(log)
logFile.write(statistics())
logFile.close()

# The scanned poems themselves go to the JSON file, which is closed explicitly
# so that the output is flushed to disk.
jsonFile.write(json.dumps(poems, indent=4, sort_keys=True, ensure_ascii=False))
jsonFile.close()
	# -- coding: utf-8 --

	""" Usage: python3 versify.py FILENAME """
	""" Results in FILENAME.log (errors and statistics)
	and FILENAME.json (a json file of metrically parsed text) """

	""" This program expects the text to be in the format
	represented by the GRETIL Kuṟuntokai
	(http://gretil.sub.uni-goettingen.de/gretil/4_drav/tamil/pm/pm110__u.htm)
	namely: the
	poem comes on a line such as:
	103. neytal - talaivi kūṟṟu
	then it is followed by a space, then each of the lines
	of the poem, and then
	-vāyilāṉ tēvaṉār.
	which gives the author. """


	import sys
	import os
	import re
	import json
	import collections

	f = open(sys.argv[1],'r')
	logFile = open(os.path.splitext(sys.argv[1])[0] + '.log','w')
	jsonFile = open(os.path.splitext(sys.argv[1])[0] + '.json','w')
	log = ''
	totalSyllables = 0
	discardedSyllables = 0
	errorCount = 0

	authorline = re.compile(r'^-{1}([^-].*)') # group 1 is the author
	titleline = re.compile(r'(\d+)\. ([^\s]+) - (.*)') # group 1 is the number, 2 is the tiṇai, and 3 is ?
	vowels = re.compile(r'[aiuāĩũãīūeoēōYW]')
	longvowels = re.compile(r'[āīūēō]')
	light = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueoYW])$')
	# for the first syllable in a cīr, count ai and au as heavy (not otherwise)
	initiallight = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*([aiueo])$')
	cu = re.compile(r'([kṅcñṭṇtnpmyrlvḷḻṟṉ])*(u)$')

	poems = []

	def replaceDigraphs(string):
	digraphs = { "ai":"Y", "au":"W" }
	output = string
	for k, v in digraphs.items():
	output = output.replace(k,v)
	return output

	def restoreDigraphs(string):
	digraphs = { "Y":"ai", "W":"au" }
	output = string
	for k, v in digraphs.items():
	output = output.replace(k,v)
	return output

	def convertTxtToJson(textfile):
	global poems
	poem = collections.OrderedDict()
	metadata = collections.OrderedDict()
	lines = []
	while True:
	currentLine = textfile.readline()
	if not currentLine: break # EOF
	else:
	numbermatch = titleline.search(currentLine)
	authormatch = authorline.search(currentLine)
	if numbermatch: # if the reader matches a regex for the number
	metadata["number"] = numbermatch.group(1)
	metadata["landscape"] = numbermatch.group(2)
	metadata["direction"] = numbermatch.group(3)
	if authormatch: # if the reader matches a regex for the title
	metadata["author"] = authormatch.group(1).replace('.','')
	poem["lines"] = lines
	poem["metadata"] = metadata
	poems.append(poem)
	poem = collections.OrderedDict()
	metadata = collections.OrderedDict()
	lines = []
	if authormatch == None and numbermatch == None:
	if "number" in metadata:
	if currentLine.strip():
	lines.append(currentLine.strip().replace('.',''))

	def errorMessage(cir,cirnumber,poemnumber,linenumber):
	global log
	global errorCount
	errorCount += 1
	log = log+'Error '+str(errorCount)+': Poem no. '+poemnumber+', line '+linenumber+', cīr no. '+cirnumber+': '+restoreDigraphs(cir)+'\n'

	# Matches a word-final consonant followed, across a space, by a word-initial vowel.
	_SANDHI = re.compile('([kṅcñṭṇtnpmyrlvḷḻṟṉ]) ([aāiīuūeēoōYW])')

	def _weight(isLight):
		"""Return "L" for a light syllable and "G" for a heavy one."""
		return "L" if isLight else "G"

	def _acai(kind, parts, quantity):
		"""Build one acai record: its syllables (digraphs restored), its type and its quantity."""
		return {
			"syllables": [ restoreDigraphs(part) for part in parts ],
			"type": kind,
			"quantity": quantity
		}

	def _scanTwo(syl, lights):
		"""Scan a two-syllable cīr, which can only be nēr-nēr."""
		return [
			_acai("nēr", syl[0:1], _weight(lights[0])),
			_acai("nēr", syl[1:2], _weight(lights[1]))
		], True

	def _scanThree(syl, lights):
		"""Scan a three-syllable cīr: nirai-nēr, nēr-nirai or nēr-nērpu.

		nērpu-nēr is identical to nēr-nirai, so only the latter is entertained.
		"""
		if lights[0]:
			# a light first syllable must open a nirai; the third syllable is then a nēr
			return [
				_acai("nirai", syl[0:2], "L"+_weight(lights[1])),
				_acai("nēr", syl[2:3], _weight(lights[2]))
			], True
		# a heavy first syllable is taken to be a nēr on its own
		acais = [ _acai("nēr", syl[0:1], "G"), {} ]
		if lights[1]:
			acais[1] = _acai("nirai", syl[1:3], "L"+_weight(lights[2]))
			return acais, True
		# heavy second syllable: only a nērpu (heavy + Cu) can follow the nēr
		if cu.search(syl[2]):
			acais[1] = _acai("nērpu", syl[1:3], "Gu")
			return acais, True
		return acais, False

	def _scanFour(syl, lights):
		"""Scan a four-syllable cīr.

		Possibilities: nirai-nirai (LX-LX), nirai-nērpu (LX-GL),
		nēr-niraipu (G-LXL) and nērpu-nirai (GL-LX).  niraipu-nēr is
		treated as nirai-nirai.
		"""
		acais = [ {}, {} ]
		if lights[0]:
			acais[0] = _acai("nirai", syl[0:2], "L"+_weight(lights[1]))
			if lights[2]:
				acais[1] = _acai("nirai", syl[2:4], "L"+_weight(lights[3]))
				return acais, True
			# heavy third syllable: it ought to open a nērpu
			if cu.search(syl[3]):
				acais[1] = _acai("nērpu", syl[2:4], "Gu")
				return acais, True
			return acais, False
		# heavy first syllable: every remaining reading needs a light second syllable
		if not lights[1]:
			return acais, False
		if lights[3] and cu.search(syl[3]):
			# light final Cu: nēr-niraipu
			acais[0] = _acai("nēr", syl[0:1], "G")
			acais[1] = _acai("niraipu", syl[1:4], "L"+_weight(lights[2])+"u")
			return acais, True
		# otherwise it has to be nērpu-nirai, so the second syllable must be Cu
		if not cu.search(syl[1]):
			return acais, False
		acais[0] = _acai("nērpu", syl[0:2], "Gu")
		if not lights[2]:
			return acais, False
		acais[1] = _acai("nirai", syl[2:4], "L"+_weight(lights[3]))
		return acais, True

	def _scanFive(syl, lights):
		"""Scan a five-syllable cīr: nirai-niraipu or niraipu-nirai."""
		acais = [ {}, {} ]
		# a heavy first syllable cannot open either pattern
		if not lights[0]:
			return acais, False
		if lights[4] and cu.search(syl[4]):
			# the final syllable is Cu: nirai-niraipu, which needs a light third syllable
			if not lights[2]:
				return acais, False
			acais[0] = _acai("nirai", syl[0:2], "L"+_weight(lights[1]))
			acais[1] = _acai("niraipu", syl[2:5], "L"+_weight(lights[3])+"u")
			return acais, True
		# otherwise niraipu-nirai: make sure the third syllable is Cu
		if not cu.search(syl[2]):
			return acais, False
		acais[0] = _acai("niraipu", syl[0:3], "L"+_weight(lights[1])+"u")
		# a nirai must open with a light syllable, whatever the weight of the fifth
		if not lights[3]:
			return acais, False
		acais[1] = _acai("nirai", syl[3:5], "L"+_weight(lights[4]))
		return acais, True

	# Parsers by syllable count; each returns (acais, parsedOk).
	_CIR_PARSERS = { 2: _scanTwo, 3: _scanThree, 4: _scanFour, 5: _scanFive }

	def scanLine(line,poemnumber,linenumber):
		"""Scan one line of verse into cīrs and acais.

		line is split on single spaces into cīrs after "ai"/"au" are replaced
		by their one-letter stand-ins and after every word-final consonant that
		precedes a word-initial vowel is moved onto the following word.

		poemnumber and linenumber are only passed on to errorMessage.

		Returns a list of cīrs.  Each cīr is a list of two acais, and each acai
		is a dictionary with the syllables of the text ("syllables"), the
		technical term ("type") and the prosodic values ("quantity", built
		from L, G and u).  When a cīr cannot be parsed, an error is logged, its
		syllables are added to discardedSyllables, and the acai slots that
		were not resolved stay empty dictionaries.  Cīrs with fewer than two
		syllables are skipped silently; cīrs with six or more are logged as
		errors and left out of the result.

		This will ONLY work if there are a maximum of two acais per cīr.
		"""
		global totalSyllables
		global discardedSyllables
		line = replaceDigraphs(line)
		# resyllabify across every word boundary, not just the first one found
		line = _SANDHI.sub(r' \1\2', line)
		newcirs = []
		for index, cir in enumerate(line.split(' ')):
			syllables = syllabize(cir)
			count = len(syllables)
			# NOTE(review): the extra 1 per cīr is kept from the original counting - confirm intent
			totalSyllables += count+1
			if count < 2:
				continue
			parser = _CIR_PARSERS.get(count)
			if parser is None:
				# six or more syllables cannot be split into two acais
				errorMessage(cir,str(index+1),poemnumber,linenumber)
				discardedSyllables += count+1
				continue
			# for the first syllable in a cīr, ai and au count as heavy (not otherwise)
			lights = [ bool(initiallight.search(syllables[0])) ]
			lights += [ bool(light.search(syllable)) for syllable in syllables[1:] ]
			acais, parsedOk = parser(syllables, lights)
			if not parsedOk:
				errorMessage(cir,str(index+1),poemnumber,linenumber)
				discardedSyllables += count+1
			newcirs.append(acais)
		return newcirs

	def syllabize(string):
		"""Return the list of syllables of string.

		Every vowel is the nucleus of one syllable.  At most ONE preceding
		consonant is taken as onset (enough for Tamil).  The coda is decided
		by looking at up to three following letters:
		  C*V-CV, C*V-V and C*V at the end  -> no coda
		  C*VC-CV and C*VC at the end       -> one coda consonant
		  C*VCC.C(V) and C*VCC at the end   -> two coda consonants
		Any letter that does not match the module-level vowels pattern counts
		as a consonant.
		"""
		syllables = []
		for index, letter in enumerate(string):
			if not vowels.match(letter):
				continue
			# Only look back when there IS a previous letter: string[-1] would
			# silently wrap around to the last letter of the word.
			onset = ''
			if index > 0 and not vowels.match(string[index-1]):
				onset = string[index-1]
			# slices yield '' past the end of the string, which never matches a vowel
			first = string[index+1:index+2]
			second = string[index+2:index+3]
			third = string[index+3:index+4]
			coda = ''
			if first and not vowels.match(first):
				if not second:
					coda = first # C*VC, finished
				elif vowels.match(second):
					coda = '' # C*V-CV, finished
				elif vowels.match(third):
					coda = first # C*VC-CV, finished
				else:
					coda = first+second # C*VCC.C(V) or C*VCC, finished
			syllables.append(onset+letter+coda)
		return syllables

	def scanLines(poem,index):
		"""Scan every line of poem and store the result in the global poems list.

		poem is a dictionary with a "lines" list and a "metadata" dictionary
		holding the poem's "number".  The scansion of each line (see scanLine)
		is collected under poem["scansion"], and the poem is written back to
		poems[index].  Line numbers passed to scanLine start at 1.
		"""
		global poems
		# the line counter gets its own name so that it cannot clobber the
		# poem's position in poems, which is what index stands for
		scansion = [
			scanLine(line,poem["metadata"]["number"],str(linenumber))
			for linenumber, line in enumerate(poem["lines"], start=1)
		]
		poem["scansion"] = scansion
		poems[index] = poem

	def statistics():
		"""Return a text report on the scansion stored in the global poems list.

		The first acai of each cīr is counted as the strong position and the
		second as the weak position.  The report gives, for each position, the
		split between nēr (including nērpu) and nirai (including niraipu) and
		their quantities, the number of light syllables in strong position for
		the first five cīrs of a line, and a table of how often each
		strong-position quantity is followed by each weak-position quantity.
		Acais that were left unparsed (empty dictionaries) are not counted.
		"""
		def asPercent(part, whole):
			# an empty category would otherwise raise ZeroDivisionError
			if not whole:
				return str(0.0)
			return str(round((part/whole)*100,3))

		def emptyCounts():
			return {
				"nēr": { "G": 0, "L": 0 },
				"nirai": { "LG": 0, "LL": 0 },
				"nērpu": { "Gu": 0 },
				"niraipu": { "LGu": 0, "LLu": 0 }
			}

		quantities = [ "G", "L", "Gu", "LL", "LG", "LLu", "LGu" ]
		strong = emptyCounts()
		weak = emptyCounts()
		# cirdata[first][second]: number of cīrs with these two quantities
		cirdata = { first: { second: 0 for second in quantities } for first in quantities }
		lstrong = { "1": 0, "2": 0, "3": 0, "4": 0, "5": 0 }
		for poem in poems:
			for line in poem["scansion"]:
				for cirno, cir in enumerate(line):
					if len(cir) < 1:
						print('indexerror for line '+str(line))
						continue
					first = cir[0]
					firstParsed = "type" in first and "quantity" in first
					if firstParsed:
						strong[first["type"]][first["quantity"]] += 1
						# only the first five cīrs of a line are reported
						if first["quantity"] == "L" and str(cirno+1) in lstrong:
							lstrong[str(cirno+1)] += 1
					if len(cir) < 2:
						print('indexerror for line '+str(line))
						continue
					second = cir[1]
					if "type" in second and "quantity" in second:
						weak[second["type"]][second["quantity"]] += 1
						if firstParsed:
							cirdata[first["quantity"]][second["quantity"]] += 1
		totalstrongner = strong["nēr"]["L"] + strong["nēr"]["G"] + strong["nērpu"]["Gu"]
		totalstrongnirai = strong["nirai"]["LL"] + strong["nirai"]["LG"] + strong["niraipu"]["LLu"] + strong["niraipu"]["LGu"]
		totalstrong = totalstrongner + totalstrongnirai
		totalweakner = weak["nēr"]["L"] + weak["nēr"]["G"] + weak["nērpu"]["Gu"]
		totalweaknirai = weak["nirai"]["LL"] + weak["nirai"]["LG"] + weak["niraipu"]["LLu"] + weak["niraipu"]["LGu"]
		totalweak = totalweakner + totalweaknirai
		return '''
	Split in strong positions:
	Nēr (including nērpu): '''+str(totalstrongner)+''' or '''+asPercent(totalstrongner,totalstrong)+'''%
	- G ('''+str(strong["nēr"]["G"])+''' or '''+asPercent(strong["nēr"]["G"],totalstrongner)+'''%)
	- L ('''+str(strong["nēr"]["L"])+''' or '''+asPercent(strong["nēr"]["L"],totalstrongner)+'''%)
	- Gu ('''+str(strong["nērpu"]["Gu"])+''' or '''+asPercent(strong["nērpu"]["Gu"],totalstrongner)+'''%)
	Nirai (including niraipu): '''+str(totalstrongnirai)+''' or '''+asPercent(totalstrongnirai,totalstrong)+'''%
	- LL ('''+str(strong["nirai"]["LL"])+''' or '''+asPercent(strong["nirai"]["LL"],totalstrongnirai)+'''%)
	- LG ('''+str(strong["nirai"]["LG"])+''' or '''+asPercent(strong["nirai"]["LG"],totalstrongnirai)+'''%)
	- LLu ('''+str(strong["niraipu"]["LLu"])+''' or '''+asPercent(strong["niraipu"]["LLu"],totalstrongnirai)+'''%)
	- LGu ('''+str(strong["niraipu"]["LGu"])+''' or '''+asPercent(strong["niraipu"]["LGu"],totalstrongnirai)+'''%)
	Split in weak positions:
	Nēr (including nērpu): '''+str(totalweakner)+''' or '''+asPercent(totalweakner,totalweak)+'''%
	- G ('''+str(weak["nēr"]["G"])+''' or '''+asPercent(weak["nēr"]["G"],totalweakner)+'''%)
	- L ('''+str(weak["nēr"]["L"])+''' or '''+asPercent(weak["nēr"]["L"],totalweakner)+'''%)
	- Gu ('''+str(weak["nērpu"]["Gu"])+''' or '''+asPercent(weak["nērpu"]["Gu"],totalweakner)+'''%)
	Nirai (including niraipu): '''+str(totalweaknirai)+''' or '''+asPercent(totalweaknirai,totalweak)+'''%
	- LL ('''+str(weak["nirai"]["LL"])+''' or '''+asPercent(weak["nirai"]["LL"],totalweaknirai)+'''%)
	- LG ('''+str(weak["nirai"]["LG"])+''' or '''+asPercent(weak["nirai"]["LG"],totalweaknirai)+'''%)
	- LLu ('''+str(weak["niraipu"]["LLu"])+''' or '''+asPercent(weak["niraipu"]["LLu"],totalweaknirai)+'''%)
	- LGu ('''+str(weak["niraipu"]["LGu"])+''' or '''+asPercent(weak["niraipu"]["LGu"],totalweaknirai)+'''%)

	Number of nēr-acai with a L in strong position throughout the line:
	- First cīr: '''+str(lstrong["1"])+'''
	- Second cīr: '''+str(lstrong["2"])+'''
	- Third cīr: '''+str(lstrong["3"])+'''
	- Fourth cīr: '''+str(lstrong["4"])+'''
	- Fifth cīr: '''+str(lstrong["5"])+'''

	Some conditional probabilities for a cīr:
	'''+json.dumps(cirdata,indent=4,sort_keys=True,ensure_ascii=False)

	# Read the poems, scan them, then write the log and the json output.
	convertTxtToJson(f)
	# the input has been read completely at this point
	f.close()

	for index, poem in enumerate(poems):
		scanLines(poem,index)


	logFile.write("Scanned "+str(len(poems))+" poems, with "+str(totalSyllables)+" syllables.\n")
	if discardedSyllables > 1:
		percent = round((discardedSyllables/totalSyllables)*100,3)
		logFile.write('''I was unable to parse '''+str(discardedSyllables)+''' syllables ('''+str(percent)+'''%).
	Possible reasons for failure:
	- The text is incorrect.
	- CVR should be counted as light (the parser counts it as heavy).\n\n''')

	# error messages collected during scanning, followed by the statistics
	logFile.write(log)
	logFile.write(statistics())
	logFile.close()

	jsonFile.write(json.dumps(poems, indent=4, sort_keys=True, ensure_ascii=False))
	# close explicitly so that the json output is flushed to disk
	jsonFile.close()