raffazizzi/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Integrating Digital Papyrology (IDP) was an international project that joined together several databases of (mainly) Greek papyri and converted them to EpiDoc, a fully conformant TEI P5 flavour. It formed the basis of the editorial project papyri.info.
The source databases differed not only in the way the data was structured, but also in how they represented the ancient Greek alphabet and editorial interventions. The project team created a number of tools (I contributed to the development of some of them), and I wrote the Python script below to "run" all the programs according to parametrized requirements.
Some of the programs called are XSLTs, which are passed to one Saxon instance through a socket (instead of opening and closing Saxon at every transformation).

  
## runner.py
# -*- coding: utf-8 -*-  #
#
# File: runner.py
#
# Runner for the IDP process
#
# Copyright (C) 2008 by Raffaele Viglianti
# and Centre for Computing in the Humanities, King's College, London.
# Additional contributors' copyright may be designated in individual source files.
# Additional contribution of code to this file by Tom Elliott,  copyright (c)
# 2008 by New York University)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# Information about the EpiDoc community can be obtained via
# http://epidoc.sf.net.

""" runner.py

threads together all the processes necessary to convert ddbdp texts from their legacy
formats to epidoc; for instructions on running this script, type:

    python runner.py help

note: you may need to first type:

    export PYTHONPATH=:$PWD/crosswalker/DDbDP_to_EpiDoc/chetc-py

from the directory containing runner.py (i.e., the epiduke code directory).
"""

import os, os.path, sys, codecs, re, time, datetime, shutil, filecmp, subprocess, socket
import logging as l
# Non-standard modules:
# NB the environment variables for non-standard modules are set by runner.sh
import chetwrap

# Banner template for the log lines that mark the start and the end of each
# step; the single %s receives the step message.
LHEAD = "################### %s ###################"

# --------------------
# SECTION 1: Functions
# --------------------
# Tools shared by several modules


def walk(inDir, outDir, xslt, level, saxon, outForm, sock):
    ''' walks recursively through a folder and calls do() on each file,
    excluding .svn entries. With XSLT.

    All transformations go through one Saxon listener reached over a
    socket. The outermost call (sock is None) launches the listener on
    PORT, connects to it and, once every file has been queued, polls it
    with DONE? until it answers DONE, then sends FINISHED and waits for
    the process to exit. Recursive calls reuse the socket they are given.

    inDir   -- folder to read; one trailing slash or backslash is dropped
    outDir  -- folder to write to; created when missing
    xslt    -- stylesheet handed to the listener through --xsl
    level   -- how many levels to go up from the current directory to
               find lib/epiduke-saxon.jar
    saxon   -- value handed to the listener through --version
    outForm -- 'html' swaps the last four characters of each file name
               (normally '.xml') for '.html'; anything else keeps the name
    sock    -- connected socket to reuse, or None to start a listener

    Raises RuntimeError when the listener exits before a connection to it
    could be made.
    '''

    owns_listener = sock is None
    listener = None

    if owns_listener:
        epidukesax = '../' * level + 'lib/epiduke-saxon.jar'
        port = PORT
        cmd = ['java', '-jar', '-Xms256m', '-Xmx1536m', epidukesax,
               '--xsl', xslt,
               '--port', '%d' % port,
               '--version', saxon]
        listener = subprocess.Popen(cmd)
        time.sleep(2)  #wait a couple of seconds to give the listener time to fire up
        host = '127.0.0.1'
        mysock = None
        while mysock is None:
            candidate = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                status = candidate.connect_ex((host, port))
            except socket.error as err:
                print(err)
                status = -1
            if status == 0:
                mysock = candidate
            else:
                # a socket that failed to connect is closed before the
                # next attempt so that retries do not pile up descriptors
                candidate.close()
                if listener.poll() is not None:
                    raise RuntimeError(
                        'the Saxon listener exited with status %s before '
                        'accepting a connection on port %d'
                        % (listener.returncode, port))
                time.sleep(2)
        mysock.sendall("PING\n")
        res = mysock.recv(8).rstrip()
        print('ping response: ' + res)
    else:
        mysock = sock

    if not os.path.isdir(outDir):
        os.mkdir(outDir, 0o776)

    if inDir.endswith(('/', '\\')):
        inDir = inDir[:-1]

    if outDir.endswith(('/', '\\')):
        outDir = outDir[:-1]

    for f in os.listdir(inDir):
        # Subversion bookkeeping is neither transformed nor mirrored
        if '.svn' in f:
            continue

        source = inDir + '/' + f
        if os.path.isdir(source):
            walk(source, outDir + '/' + f, xslt, level, saxon, outForm, mysock)
            continue

        ## If the output is html, changes file extension from .xml to .html
        if outForm == 'html':
            target = f[0:-4] + '.html'
        else:
            target = f
        do(source, outDir + '/' + target, outDir, xslt, level, saxon, mysock)

    if owns_listener:
        while True:
            mysock.sendall("DONE?\n")
            done = mysock.recv(128).rstrip()
            print(done)
            if done == 'DONE':
                break
            time.sleep(2)
        mysock.sendall("FINISHED\n")
        # Popen.wait() works on every platform, unlike os.waitpid()
        listener.wait()
        mysock.close()
## ---

def do(inFile, outFile, outDir, xslt, level, saxon, mysock):
    ''' asks the Saxon listener to apply its xslt to one file.

    A single "<inFile> <outFile>" line is written to mysock; the
    stylesheet itself was given to the listener when it was started.

    inFile  -- path of the file to transform
    outFile -- path the result is to be written to
    mysock  -- socket connected to the listener
    outDir, xslt, level, saxon -- not used here; kept so that existing
                                  callers do not have to change
    '''

    l.debug("sending transform call %s %s" % (inFile, outFile))
    # NOTE(review): the two paths are separated by one space, so a path
    # containing a space presumably cannot be told apart by the listener
    # -- confirm against the listener's parser.
    mysock.sendall("%s %s\n" % (inFile, outFile))


## ---

def walk_dt(inDir, switch):
    ''' walks recursively through a folder and calls dt() on each file,
    excluding .svn entries. No output folder or XSLT is passed through,
    only the on / off switch.

    inDir  -- folder to process; one trailing slash or backslash is dropped
    switch -- handed unchanged to dt()
    '''

    if inDir.endswith(('/', '\\')):
        inDir = inDir[:-1]

    for f in os.listdir(inDir):
        # Subversion bookkeeping must never be rewritten
        if '.svn' in f:
            continue

        path = inDir + '/' + f
        if os.path.isdir(path):
            walk_dt(path, switch)
        else:
            dt(path, switch)

## ---

def _dt_trim(payload):
    ''' drops whitespace that trails the final double quote of a
    stylesheet declaration; any other text is returned unchanged '''
    return re.sub(r'(")\s*$', r'\1', payload)


def _dt_rewrite(content, pattern, opening, closing, trim=False):
    ''' rewrites every match of pattern as opening + captured text +
    closing. The pattern is compiled with re.S so that declarations
    spanning several lines are handled too. '''

    def swap(match):
        payload = match.group(1)
        if trim:
            payload = _dt_trim(payload)
        return opening + payload + closing

    # a function is used as replacement so that backslashes in the
    # captured text are copied literally
    return re.compile(pattern, re.S).sub(swap, content)


def dt(inFile, switch):
    ''' comments out dtd, xml and stylesheet declarations or switches
    them back on. The file is read and rewritten in place as UTF-8.

    inFile -- path of the file to rewrite
    switch -- 'off' turns the XML declaration, the xml-stylesheet
              instruction and the DOCTYPE into comments; 'on' turns those
              comments back into declarations; any other value leaves
              the content unchanged

    NB the DOCTYPE pattern stops at the first '>', so a DOCTYPE with an
    internal subset is only partly matched.
    '''

    infileobj = codecs.open(inFile, 'r', encoding='utf8')
    try:
        content = infileobj.read()
    finally:
        infileobj.close()

    if switch == 'off':
        #Find the stylesheet and XML tags, then comment them out.
        content = _dt_rewrite(content, r'<\?\s*xml (.*?)\?>',
                              '<!--xml ', '-->')
        content = _dt_rewrite(content, r'<\?xml-stylesheet (.*?)\?>',
                              '<!--xml-stylesheet ', '-->', True)
        #Turns off the DTD
        content = _dt_rewrite(content, r'<!DOCTYPE (.*?)>',
                              '<!-- DOCTYPE ', ' -->')

    elif switch == 'on':
        #Turns on the DTD
        content = _dt_rewrite(content, r'<!-- DOCTYPE (.*?) -->',
                              '<!DOCTYPE ', '>')
        #Finds the commented out stylesheet and XML tags, replace
        #them with active ones.
        content = _dt_rewrite(content, r'<!--xml (.*?)-->',
                              '<?xml ', '?>')
        content = _dt_rewrite(content, r'<!--xml-stylesheet (.*?)-->',
                              '<?xml-stylesheet ', '?>', True)

    outfileobj = codecs.open(inFile, "w", encoding='utf8')
    try:
        outfileobj.write(content)
    finally:
        outfileobj.close()
## ---

def copy_tree(src, dst):
    ''' copy_tree() copies recursively a directory rooted at src ignoring
    svn folders. NB: Uses shutil.copyfile(), so only file contents are
    copied.

    src -- folder to copy from; one trailing slash or backslash is dropped
    dst -- folder to copy into; created when missing (its parent must
           already exist)
    '''

    if not os.path.isdir(dst):
        os.mkdir(dst, 0o777)

    if src.endswith(('/', '\\')):
        src = src[:-1]

    if dst.endswith(('/', '\\')):
        dst = dst[:-1]

    for f in os.listdir(src):
        source = src + '/' + f
        target = dst + '/' + f

        if not os.path.isdir(source):
            shutil.copyfile(source, target)
        elif '.svn' not in f:
            copy_tree(source, target)

## ---

# ------------------
# Section 2: MODULES
# ------------------
# Every call to each part of the process is executed by the following functions


# --- XSLT pre transcoder
# --- (step 0)
#
# Status: NOT IN USE
# Date: 12/05/08
# Change system calls.
# ---
def mod_0_Xtrans(pck):
    ''' runs the identity-transform stylesheet over the TEI file of one
    collection with Saxon and writes the result under osx/xslt/output.

    pck -- collection name, as found in the ddbdp.<pck>.xml file name
    '''

    l.info(LHEAD % 'Pre Transcoder starting')

    source = ''.join(("../data/DDB_TEI_XML/ddbdp.", pck, ".xml"))
    target = ''.join(("osx/xslt/output/ddbdp.", pck, ".xml"))
    command = ''.join((
        'java -jar -Xmx1023m lib/saxon.jar -o ', target,
        ' ', source,
        ' osx/xslt/identity-transform.xsl',
    ))
    os.system(command)

    l.info(LHEAD % 'Pre Transcoder complete')

# ---

# --- Transcoder
# --- (step 1)
#
# Status: JAVA CALL
# Date: 12/05/08
# ---
def mod_1_trans(pck):
    ''' step 1: runs the Java transcoder over the TEI file of one
    collection.

    pck -- collection name, or 'test' to use the files under
           ../data/tests

    A non-zero exit status of the transcoder is logged as an error; it
    does not stop the process.
    '''

    l.info(LHEAD % 'Transcoder starting')

    # NOTE(review): the original declared the encodings "BetaCode" and
    # "UnicodeC" here but never handed them to the command below, which
    # therefore presumably relies on the TransCoder defaults -- confirm.

    if pck == 'test':
        trans_input = "../data/tests/test-input/ddbdp.p.test.xml"
        trans_output = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        if not os.path.isdir('../data/tests/step-01-trans'):
            os.mkdir('../data/tests/step-01-trans', 0o776)
    else:
        trans_input = "../data/DDB_TEI_XML/ddbdp."+pck+".xml"
        trans_output = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"

    status = os.system("java -cp lib/transcoder.jar:lib/xercesImpl.jar:lib/xml-apis.jar:lib/serializer.jar:lib/xalan.jar edu.unc.epidoc.transcoder.TransCoder -s "+trans_input+" -o "+trans_output)
    if status != 0:
        l.error("transcoder returned status %s for %s" % (status, trans_input))

    l.info(LHEAD % 'Transcoder complete')
# ---

# --- Splitter
# --- (step 2)
#
# Status: JAVA CALL
# Date: 12/05/08
# ---
def mod_2_split(pck):
    ''' step 2: runs split.xsl over the transcoded file of one collection
    with Saxon.

    pck -- collection name, or 'test' to use the files under
           ../data/tests

    Saxon's stderr is redirected to report_split_<pck>.txt in the current
    directory. A non-zero exit status is logged as an error; it does not
    stop the process.
    '''

    l.info(LHEAD % 'Splitter starting')

    if pck == 'test':
        split_dir = '../data/tests/step-02-split'
        split_input = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        split_output = "../data/tests/step-02-split/ddbdp.p.test.xml"
        split_param = "ddbdp.p.test"
    else:
        split_dir = 'crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/'+pck
        split_input = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"
        split_output = "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/ddbdp."+pck+".xml"
        split_param = "ddbdp."+pck

    if not os.path.isdir(split_dir):
        os.mkdir(split_dir, 0o776)

    status = os.system('java -jar -Xmx1023m lib/saxon9.jar -t -w1 -o '+split_output+' '+split_input+' crosswalker/DDbDP_to_EpiDoc/split/split.xsl "filename='+split_param+'" 2> report_split_'+pck+'.txt')
    if status != 0:
        l.error("splitter returned status %s for %s (see report_split_%s.txt)" % (status, split_input, pck))

    l.info(LHEAD % 'Splitter complete')
# ---

# --- Chetc
# --- (step 3)
#
# Status: OK
# Date: 12/05/08
# ---
def _publish_reports(pck):
    ''' copies the reports of one collection to the dated public folder
    /var/www/epiduke/reports/<curDate>/, replacing older copies. When that
    folder is missing and cannot be created, an error is logged for every
    report that could not be published. '''

    public_location = '/var/www/epiduke/reports/'+curDate+'/'
    if not os.path.isdir(public_location):
        try:
            os.mkdir(public_location, 0o776)
        except OSError as err:
            l.warning("could not create %s: %s" % (public_location, err))

    do_public = os.path.isdir(public_location)

    reports = [
        # NOTE(review): mod_2_split writes report_split_<pck>.txt into the
        # directory it is run from, taken here to be BASE_DIR -- confirm.
        os.path.join(BASE_DIR, 'report_split_'+pck+'.txt'),
        #Safety Net report
        'report_'+pck+'.txt',
        #After Safety Net report
        'report_afterSafety_'+pck+'.txt',
    ]

    for report in reports:
        name = os.path.basename(report)
        if do_public:
            if os.path.isfile(public_location+name):
                os.unlink(public_location+name)
            if os.path.isfile(report):
                shutil.copyfile(report, public_location+name)
        elif os.path.isfile(report):
            l.error("no public_location (%s), so could not copy %s" % (public_location, name))


def mod_3_chetc(pck):
    ''' step 3: runs the CHET-C converter (chetwrap.walk) over the split
    files of one collection, then publishes the reports.

    pck -- collection name, or 'test' to use the files under data/tests

    The work is done from inside crosswalker/DDbDP_to_EpiDoc/chetc-py; the
    current directory is set back to BASE_DIR afterwards, also when a step
    fails.
    '''

    l.info(LHEAD % 'CHET-C starting')
    os.chdir("crosswalker/DDbDP_to_EpiDoc/chetc-py") # changes directory to chet-c

    try:
        if pck == 'test':
            chetc_input = "../../../../data/tests/step-02-split/p.test"
            chetc_output = "../../../../data/tests/step-03-chetc/p.test"
            chetc_dir = '../../../../data/tests/step-03-chetc'
        else:
            chetc_input = "../split/01-transcoded_xml_id/"+pck
            chetc_output = "output/"+pck
            chetc_dir = 'output/'+pck

        if not os.path.isdir(chetc_dir):
            os.mkdir(chetc_dir, 0o776)

        chetwrap.walk(chetc_input, chetc_output, 'ddbdp', pck)

        #Reports handling
        _publish_reports(pck)
    finally:
        # changes directory back to this program's root
        os.chdir(BASE_DIR)

    l.info(LHEAD % 'CHET-C complete')
# ---

# --- XSLT cleanup
# --- (step 4)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_4_cleanup(pck, g):
    ''' step 4: applies xslt_cleanup_s01.xsl to the CHET-C output of one
    collection.

    pck -- collection name, or 'test' to use the files under data/tests
    g   -- 'y' when the Greek number converter runs afterwards: the
           result then goes to an intermediate folder
           (xslt_cleanup/output/<pck>, or step-04-cleanup for the test
           collection) instead of the final EpiDoc folder

    The work is done from inside crosswalker/DDbDP_to_EpiDoc/xslt_cleanup;
    the current directory is set back to BASE_DIR afterwards, also when
    the transformation fails.
    '''

    def ensure_dir(path):
        if not os.path.isdir(path):
            os.mkdir(path, 0o776)

    l.info(LHEAD % 'XSLT cleanup starting')

    os.chdir("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup") # changes directory to xslt-cleanup

    try:
        if pck == 'test':
            clean_input = "../../../../data/tests/step-03-chetc/p.test/"
            if g == 'y':
                ensure_dir('../../../../data/tests/step-04-cleanup')
                clean_output = "../../../../data/tests/step-04-cleanup/p.test"
            else:
                clean_output = "../../../../data/tests/test-output/p.test"
            # created in both cases
            ensure_dir('../../../../data/tests/test-output')
        else:
            clean_input = "../chetc-py/output/"+pck+"/"
            if g == 'y':
                clean_output = "output/"+pck+"/"
                ensure_dir("output/")
                ensure_dir("output/"+pck+"/")
            else:
                clean_output = "../../../../data/DDB_EpiDoc_XML/"+pck+"/"

        walk(clean_input, clean_output, 'xslt_cleanup_s01.xsl', 3, 'xsl2', 'xml', None)
    finally:
        os.chdir(BASE_DIR) # changes directory back to this program's root

    l.info(LHEAD % 'XSLT cleanup complete')
# ---

# --- Greek Number Converter (GNC)
# --- (step 5)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_5_gnc(pck):
    ''' step 5: applies grc_num_converter.xsl to the cleanup output of one
    collection. The declarations of the input files are commented out
    with walk_dt() before the transformation and switched back on after
    it.

    pck -- collection name, or 'test' to use the files under data/tests
    '''

    l.info(LHEAD % 'Greek number converter starting')

    # everything below is relative to the GNC folder
    os.chdir('greek_num_converter')

    if pck != 'test':
        source = "../crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck+"/"
        target = "../../data/DDB_EpiDoc_XML/"+pck+"/"
    else:
        source = "../../data/tests/step-04-cleanup/p.test/"
        target = "../../data/tests/test-output/p.test/"

    walk_dt(source, 'off')
    walk(source, target, 'grc_num_converter.xsl', 1, 'xsl2', 'xml', None)
    walk_dt(source, 'on')

    # back to this program's root
    os.chdir(BASE_DIR)

    l.info(LHEAD % 'Greek Number Converter complete')

# --- HTML transformation and publication handling
# --- (step 6)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_6_html(pck, h):
    ''' step 6: produces the HTML edition of one collection and, on
    request, copies HTML and XML to the web folders.

    pck -- collection name, or 'test' to use the files under data/tests
    h   -- 'y'  runs the HTML transformation (start-edition.xsl)
           'yw' runs the transformation and then copies the results to
                /var/www/epiduke/<curDate>/ and, for a real collection,
                to /usr/local/epiduke/data/DDB_EpiDoc_XML/

    The current directory is set back to BASE_DIR at the end, also when a
    step fails.
    '''

    l.info(LHEAD %  'Publication starting')

    is_test = (pck == 'test')

    if is_test:
        shutil.rmtree('../data/tests/test-html', True)
    elif h == 'y':
        shutil.rmtree("../data/tests/run-html/"+pck, True)

    try:
        if h.startswith('y'):

            l.info ('starting HTML transformations')

            os.chdir('epidoc_xslt') # changes directory to epidoc_xslt

            if is_test:
                html_input = "../../data/tests/test-output/p.test/"
                html_output = "../../data/tests/test-html/p.test/"
                if not os.path.isdir('../../data/tests/test-html'):
                    os.mkdir('../../data/tests/test-html', 0o776)
            else:
                html_input = "../../data/DDB_EpiDoc_XML/"+pck+"/"
                html_output = "../../data/tests/run-html/"+pck+"/"

            walk(html_input, html_output, 'start-edition.xsl', 1, 'xsl1', 'html', None)

            l.info('HTML transformations complete')

        if h == 'yw':

            l.info ('file copy process starting')

            dateDir = '/var/www/epiduke/'+curDate
            htmlDir = dateDir+'/html'
            xmlDir = dateDir+'/xml'
            for folder in (dateDir, htmlDir, xmlDir):
                if not os.path.isdir(folder):
                    os.mkdir(folder, 0o776)

            if is_test:
                # the test collection lives in its own folders; the
                # per-collection folders used below are not produced for it
                copy_tree('../../data/tests/test-html', htmlDir)
                copy_tree('../../data/tests/test-output', xmlDir)
            else:
                copy_tree('../../data/tests/run-html/'+pck+'/', htmlDir+'/'+pck+'/')
                copy_tree('../../data/DDB_EpiDoc_XML/'+pck+'/', xmlDir+'/'+pck+'/')

                #copies collection into batch5 and data for whole process running
                shutil.rmtree("/usr/local/epiduke/data/DDB_EpiDoc_XML/"+pck, True)
                copy_tree('../../data/DDB_EpiDoc_XML/'+pck+'/', "/usr/local/epiduke/data/DDB_EpiDoc_XML/"+pck+"/")

            l.info ('file copy process complete')
            # Copy HTML translation into the current dated directory
            copy_tree('/usr/local/epiduke/data/tests/run-html/trans/', htmlDir+"/trans/")
    finally:
        os.chdir(BASE_DIR) # changes directory back to this program's root

    l.info (LHEAD % 'Publication complete')
# ---


# ------------------------
# Section 3: MAIN FUNCTION
# ------------------------
# Calls all the modules requested

def _unlink_or_warn(path):
    ''' removes a single file; a failure is logged as a warning instead
    of being raised '''
    try:
        os.unlink(path)
    except OSError:
        l.warning("failed: os.unlink('%s')" % path)


def run(pck, to, fro, mods, keep):
    ''' runs the requested modules, in pipeline order, for one collection.

    pck  -- collection name, or 'test'
    to   -- 1 to run every module up to and including mods[0]
    fro  -- 1 to run mods[0] and every module after it
    mods -- list of module names, or ['all']; with to or fro only the
            first entry is used
    keep -- 0 to remove the intermediate files at the end

    Before a module runs, the output of its previous run is removed.
    '''

    if keep == 0:
        l.info('Intermediate files will be removed at the end of the process')

    is_test = (pck == 'test')
    transcoded = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"

    #Modules, in the order in which they run
    order = ["transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw"]

    #Determine active modules and activates them.
    active = set()
    first = mods[0]
    # "Up to"
    if to == 1 and first in order:
        active.update(order[:order.index(first) + 1])
    # "From"
    if fro == 1:
        if first in order:
            active.update(order[order.index(first):])
    # One or more
    elif first == 'all':
        active.update(order)
    else:
        active.update(m for m in mods if m in order)

    #If htmlw is on, html is not included in the process
    #(htmlw runs the HTML transformation itself)
    if 'htmlw' in active:
        active.discard('html')

    #Run modules

    #TRANSCODER
    if 'transcoder' in active:
        if is_test:
            shutil.rmtree("../data/tests/step-01-trans", True)
        else:
            _unlink_or_warn(transcoded)

        mod_1_trans(pck)

    #SPLITTER
    if 'splitter' in active:
        if is_test:
            shutil.rmtree("../data/tests/step-02-split", True)
        else:
            shutil.rmtree("crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/"+pck, True)

        mod_2_split(pck)

    #CHETC
    if 'chetc' in active:
        if is_test:
            shutil.rmtree("../data/tests/step-03-chetc", True)
        else:
            shutil.rmtree("crosswalker/DDbDP_to_EpiDoc/chetc-py/output/"+pck, True)

        mod_3_chetc(pck)

    #CLEANUP
    if 'cleanup' in active:
        if is_test:
            shutil.rmtree("../data/tests/test-output", True)
            shutil.rmtree("../data/tests/step-04-cleanup", True) #used when GNC is on
        else:
            shutil.rmtree("../data/DDB_EpiDoc_XML/"+pck, True)
        #if gnc is on...
        if 'gnc' in active:
            g = 'y'
            shutil.rmtree("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck, True)
        else:
            g = 'n'

        mod_4_cleanup(pck, g)

    #GNC
    if 'gnc' in active:
        mod_5_gnc(pck)

    #HTML
    if 'html' in active:
        if is_test:
            shutil.rmtree("../data/tests/test-html", True)
        else:
            shutil.rmtree("../data/tests/run-html/"+pck, True)

        mod_6_html(pck, 'y')

    #HTMLW
    if 'htmlw' in active:
        if is_test:
            shutil.rmtree("../data/tests/test-html", True)

        mod_6_html(pck, 'yw')

    # If intermediate files must be removed...
    if keep == 0:
        if is_test:
            leftovers = [
                "../data/tests/step-01-trans",
                "../data/tests/step-02-split",
                "../data/tests/step-03-chetc",
                "../data/tests/test-html",
            ]
        else:
            # each file gets its own attempt, so that a missing one does
            # not prevent the other from being removed
            _unlink_or_warn(transcoded)
            _unlink_or_warn("osx/xslt/output/ddbdp."+pck+".xml")
            leftovers = [
                "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/"+pck,
                "crosswalker/DDbDP_to_EpiDoc/chetc-py/output/"+pck,
                "../data/tests/run-html/"+pck,
            ]
        for folder in leftovers:
            shutil.rmtree(folder, True)

    #Permissions and group ownership are handled by runner.sh (RV 14/05/08)

    l.info (LHEAD % ('Runner complete for collection: ' + pck))

# ---
# ---

# ---------------------
# Section 0: runner.py
# ---------------------
# Manages the options and calls run()
# To do:
# - Subversion?
# - Log

if __name__ == "__main__":

    #configure logger
    l.basicConfig(level=l.DEBUG)

    #BASE DIRECTORY
    BASE_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
    l.info ("BASE_DIR is %s" % BASE_DIR)

    #USAGE
    usage = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
    Specify one collection or a list of collections separated by commas.\n
    Specify 'to' and ONE module to run the process up to that module
    or specify a list of modules separated by commas.\n
    Specify -r to remove intermediate files at the end of the process\n
    State the port, it must begin with '999' and be a number\n
    python runner.py [help | h | ?] of for further information\n\n
    '''

    #HELP
    help_text = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
    Specify one collection or a list of collections separated by commas.\n
    Specify 'to' and ONE module to run the process up to that module
    or specify a list of modules separated by commas.\n
    Specify -r to remove intermediate files at the end of the process\n
    State the port, it must begin with '999' and be a number\n
    python runner.py [help | h | ?] of for this help\n
    Available modules:\n
    transcoder    Runs the transcoder (Java)
    splitter      Runs splitter (XLST)
    chetc         Runs CHET-C converter (Python)
    cleanup       Runs CHET-C cleanup (XSLT)
    gnc           Runs Greek Number Converter (XSLT)
    html          Runs the HTML output (XSLT)
    htmlw         Runs the HTML output and copies XML\n
    To run one or more modules:
    python runner.py [pck[,pck] | all] [mod[,mod]] [port]\n
    To run the process up to a module:
    python runner.py [pck[,pck] | all] to [mod] [port]\n
    To run the process starting from a module:
    python runner.py [pck[,pck] | all] from [mod] [port]\n
    To run the complete process:
    python runner.py [pck[,pck] | all] all [port]\n
    '''

    # Checking arguments...
    # The number of arguments is looked at before sys.argv[1] is read, and
    # help is answered before the data folder is inspected, so that both
    # work on a bare checkout.
    argc = len(sys.argv)

    if argc > 1 and sys.argv[1] in ('help', 'h', '?'):
        print(help_text)
        sys.exit()

    # the shortest valid call is: runner.py col mod port
    if argc > 6 or argc < 4:
        print(usage)
        sys.exit(1)

    #INPUT:
    root_input = '../data/DDB_TEI_XML/'

    #COLLECTIONS LIST
    pck_list = []
    for root, dirs, files in os.walk(root_input):
        for f in files:
            m = re.search(r'^ddbdp\.(.*?)\.xml$', f)
            if m:
                pck_list.append(m.group(1))

    if not pck_list:
        l.critical("Runner cannot find the collections.\nPlease check that the collections are in %s" % root_input)
        sys.exit(1)

    #MODULES DICTIONARY
    mod_list = [ "transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw" ]

    #CURRENT DATE
    curDate = str(datetime.date.today())

    #(PORT) Managing the last argument
    PORT = sys.argv[-1]
    if not (PORT.startswith('999') and PORT.isdigit() and int(PORT) <= 65535):
        print(usage)
        sys.exit(1)

    PORT = int(PORT)

    #(COLLECTIONS) Managing the first argument
    COLLECTIONS = sys.argv[1]
    given_col_list = COLLECTIONS.split(',')

    #Checking spelling
    l.info('Checking Collections...')
    for g in given_col_list:
        if g == 'all':
            if len(given_col_list) == 1:
                l.info('Runner is about to process all the collections...')
            else:
                l.critical("Runner cannot determine the collections to be processed.\nReceived: %s" % given_col_list)
                sys.exit(1)
        elif g == 'test':
            l.info ("%s collection found..." % g)
        elif g not in pck_list:
            l.critical ("%s is not present" % g)
            sys.exit(1)
        else:
            l.info ("%s collection found..." % g)

    #(MODULES) Managing the second and third arguments

    # Checking if the process has to be run "up to" or "from" a certain module
    # or if it has to run certain modules only.
    to = 0
    fro = 0
    direction = sys.argv[2].lower()
    if direction in ('to', 'from'):
        if direction == 'to':
            to = 1
        else:
            fro = 1
        # runner.py col to|from mod [-r] port  ->  5 or 6 arguments
        if argc in (5, 6):
            MODULES = sys.argv[3]
        else:
            l.critical("Runner cannot find the modules specified because '%s' was specified, but the number of arguments was unexpected" % direction)
            sys.exit(1)
    else:
        MODULES = sys.argv[2]

    given_mod_list = MODULES.split(',')

    # Checking Spelling
    l.info("Checking modules")
    if (to == 1 or fro == 1) and len(given_mod_list) > 1:
        l.critical('Runner cannot determine the modules to be processed.\nReceived: %s' % given_mod_list)
        sys.exit(1)
    for g in given_mod_list:
        if g == 'all':
            if len(given_mod_list) == 1:
                l.info ('Runner is about to run the full process...')
            else:
                l.critical( "Runner cannot determine the modules to be processed.\nReceived: %s"  % given_mod_list)
                sys.exit(1)
        elif g not in mod_list:
            l.critical("%s is not a module." % g)
            sys.exit(1)
        else:
            l.info( "%s module found..." % g)

    #(INTERMEDIATE) Managing the argument before the port
    # if not specified, keep the intermediate files
    keep = 1
    INTERMED = ''
    if argc == 5 and not (to == 1 or fro == 1):
        INTERMED = sys.argv[3]
    elif argc == 6:
        INTERMED = sys.argv[4]

    #removes the intermediate files if requested
    if INTERMED == '-r':
        keep = 0
    elif INTERMED:
        l.warning("ignoring unrecognised option %s; intermediate files will be kept" % INTERMED)

    #CALLING THE MAIN FUNCTION

    # if the list of collections given is 'all', run all the collections retrieved in '../data/DDB_TEI_XML/' (stored in pck_list)
    if given_col_list[0] == 'all':
        given_col_list = pck_list

        l.info ("all collections (%s in number) will be processed" % len(given_col_list))

    argtup = (','.join(given_col_list), to, fro, ','.join(given_mod_list),keep)
    l.debug("arguments: to runner.py as follows:\n\tgiven_col_list = %s\n\tto = %s\n\tfro = %s\n\tgiven_mod_list = %s\n\tkeep = %s" % argtup)

    for pck in given_col_list:
        run(pck, to, fro, given_mod_list, keep)


# ----
# END
# ----