Skip to content

Instantly share code, notes, and snippets.

@raffazizzi
Created August 5, 2012 09:14
Show Gist options
  • Save raffazizzi/3263152 to your computer and use it in GitHub Desktop.
Save raffazizzi/3263152 to your computer and use it in GitHub Desktop.
Wrapper for "Integrating Digital Papyrology"

Integrating Digital Papyrology (IDP) was an international project that joined together several databases of (mainly) Greek papyri and converted them to EpiDoc, a fully conformant TEI P5 flavour. It formed the basis of the editorial project papyri.info.

The source databases differed not only in the way the data was structured, but also in how they represented the ancient Greek alphabet and editorial interventions. A number of tools were created by the project team (I contributed to the development of some of them as well), and I wrote the Python script below to "run" all the programs according to parametrized requirements.

Some of the programs called are XSLTs, which are passed to one Saxon instance through a socket (instead of opening and closing Saxon at every transformation).

# -*- coding: utf-8 -*- #
#
# File: runner.py
#
# Runner for the IDP process
#
# Copyright (C) 2008 by Raffaele Viglianti
# and Centre for Computing in the Humanities, King's College, London.
# Additional contributors' copyright may be designated in individual source files.
# Additional contribution of code to this file by Tom Elliott, copyright (c)
# 2008 by New York University)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Information about the EpiDoc community can be obtained via
# http://epidoc.sf.net.
""" runner.py
threads together all the processes necessary to convert ddbdp texts from their legacy
formats to epidoc; for instructions on running this script, type:
python runner.py help
note: you may need to first type:
export PYTHONPATH=:$PWD/crosswalker/DDbDP_to_EpiDoc/chetc-py
from the directory containing runner.py (i.e., the epiduke code directory).
"""
import os, os.path, sys, codecs, re, time, datetime, shutil, filecmp, subprocess, socket
import logging as l
# Non-standard modules:
# NB the environment variables for non-standard modules are set by runner.sh
import chetwrap
LHEAD = "################### %s ###################"
# --------------------
# SECTION 1: Functions
# --------------------
# Tools shared by several modules
def _start_saxon_listener(xslt, level, saxon):
    """Launch the Saxon socket listener and return ``(process, socket)``.

    The jar is looked up ``level`` directories above the current working
    directory, in ``lib/epiduke-saxon.jar``; the listener is told to use the
    module-level ``PORT``.
    """
    port = PORT
    jar = '../' * level + 'lib/epiduke-saxon.jar'
    cmd = ['java', '-jar', '-Xms256m', '-Xmx1536m', jar,
           '--xsl', xslt,
           '--port', '%d' % port,
           '--version', saxon]
    listener = subprocess.Popen(cmd)
    time.sleep(2)  # wait a couple of seconds to give the listener time to fire up
    host = '127.0.0.1'
    while True:
        # Without this check a listener that failed to start would make the
        # retry loop below spin forever.
        if listener.poll() is not None:
            raise RuntimeError("Saxon listener exited with status %s before "
                               "accepting connections" % listener.returncode)
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            failed = conn.connect_ex((host, port))
        except socket.error as err:
            print(err.strerror)
            failed = 1
        if not failed:
            break
        # A socket whose connect failed must not be reused; close it instead
        # of leaking one descriptor per retry.
        conn.close()
        time.sleep(2)
    conn.sendall("PING\n")
    res = conn.recv(8).rstrip()
    print('ping response: ' + res)
    return listener, conn


def _stop_saxon_listener(listener, conn):
    """Wait until the listener reports DONE, tell it to finish and reap it."""
    done = ''
    while done != 'DONE':
        conn.sendall("DONE?\n")
        reply = conn.recv(128)
        if not reply:
            # An empty read means the peer closed the connection; polling
            # again would never yield 'DONE'.
            raise RuntimeError("Saxon listener closed the connection before "
                               "reporting DONE")
        done = reply.rstrip()
        time.sleep(2)
        print(done)
    conn.sendall("FINISHED\n")
    # Popen.wait() replaces os.waitpid(p.pid, 0): it works on every platform
    # subprocess supports and keeps the Popen object's state consistent.
    listener.wait()


def walk(inDir, outDir, xslt, level, saxon, outForm, sock):
    """Recursively queue every file under ``inDir`` for XSLT transformation.

    Files and directories whose name contains '.svn' are skipped.

    inDir   -- directory to read; one trailing '/' or '\\' is tolerated.
    outDir  -- directory to write; created (one level only) if missing.
    xslt    -- stylesheet path handed to the listener's --xsl option.
    level   -- how many directories to go up from the current working
               directory to find lib/epiduke-saxon.jar.
    saxon   -- value handed to the listener's --version option.
    outForm -- when 'html', the last four characters of each file name are
               replaced with '.html' in the output path.
    sock    -- None for the top-level call, which starts the Java listener,
               connects to it and shuts it down at the end; recursive calls
               pass the already connected socket.

    Raises RuntimeError if the listener exits before accepting connections
    or closes the connection before reporting DONE.
    """
    owns_listener = sock is None
    if owns_listener:
        listener, mysock = _start_saxon_listener(xslt, level, saxon)
    else:
        listener, mysock = None, sock
    try:
        if not os.path.isdir(outDir):
            os.mkdir(outDir, 0o776)
        if inDir[-1:] in ('/', '\\'):
            inDir = inDir[:-1]
        if outDir[-1:] in ('/', '\\'):
            outDir = outDir[:-1]
        for f in os.listdir(inDir):
            if '.svn' in f:
                # Subversion bookkeeping (files and directories alike).
                continue
            source = inDir + '/' + f
            if os.path.isdir(source):
                walk(source, outDir + '/' + f, xslt, level, saxon, outForm, mysock)
            elif outForm == 'html':
                # If the output is html, change the extension from .xml to .html
                do(source, outDir + '/' + f[0:-4] + '.html', outDir, xslt, level, saxon, mysock)
            else:
                do(source, outDir + '/' + f, outDir, xslt, level, saxon, mysock)
        if owns_listener:
            _stop_saxon_listener(listener, mysock)
    finally:
        if owns_listener:
            mysock.close()
            # Only reached with a live process when something above raised;
            # do not leave an orphaned listener holding the port.
            if listener.poll() is None:
                listener.terminate()
## ---
def do(inFile, outFile, outDir, xslt, level, saxon, mysock):
    """Ask the Saxon listener to transform ``inFile`` into ``outFile``.

    The request is one line, "<inFile> <outFile>\\n", written to ``mysock``.
    ``outDir``, ``xslt``, ``level`` and ``saxon`` are not used here; they are
    kept so existing callers keep working.

    NOTE(review): the two paths are separated by a single space, so paths
    containing spaces are presumably ambiguous for the listener - confirm
    against the listener's parser.
    """
    l.debug("sending transform call %s %s" % (inFile, outFile))
    mysock.sendall("%s %s\n" % (inFile, outFile))
## ---
def walk_dt(inDir, switch):
    """Recursively call dt(path, switch) on every file under ``inDir``.

    Files and directories whose name contains '.svn' are skipped, so
    Subversion bookkeeping is never rewritten. One trailing '/' or '\\' on
    ``inDir`` is tolerated. ``switch`` is passed through to dt() unchanged
    ('off' or 'on').
    """
    if inDir[-1:] in ('/', '\\'):
        inDir = inDir[:-1]
    for f in os.listdir(inDir):
        if '.svn' in f:
            continue
        path = inDir + '/' + f
        if os.path.isdir(path):
            walk_dt(path, switch)
        else:
            dt(path, switch)
## ---
# (pattern, replacement) pairs applied in order by dt(). The patterns are
# compiled with re.S so declarations that span several lines are matched, and
# the replacements use the \1 back-reference so every match keeps its own
# content verbatim.
_DT_OFF_RULES = (
    (re.compile(r'<\?\s*xml (.*?)\?>', re.S), r'<!--xml \1-->'),
    (re.compile(r'<\?xml-stylesheet (.*?)\?>', re.S), r'<!--xml-stylesheet \1-->'),
    (re.compile(r'<!DOCTYPE (.*?)>', re.S), r'<!-- DOCTYPE \1 -->'),
)
_DT_ON_RULES = (
    (re.compile(r'<!-- DOCTYPE (.*?) -->', re.S), r'<!DOCTYPE \1>'),
    (re.compile(r'<!--xml (.*?)-->', re.S), r'<?xml \1?>'),
    (re.compile(r'<!--xml-stylesheet (.*?)-->', re.S), r'<?xml-stylesheet \1?>'),
)


def dt(inFile, switch):
    """Comment out, or restore, the XML/stylesheet/DOCTYPE declarations of a file.

    inFile -- path of a UTF-8 file; it is rewritten in place.
    switch -- 'off' turns ``<?xml ...?>``, ``<?xml-stylesheet ...?>`` and
              ``<!DOCTYPE ...>`` into XML comments; 'on' turns those comments
              back into declarations. Any other value leaves the content
              unchanged (the file is still rewritten).

    The DOCTYPE pattern stops at the first '>', so a DOCTYPE with an internal
    subset containing '>' is only partially matched.
    """
    infileobj = codecs.open(inFile, 'r', encoding='utf8')
    try:
        content = infileobj.read()
    finally:
        infileobj.close()
    if switch == 'off':
        rules = _DT_OFF_RULES
    elif switch == 'on':
        rules = _DT_ON_RULES
    else:
        rules = ()
    for pattern, replacement in rules:
        content = pattern.sub(replacement, content)
    outfileobj = codecs.open(inFile, "w", encoding='utf8')
    try:
        outfileobj.write(content)
    finally:
        outfileobj.close()
## ---
def copy_tree(src, dst):
    """Recursively copy the directory ``src`` into ``dst``.

    Sub-directories whose name contains '.svn' are not copied. ``dst`` and
    any missing parent directories are created. File contents are copied
    with shutil.copyfile(), so permissions and timestamps are not preserved.
    One trailing '/' or '\\' on either path is tolerated.
    """
    if not os.path.isdir(dst):
        # makedirs rather than mkdir: the destination may be nested below a
        # directory that does not exist yet.
        os.makedirs(dst, 0o777)
    if src[-1:] in ('/', '\\'):
        src = src[:-1]
    if dst[-1:] in ('/', '\\'):
        dst = dst[:-1]
    for f in os.listdir(src):
        source = src + '/' + f
        target = dst + '/' + f
        if not os.path.isdir(source):
            shutil.copyfile(source, target)
        elif '.svn' not in f:
            copy_tree(source, target)
## ---
# ------------------
# Section 2: MODULES
# ------------------
# Every call to each part of the process is executed by the following functions
# --- XSLT pre transcoder
# --- (step 0)
#
# Status: NOT IN USE
# Date: 12/05/08
# Change system calls.
# ---
def mod_0_Xtrans(pck):
    """Run the identity-transform XSLT over the TEI file of collection ``pck``.

    Reads ../data/DDB_TEI_XML/ddbdp.<pck>.xml and writes
    osx/xslt/output/ddbdp.<pck>.xml through lib/saxon.jar. The only call to
    this function, in run(), is commented out.
    """
    l.info(LHEAD % 'Pre Transcoder starting')
    source = "../data/DDB_TEI_XML/ddbdp." + pck + ".xml"
    target = "osx/xslt/output/ddbdp." + pck + ".xml"
    command = ' '.join((
        'java -jar -Xmx1023m lib/saxon.jar -o',
        target,
        source,
        'osx/xslt/identity-transform.xsl',
    ))
    os.system(command)
    l.info(LHEAD % 'Pre Transcoder complete')
# ---
# --- Transcoder
# --- (step 1)
#
# Status: JAVA CALL
# Date: 12/05/08
# ---
def mod_1_trans(pck):
    """Run the Java transcoder over the TEI file of collection ``pck``.

    For pck == 'test' the fixture ../data/tests/test-input/ddbdp.p.test.xml
    is transcoded into a freshly created ../data/tests/step-01-trans/;
    otherwise ../data/DDB_TEI_XML/ddbdp.<pck>.xml is transcoded into
    ../data/DDB_transcoded_XML/. Only the -s (source) and -o (output)
    options are passed to the transcoder. A non-zero exit status is logged
    as an error; it does not abort the run.
    """
    l.info(LHEAD % 'Transcoder starting')
    if pck == 'test':
        trans_input = "../data/tests/test-input/ddbdp.p.test.xml"
        trans_output = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        os.mkdir('../data/tests/step-01-trans', 0o776)
    else:
        trans_input = "../data/DDB_TEI_XML/ddbdp."+pck+".xml"
        trans_output = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"
    status = os.system("java -cp lib/transcoder.jar:lib/xercesImpl.jar:lib/xml-apis.jar:lib/serializer.jar:lib/xalan.jar edu.unc.epidoc.transcoder.TransCoder -s "+trans_input+" -o "+trans_output)
    if status != 0:
        l.error("Transcoder exited with status %s for collection %s" % (status, pck))
    l.info(LHEAD % 'Transcoder complete')
# ---
# --- Splitter
# --- (step 2)
#
# Status: JAVA CALL
# Date: 12/05/08
# ---
def mod_2_split(pck):
    """Run split.xsl over the transcoded file of collection ``pck``.

    For pck == 'test' the work happens under ../data/tests/ (step-01-trans
    to a freshly created step-02-split); otherwise the transcoded file is
    split into crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/, where
    a directory named after the collection is created first. Saxon's stderr
    is redirected to report_split_<pck>.txt in the current working directory.
    A non-zero exit status is logged as an error; it does not abort the run.
    """
    l.info(LHEAD % 'Splitter starting')
    if pck == 'test':
        os.mkdir('../data/tests/step-02-split', 0o776)
        split_input = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        split_output = "../data/tests/step-02-split/ddbdp.p.test.xml"
        split_param = "ddbdp.p.test"
    else:
        split_input = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"
        split_output = "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/ddbdp."+pck+".xml"
        split_param = "ddbdp."+pck
        os.mkdir('crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/'+pck, 0o776)
    status = os.system('java -jar -Xmx1023m lib/saxon9.jar -t -w1 -o '+split_output+' '+split_input+' crosswalker/DDbDP_to_EpiDoc/split/split.xsl "filename='+split_param+'" 2> report_split_'+pck+'.txt')
    if status != 0:
        l.error("Splitter exited with status %s for collection %s; see report_split_%s.txt" % (status, pck, pck))
    l.info(LHEAD % 'Splitter complete')
# ---
# --- Chetc
# --- (step 3)
#
# Status: OK
# Date: 12/05/08
# ---
def _publish_chetc_reports(pck):
    """Copy the report files of ``pck`` to /var/www/epiduke/reports/<date>/.

    A stale published copy is removed even when no new local report exists.
    When the public directory is missing and cannot be created, every local
    report that could not be published is logged as an error instead.
    """
    public_location = '/var/www/epiduke/reports/'+curDate+'/'
    reports = (
        # mod_2_split writes this one relative to the program root, before
        # any chdir, and names it with an underscore before the collection.
        os.path.join(BASE_DIR, 'report_split_'+pck+'.txt'),
        # Safety Net report
        'report_'+pck+'.txt',
        # After Safety Net report
        'report_afterSafety_'+pck+'.txt',
    )
    if not os.path.isdir(public_location):
        try:
            os.mkdir(public_location, 0o776)
        except OSError:
            # Handled just below: without the directory nothing is published.
            pass
    if not os.path.isdir(public_location):
        for report in reports:
            if os.path.isfile(report):
                l.error("no public_location (%s), so could not copy %s" % (public_location, os.path.basename(report)))
        return
    for report in reports:
        published = public_location + os.path.basename(report)
        if os.path.isfile(published):
            os.unlink(published)
        if os.path.isfile(report):
            shutil.copyfile(report, published)


def mod_3_chetc(pck):
    """Run the CHET-C converter over the split files of collection ``pck``.

    Works inside crosswalker/DDbDP_to_EpiDoc/chetc-py and always changes back
    to BASE_DIR afterwards, also when the conversion raises. For
    pck == 'test' input and output live under data/tests (step-02-split to a
    freshly created step-03-chetc); otherwise the split output is converted
    into a freshly created output/<pck>. Afterwards the report files are
    published.
    """
    l.info(LHEAD % 'CHET-C starting')
    os.chdir("crosswalker/DDbDP_to_EpiDoc/chetc-py") # changes directory to chet-c
    try:
        if pck == 'test':
            chetc_input = "../../../../data/tests/step-02-split/p.test"
            chetc_output = "../../../../data/tests/step-03-chetc/p.test"
            os.mkdir('../../../../data/tests/step-03-chetc', 0o776)
        else:
            chetc_input = "../split/01-transcoded_xml_id/"+pck
            chetc_output = "output/"+pck
            os.mkdir('output/'+pck, 0o776)
        chetwrap.walk(chetc_input, chetc_output, 'ddbdp', pck)
        #Reports handling
        _publish_chetc_reports(pck)
    finally:
        # changes directory back to this program's root
        os.chdir(BASE_DIR)
    l.info(LHEAD % 'CHET-C complete')
# ---
# --- XSLT cleanup
# --- (step 4)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_4_cleanup(pck, g):
    """Apply xslt_cleanup_s01.xsl to the CHET-C output of collection ``pck``.

    ``g`` is 'y' when the Greek Number Converter runs afterwards; the output
    then goes to an intermediate directory (step-04-cleanup for 'test',
    xslt_cleanup/output/<pck>/ otherwise) instead of the final one
    (test-output for 'test', data/DDB_EpiDoc_XML/<pck>/ otherwise). Works
    inside crosswalker/DDbDP_to_EpiDoc/xslt_cleanup and changes back to
    BASE_DIR at the end.
    """
    l.info(LHEAD % 'XSLT cleanup starting')
    os.chdir("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup") # changes directory to xslt-cleanup
    gnc_follows = (g == 'y')
    if pck == 'test':
        clean_input = "../../../../data/tests/step-03-chetc/p.test/"
        if gnc_follows:
            stage_dir = '../../../../data/tests/step-04-cleanup'
        else:
            stage_dir = '../../../../data/tests/test-output'
        os.mkdir(stage_dir, 0o776)
        clean_output = stage_dir + "/p.test"
    else:
        clean_input = "../chetc-py/output/" + pck + "/"
        if gnc_follows:
            clean_output = "output/" + pck + "/"
            if not os.path.isdir("output"):
                os.mkdir("output/", 0o776)
            if not os.path.isdir(clean_output):
                os.mkdir(clean_output, 0o776)
        else:
            clean_output = "../../../../data/DDB_EpiDoc_XML/" + pck + "/"
    walk(clean_input, clean_output, 'xslt_cleanup_s01.xsl', 3, 'xsl2', 'xml', None)
    os.chdir(BASE_DIR) # changes directory back to this program's root
    l.info(LHEAD % 'XSLT cleanup complete')
# ---
# --- Greek Number Converter (GNC)
# --- (step 5)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_5_gnc(pck):
    """Apply grc_num_converter.xsl to the cleanup output of collection ``pck``.

    Works inside greek_num_converter and always changes back to BASE_DIR.
    The XML, stylesheet and DOCTYPE declarations of the input files are
    commented out for the duration of the transformation and restored
    afterwards, also when the transformation raises, so the intermediate
    files are never left in the switched-off state.
    """
    l.info(LHEAD % 'Greek number converter starting')
    os.chdir('greek_num_converter') # changes directory to GNC
    try:
        if pck == 'test':
            gnc_input = "../../data/tests/step-04-cleanup/p.test/"
            gnc_output = "../../data/tests/test-output/p.test/"
        else:
            gnc_input = "../crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck+"/"
            gnc_output = "../../data/DDB_EpiDoc_XML/"+pck+"/"
        walk_dt(gnc_input, 'off')
        try:
            walk(gnc_input, gnc_output, 'grc_num_converter.xsl', 1, 'xsl2', 'xml', None)
        finally:
            walk_dt(gnc_input, 'on')
    finally:
        os.chdir(BASE_DIR) # changes directory back to this program's root
    l.info(LHEAD % 'Greek Number Converter complete')
# --- HTML transformation and publication handling
# --- (step 6)
#
# Status: OK
# Date: 12/05/08
# ---
def mod_6_html(pck, h):
    """Produce the HTML edition of collection ``pck`` and optionally publish it.

    h -- 'y' runs start-edition.xsl only; 'yw' additionally copies HTML and
         XML to /var/www/epiduke/<date>/ and the XML to
         /usr/local/epiduke/data/DDB_EpiDoc_XML/.

    The transformation runs inside epidoc_xslt; the function changes back to
    BASE_DIR at the end.
    """
    l.info(LHEAD % 'Publication starting')
    testing = (pck == 'test')
    # Start from an empty HTML output directory.
    if testing:
        shutil.rmtree('../data/tests/test-html', True)
    elif h == 'y':
        shutil.rmtree("../data/tests/run-html/" + pck, True)
    if h.startswith('y'):
        l.info ('starting HTML transformations')
        os.chdir('epidoc_xslt') # changes directory to epidoc_xslt
        if testing:
            html_input = "../../data/tests/test-output/p.test/"
            html_output = "../../data/tests/test-html/p.test/"
            os.mkdir('../../data/tests/test-html', 0o776)
        else:
            html_input = "../../data/DDB_EpiDoc_XML/" + pck + "/"
            html_output = "../../data/tests/run-html/" + pck + "/"
        walk(html_input, html_output, 'start-edition.xsl', 1, 'xsl1', 'html', None)
        l.info('HTML transformations complete')
    if h == 'yw':
        l.info ('file copy process starting')
        dateDir = '/var/www/epiduke/' + curDate
        htmlDir = dateDir + '/html'
        xmlDir = dateDir + '/xml'
        for needed in (dateDir, htmlDir, xmlDir):
            if not os.path.isdir(needed):
                os.mkdir(needed, 0o776)
        collection_xml = '../../data/DDB_EpiDoc_XML/' + pck + '/'
        # NOTE(review): these collection copies also run when pck == 'test',
        # although the test output is written to test-html/test-output rather
        # than run-html/<pck> and DDB_EpiDoc_XML/<pck> - confirm intent.
        copy_tree('../../data/tests/run-html/' + pck + '/', htmlDir + "/" + pck + "/")
        copy_tree(collection_xml, xmlDir + "/" + pck + "/")
        #copies collection into batch5 and data for whole process running
        local_xml = "/usr/local/epiduke/data/DDB_EpiDoc_XML/" + pck
        shutil.rmtree(local_xml, True)
        copy_tree(collection_xml, local_xml + "/")
        if testing:
            copy_tree('../../data/tests/test-html', htmlDir)
            copy_tree('../../data/tests/test-output', xmlDir)
        l.info ('file copy process complete')
        # Copy HTML translation into the current dated directory
        copy_tree('/usr/local/epiduke/data/tests/run-html/trans/', htmlDir + "/trans/")
    os.chdir(BASE_DIR) # changes directory back to this program's root
    l.info (LHEAD % 'Publication complete')
# ---
# ------------------------
# Section 3: MAIN FUNCTION
# ------------------------
# Calls all the modules requested
# Pipeline modules in execution order.
_RUN_MODULE_ORDER = ("transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw")


def _select_modules(to, fro, mods):
    """Return a dict mapping each module name to whether it must run."""
    order = list(_RUN_MODULE_ORDER)
    active = dict((name, False) for name in order)
    # "Up to": the named module and everything before it.
    if to == 1 and mods[0] in active:
        for name in order[:order.index(mods[0]) + 1]:
            active[name] = True
    # "From": the named module and everything after it.
    if fro == 1:
        if mods[0] in active:
            for name in order[order.index(mods[0]):]:
                active[name] = True
    # One or more
    elif mods[0] == 'all':
        for name in order:
            active[name] = True
    else:
        for given in mods:
            if given in active:
                active[given] = True
    # htmlw already performs the HTML transformation, so html must not run
    # as well when both are selected.
    if active["htmlw"]:
        active["html"] = False
    return active


def _remove_intermediate_file(path):
    """Delete ``path`` if it exists; log a warning when deletion fails."""
    if not os.path.isfile(path):
        return
    try:
        os.unlink(path)
    except os.error:
        l.warning("failed: os.unlink('%s')" % path)


def run(pck, to, fro, mods, keep):
    """Run the requested pipeline modules for collection ``pck``.

    pck  -- collection name, or 'test' for the test fixture.
    to   -- 1 to run every module up to and including mods[0].
    fro  -- 1 to run mods[0] and every module after it.
    mods -- list of module names, or ['all'].
    keep -- 0 to delete the intermediate files once the modules have run.

    Before each module runs, the output of its previous run is removed.
    """
    if keep == 0:
        l.info('Intermediate files will be removed at the end of the process')
    active = _select_modules(to, fro, mods)
    testing = (pck == 'test')
    transcoded = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"
    split_dir = "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/"+pck
    chetc_dir = "crosswalker/DDbDP_to_EpiDoc/chetc-py/output/"+pck
    run_html_dir = "../data/tests/run-html/"+pck
    #Run modules
    #TRANSCODER
    if active["transcoder"]:
        if testing:
            shutil.rmtree("../data/tests/step-01-trans", True)
        else:
            try:
                os.unlink(transcoded)
            except os.error:
                l.warning("failed: os.unlink('../data/DDB_transcoded_XML/ddbdp.%s.xml')" % pck)
        mod_1_trans(pck)
    #SPLITTER
    if active["splitter"]:
        if testing:
            shutil.rmtree("../data/tests/step-02-split", True)
        else:
            shutil.rmtree(split_dir, True)
        mod_2_split(pck)
    #CHETC
    if active["chetc"]:
        if testing:
            shutil.rmtree("../data/tests/step-03-chetc", True)
        else:
            shutil.rmtree(chetc_dir, True)
        mod_3_chetc(pck)
    #CLEANUP
    if active["cleanup"]:
        if testing:
            shutil.rmtree("../data/tests/test-output", True)
            shutil.rmtree("../data/tests/step-04-cleanup", True) #used when GNC is on
        else:
            shutil.rmtree("../data/DDB_EpiDoc_XML/"+pck, True)
        #if gnc is on...
        if active["gnc"]:
            g = 'y'
            shutil.rmtree("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck, True)
        else:
            g = 'n'
        mod_4_cleanup(pck, g)
    #GNC
    if active["gnc"]:
        mod_5_gnc(pck)
    #HTML
    if active["html"]:
        if testing:
            shutil.rmtree("../data/tests/test-html", True)
        else:
            shutil.rmtree(run_html_dir, True)
        mod_6_html(pck, 'y')
    #HTMLW
    if active["htmlw"]:
        if testing:
            shutil.rmtree("../data/tests/test-html", True)
        mod_6_html(pck, 'yw')
    # If intermediate files must be removed...
    if keep == 0:
        if testing:
            for leftover in ("../data/tests/step-01-trans",
                             "../data/tests/step-02-split",
                             "../data/tests/step-03-chetc",
                             "../data/tests/test-html"):
                shutil.rmtree(leftover, True)
        else:
            # Each file is handled on its own so a missing one neither hides
            # the other nor produces a warning naming the wrong path.
            _remove_intermediate_file(transcoded)
            _remove_intermediate_file("osx/xslt/output/ddbdp."+pck+".xml")
            shutil.rmtree(split_dir, True)
            shutil.rmtree(chetc_dir, True)
            shutil.rmtree(run_html_dir, True)
    #Change permissions on unix machines
    #Change group to epiduke and make the files group writable.
    #COMMENTED OUT AND MOVED TO RUNNER.SH (RV 14/05/08)
    l.info (LHEAD % ('Runner complete for collection: ' + pck))
# ---
# ---
# ---------------------
# Section 0: runner.py
# ---------------------
# Manages the options and calls run()
# To do:
# - Subversion?
# - Log
if __name__ == "__main__":
    # Command-line entry point: validates the arguments, sets the globals
    # the modules rely on (BASE_DIR, PORT, curDate) and calls run() once per
    # collection. Invalid invocations exit with status 1.
    #configure logger
    l.basicConfig(level=l.DEBUG)
    #BASE DIRECTORY
    BASE_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
    l.info ("BASE_DIR is %s" % BASE_DIR)
    #INPUT:
    root_input = '../data/DDB_TEI_XML/'
    #COLLECTIONS LIST
    pck_list = []
    for root, dirs, files in os.walk(root_input):
        for f in files:
            m = re.search(r'^ddbdp\.(.*?)\.xml$', f)
            if m:
                pck_list.append(m.group(1))
    if len(pck_list) == 0:
        l.critical("Runner cannot find the collections.\nPlease check that the collections are in %s" % root_input)
        sys.exit(1)
    #MODULES DICTIONARY
    mod_list = [ "transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw" ]
    #CURRENT DATE
    curDate = str(datetime.date.today())
    #USAGE
    usage = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
Specify one collection or a list of collections separated by commas.\n
Specify 'to' and ONE module to run the process up to that module
or specify a list of modules separated by commas.\n
Specify -r to remove intermediate files at the end of the process\n
State the port, it must begin with '999' and be a number\n
python runner.py [help | h | ?] of for further information\n\n
'''
    #HELP
    help_text = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
Specify one collection or a list of collections separated by commas.\n
Specify 'to' and ONE module to run the process up to that module
or specify a list of modules separated by commas.\n
Specify -r to remove intermediate files at the end of the process\n
State the port, it must begin with '999' and be a number\n
python runner.py [help | h | ?] of for this help\n
Available modules:\n
transcoder Runs the transcoder (Java)
splitter Runs splitter (XLST)
chetc Runs CHET-C converter (Python)
cleanup Runs CHET-C cleanup (XSLT)
gnc Runs Greek Number Converter (XSLT)
html Runs the HTML output (XSLT)
htmlw Runs the HTML output and copies XML\n
To run one or more modules:
python runner.py [pck[,pck] | all] [mod[,mod]] [port]\n
To run the process up to a module:
python runner.py [pck[,pck] | all] to [mod] [port]\n
To run the process starting from a module:
python runner.py [pck[,pck] | all] from [mod] [port]\n
To run the complete process:
python runner.py [pck[,pck] | all] all [port]\n
'''
    # Checking arguments...
    # The length is tested first so a bare "python runner.py" prints the
    # usage instead of failing on sys.argv[1].
    if len(sys.argv) > 1 and sys.argv[1] in ('help', 'h', '?'):
        print(help_text)
        sys.exit()
    if len(sys.argv) > 6 or len(sys.argv) < 3:
        print(usage)
        sys.exit(1)
    #(PORT) Managing the last argument
    PORT = sys.argv[-1]
    if not (PORT.startswith('999') and PORT.isdigit()):
        print(usage)
        sys.exit(1)
    PORT = int(PORT)
    #(COLLECTIONS) Managing the first argument
    COLLECTIONS = sys.argv[1]
    given_col_list = COLLECTIONS.split(',')
    #Checking spelling
    l.info('Checking Collections...')
    for g in given_col_list:
        if g == 'all':
            if len(given_col_list) == 1:
                l.info('Runner is about to process all the collections...')
            else:
                l.critical("Runner cannot determine the collections to be processed.\nReceived: %s" % given_col_list)
                sys.exit(1)
        elif g == 'test':
            l.info ("%s collection found..." % g)
        elif g not in pck_list:
            l.critical ("%s is not present" % g)
            sys.exit(1)
        else:
            l.info ("%s collection found..." % g)
    #(MODULES) Managing the second and third arguments
    # Checking if the process has to be run "up to" or "from" a certain module
    # or if it has to run certain modules only.
    to = 0
    fro = 0
    direction = sys.argv[2].lower()
    if direction in ('to', 'from'):
        if direction == 'to':
            to = 1
        else:
            fro = 1
        # script, collections, to/from, module, [-r], port
        if len(sys.argv) in (5, 6):
            MODULES = sys.argv[3]
        else:
            l.critical("Runner cannot find the modules specified because '%s' was specified, but the number of arguments was unexpected" % direction)
            sys.exit(1)
        first_optional = 4
    else:
        MODULES = sys.argv[2]
        first_optional = 3
    given_mod_list = MODULES.split(',')
    # Checking Spelling
    l.info("Checking modules")
    if (to == 1 or fro == 1) and len(given_mod_list) > 1:
        l.critical('Runner cannot determine the modules to be processed.\nReceived: %s' % given_mod_list)
        sys.exit(1)
    for g in given_mod_list:
        if g == 'all':
            if len(given_mod_list) == 1:
                l.info ('Runner is about to run the full process...')
            else:
                l.critical( "Runner cannot determine the modules to be processed.\nReceived: %s" % given_mod_list)
                sys.exit(1)
        elif g not in mod_list:
            l.critical("%s is not a module." % g)
            sys.exit(1)
        else:
            l.info( "%s module found..." % g)
    #(INTERMEDIATE) Managing the optional arguments between the modules and the port
    # if -r is not specified, keep the intermediate files
    INTERMED = ''
    keep = 1
    if '-r' in sys.argv[first_optional:-1]:
        #removes the intermediate files if requested
        INTERMED = '-r'
        keep = 0
    #CALLING THE MAIN FUNCTION
    # if the list of collections given is 'all', run all the collections retrieved in '../data/DDB_TEI_XML/' (stored in pck_list)
    if given_col_list[0] == 'all':
        given_col_list = pck_list
        l.info ("all collections (%s in number) will be processed" % len(given_col_list))
    argtup = (','.join(given_col_list), to, fro, ','.join(given_mod_list),keep)
    l.debug("arguments: to runner.py as follows:\n\tgiven_col_list = %s\n\tto = %s\n\tfro = %s\n\tgiven_mod_list = %s\n\tkeep = %s" % argtup)
    for pck in given_col_list:
        run(pck, to, fro, given_mod_list, keep)
# ----
# END
# ----
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment