|
# -*- coding: utf-8 -*- # |
|
# |
|
# File: runner.py |
|
# |
|
# Runner for the IDP process |
|
# |
|
# Copyright (C) 2008 by Raffaele Viglianti |
|
# and Centre for Computing in the Humanities, King's College, London. |
|
# Additional contributors' copyright may be designated in individual source files. |
|
# Additional contribution of code to this file by Tom Elliott, copyright (c) |
|
# 2008 by New York University) |
|
# |
|
# This program is free software; you can redistribute it and/or |
|
# modify it under the terms of the GNU General Public License |
|
# as published by the Free Software Foundation; either version 2 |
|
# of the License, or (at your option) any later version. |
|
# |
|
# This program is distributed in the hope that it will be useful, |
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
# GNU General Public License for more details. |
|
# |
|
# You should have received a copy of the GNU General Public License |
|
# along with this program; if not, write to the Free Software |
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
|
# |
|
# Information about the EpiDoc community can be obtained via |
|
# http://epidoc.sf.net. |
|
|
|
""" runner.py |
|
|
|
threads together all the processes necessary to convert ddbdp texts from their legacy |
|
formats to epidoc; for instructions on running this script, type: |
|
|
|
python runner.py help |
|
|
|
note: you may need to first type: |
|
|
|
export PYTHONPATH=:$PWD/crosswalker/DDbDP_to_EpiDoc/chetc-py |
|
|
|
from the directory containing runner.py (i.e., the epiduke code directory). |
|
""" |
|
|
|
import os, os.path, sys, codecs, re, time, datetime, shutil, filecmp, subprocess, socket |
|
import logging as l |
|
# Non-standard modules: |
|
# NB the environment variables for non-standard modules are set by runner.sh |
|
import chetwrap |
|
|
|
# Banner template used with l.info() to mark the start and the end of each
# processing step; the "%s" placeholder receives the step description.
LHEAD = "################### %s ###################" |
|
|
|
# -------------------- |
|
# SECTION 1: Functions |
|
# -------------------- |
|
# Tools shared by several modules |
|
|
|
|
|
def _open_transform_socket(proc, host, port):
    ''' Connects to the transformation listener launched as ``proc``.

    Tries again every two seconds until connect_ex() reports success and
    returns the connected socket. The socket of a failed attempt is closed
    before the next one is created. Raises RuntimeError when ``proc`` has
    already exited, because nothing will ever listen on ``port`` then.
    '''
    while True:
        if proc.poll() is not None:
            raise RuntimeError(
                'transformation server exited with status %s before '
                'accepting connections on port %s' % (proc.returncode, port))

        candidate = None
        connected = 1
        try:
            candidate = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            connected = candidate.connect_ex((host, port))
        except socket.error as err:
            print(err.strerror or err)
        time.sleep(2)

        if connected == 0:
            return candidate
        if candidate is not None:
            candidate.close()


def walk(inDir, outDir, xslt, level, saxon, outForm, sock):
    ''' Walks recursively through inDir and calls do() on each file,
    excluding .svn folders and files with '.svn' in their name. With XSLT.

    inDir   -- folder to read; one trailing '/' or '\\' is dropped
    outDir  -- folder to write to, created when missing; sub-folders of
               inDir are mirrored below it
    xslt    -- stylesheet handed to the transformation server (--xsl)
    level   -- how many levels to go up to find lib/epiduke-saxon.jar
    saxon   -- value handed to the transformation server (--version)
    outForm -- 'html' replaces the last four characters of each file name
               with '.html'; any other value keeps the name
    sock    -- connected socket of a running server, or None

    When sock is None this call starts the java server on the module level
    PORT, connects to it, and after the walk polls it with "DONE?" until it
    answers "DONE", sends "FINISHED", waits for the process to end and
    closes the socket. Recursive calls reuse the caller's socket.
    '''

    own_server = sock is None

    if own_server:
        port = PORT
        jar = '../' * level + 'lib/epiduke-saxon.jar'
        cmd = ['java', '-jar', '-Xms256m', '-Xmx1536m', jar,
               '--xsl', xslt,
               '--port', '%d' % port,
               '--version', saxon]
        proc = subprocess.Popen(cmd)
        time.sleep(2) #wait a couple of seconds to give the listener time to fire up
        mysock = _open_transform_socket(proc, '127.0.0.1', port)
        mysock.sendall("PING\n")
        res = mysock.recv(8).rstrip()
        print('ping response: %s' % res)
    else:
        mysock = sock

    if not os.path.isdir(outDir):
        os.mkdir(outDir, 0o776)

    # slicing (instead of indexing) keeps an empty path from raising IndexError
    if inDir[-1:] in ('/', '\\'):
        inDir = inDir[:-1]

    if outDir[-1:] in ('/', '\\'):
        outDir = outDir[:-1]

    for f in os.listdir(inDir):
        source = inDir + '/' + f
        if os.path.isdir(source):
            # version control folders hold no documents to transform
            if '.svn' not in f:
                walk(source, outDir + '/' + f, xslt, level, saxon, outForm, mysock)
        elif '.svn' not in f:
            ## If the output is html, changes file extension from .xml to .html
            if outForm == 'html':
                target = outDir + '/' + f[0:-4] + '.html'
            else:
                target = outDir + '/' + f
            do(source, target, outDir, xslt, level, saxon, mysock)

    if own_server:
        done = ''
        while done != 'DONE':
            mysock.sendall("DONE?\n")
            done = mysock.recv(128).rstrip()
            time.sleep(2)
            print(done)
        mysock.sendall("FINISHED\n")
        # Popen.wait() also works where os.waitpid() is not available
        proc.wait()
        mysock.close()
|
## --- |
|
|
|
def do(inFile, outFile, outDir, xslt, level, saxon, mysock):
    ''' Asks the transformation server to transform one file.

    Sends the line "<inFile> <outFile>\\n" over mysock; the stylesheet was
    fixed when the server was started. outDir, xslt, level and saxon are
    accepted so the signature matches walk(), but are not used here.
    '''

    l.debug("sending transform call %s %s" % (inFile, outFile))
    mysock.sendall("%s %s\n" % (inFile, outFile))
|
|
|
|
|
## --- |
|
|
|
def walk_dt(inDir, switch):
    ''' Walks recursively through a folder and calls dt() on each file,
    excluding .svn folders and files with '.svn' in their name.
    No output folder or XSLT is passed through, only switch ('on' or 'off').
    '''

    # slicing (instead of indexing) keeps an empty path from raising IndexError
    if inDir[-1:] in ('/', '\\'):
        inDir = inDir[:-1]

    for f in os.listdir(inDir):
        path = inDir + '/' + f
        if os.path.isdir(path):
            # dt() rewrites files in place, so it must never reach the
            # working copy metadata kept in .svn folders
            if '.svn' not in f:
                walk_dt(path, switch)
        elif '.svn' not in f:
            dt(path, switch)
|
|
|
## --- |
|
|
|
# (pattern, replacement) pairs applied by dt(). The patterns are compiled
# with re.S so that declarations spanning several lines are rewritten too,
# and "\1" carries each declaration's own text into its replacement.
_DT_OFF_RULES = (
    (re.compile(r'<\?\s*xml (.*?)\?>', re.S), r'<!--xml \1-->'),
    (re.compile(r'<\?xml-stylesheet (.*?)\?>', re.S), r'<!--xml-stylesheet \1-->'),
    (re.compile(r'<!DOCTYPE (.*?)>', re.S), r'<!-- DOCTYPE \1 -->'),
)
_DT_ON_RULES = (
    (re.compile(r'<!-- DOCTYPE (.*?) -->', re.S), r'<!DOCTYPE \1>'),
    (re.compile(r'<!--xml (.*?)-->', re.S), r'<?xml \1?>'),
    (re.compile(r'<!--xml-stylesheet (.*?)-->', re.S), r'<?xml-stylesheet \1?>'),
)


def dt(inFile, switch):
    ''' Comments out the DTD, XML and stylesheet declarations of a file, or
    switches them back on.

    inFile -- path of a UTF-8 file; it is rewritten in place
    switch -- 'off' turns <?xml ...?>, <?xml-stylesheet ...?> and
              <!DOCTYPE ...> into comments; 'on' turns those comments back
              into declarations; any other value rewrites the file unchanged

    The text of every declaration is kept, so 'off' followed by 'on'
    restores the original declarations.
    '''

    if switch == 'off':
        rules = _DT_OFF_RULES
    elif switch == 'on':
        rules = _DT_ON_RULES
    else:
        rules = ()

    infileobj = codecs.open(inFile, 'r', encoding='utf8')
    try:
        content = infileobj.read()
    finally:
        infileobj.close()

    for pattern, replacement in rules:
        content = pattern.sub(replacement, content)

    outfileobj = codecs.open(inFile, 'w', encoding='utf8')
    try:
        outfileobj.write(content)
    finally:
        outfileobj.close()
|
## --- |
|
|
|
def copy_tree(src, dst):
    ''' copy_tree() copies recursively a directory rooted at src into dst,
    ignoring svn folders. NB: Uses shutil.copyfile()

    dst is created when missing, together with any missing parent folders.
    One trailing '/' or '\\' is dropped from src and from dst. Files that
    already exist in dst are overwritten.
    '''

    if not os.path.isdir(dst):
        # makedirs: a missing parent folder must not abort the copy
        os.makedirs(dst, 0o777)

    # slicing (instead of indexing) keeps an empty path from raising IndexError
    if src[-1:] in ('/', '\\'):
        src = src[:-1]

    if dst[-1:] in ('/', '\\'):
        dst = dst[:-1]

    for f in os.listdir(src):
        source = src + '/' + f
        target = dst + '/' + f
        if not os.path.isdir(source):
            shutil.copyfile(source, target)
        elif '.svn' not in f:
            copy_tree(source, target)
|
|
|
## --- |
|
|
|
# ------------------ |
|
# Section 2: MODULES |
|
# ------------------ |
|
# Every call to each part of the process is executed by the following functions |
|
|
|
|
|
# --- XSLT pre transcoder |
|
# --- (step 0) |
|
# |
|
# Status: NOT IN USE |
|
# Date: 12/05/08 |
|
# Change system calls. |
|
# --- |
|
def mod_0_Xtrans(pck):
    ''' Step 0: runs the identity transform over one collection file.

    Reads ../data/DDB_TEI_XML/ddbdp.<pck>.xml and writes
    osx/xslt/output/ddbdp.<pck>.xml through lib/saxon.jar, using a shell
    command whose exit status is not inspected.
    '''

    l.info(LHEAD % 'Pre Transcoder starting')

    source_xml = "../data/DDB_TEI_XML/ddbdp.%s.xml" % pck
    target_xml = "osx/xslt/output/ddbdp.%s.xml" % pck
    command = ' '.join((
        'java -jar -Xmx1023m lib/saxon.jar -o',
        target_xml,
        source_xml,
        'osx/xslt/identity-transform.xsl',
    ))
    os.system(command)

    l.info(LHEAD % 'Pre Transcoder complete')
|
|
|
# --- |
|
|
|
# --- Transcoder |
|
# --- (step 1) |
|
# |
|
# Status: JAVA CALL |
|
# Date: 12/05/08 |
|
# --- |
|
def mod_1_trans(pck):
    ''' Step 1: runs the Java transcoder over one collection file.

    For pck == 'test' the test input is transcoded into
    ../data/tests/step-01-trans (created here); otherwise
    ../data/DDB_TEI_XML/ddbdp.<pck>.xml is transcoded into
    ../data/DDB_transcoded_XML/ddbdp.<pck>.xml.

    A non-zero exit status of the java command is logged as an error; the
    function still returns normally so the caller decides how to go on.
    '''

    l.info(LHEAD % 'Transcoder starting')

    if pck == 'test':
        trans_input = "../data/tests/test-input/ddbdp.p.test.xml"
        trans_output = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        os.mkdir('../data/tests/step-01-trans', 0o776)
    else:
        trans_input = "../data/DDB_TEI_XML/ddbdp."+pck+".xml"
        trans_output = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"

    status = os.system("java -cp lib/transcoder.jar:lib/xercesImpl.jar:lib/xml-apis.jar:lib/serializer.jar:lib/xalan.jar edu.unc.epidoc.transcoder.TransCoder -s "+trans_input+" -o "+trans_output)
    if status != 0:
        l.error("transcoder exited with status %s for %s" % (status, trans_input))

    l.info(LHEAD % 'Transcoder complete')
|
# --- |
|
|
|
# --- Splitter |
|
# --- (step 2) |
|
# |
|
# Status: JAVA CALL |
|
# Date: 12/05/08 |
|
# --- |
|
def mod_2_split(pck):
    ''' Step 2: splits a transcoded collection file with split.xsl.

    Creates the folder that receives the split files, then runs
    lib/saxon9.jar through the shell; whatever saxon writes to stderr is
    redirected to report_split_<pck>.txt in the current directory.
    '''

    l.info(LHEAD % 'Splitter starting')

    testing = (pck == 'test')

    if testing:
        os.mkdir('../data/tests/step-02-split', 0o776)
        split_input = "../data/tests/step-01-trans/ddbdp.p.test.xml"
        split_output = "../data/tests/step-02-split/ddbdp.p.test.xml"
        split_param = "ddbdp.p.test"
    else:
        split_input = "../data/DDB_transcoded_XML/ddbdp.%s.xml" % pck
        split_output = "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/ddbdp.%s.xml" % pck
        split_param = "ddbdp.%s" % pck
        os.mkdir('crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/%s' % pck, 0o776)

    command = (
        'java -jar -Xmx1023m lib/saxon9.jar -t -w1 -o %s %s '
        'crosswalker/DDbDP_to_EpiDoc/split/split.xsl "filename=%s" '
        '2> report_split_%s.txt'
    ) % (split_output, split_input, split_param, pck)
    os.system(command)

    l.info(LHEAD % 'Splitter complete')
|
# --- |
|
|
|
# --- Chetc |
|
# --- (step 3) |
|
# |
|
# Status: OK |
|
# Date: 12/05/08 |
|
# --- |
|
def _publish_chetc_reports(pck):
    ''' Copies the report files of one collection to the dated public folder.

    Must be called from inside the chetc-py folder. The public folder is
    /var/www/epiduke/reports/<curDate>/; when it cannot be created, an error
    is logged for each report that is waiting to be copied and nothing else
    happens. Otherwise any previously published copy is deleted and the
    current report, when there is one, is copied in its place.
    '''

    public_location = '/var/www/epiduke/reports/'+curDate+'/'

    # The splitter writes report_split_<pck>.txt into the directory it was
    # run from, not into chetc-py, so fall back to BASE_DIR for that one.
    split_name = 'report_split_'+pck+'.txt'
    split_source = split_name
    if not os.path.isfile(split_source):
        split_source = os.path.join(BASE_DIR, split_name)

    # (published file name, local file to copy from)
    reports = [
        (split_name, split_source),                                          #split report
        ('report_'+pck+'.txt', 'report_'+pck+'.txt'),                        #Safety Net report
        ('report_afterSafety_'+pck+'.txt', 'report_afterSafety_'+pck+'.txt'), #After Safety Net report
    ]

    if not os.path.isdir(public_location):
        try:
            os.mkdir(public_location, 0o776)
        except OSError:
            for name, source in reports:
                if os.path.isfile(source):
                    l.error("no public_location (%s), so could not copy %s" % (public_location, name))
            return

    for name, source in reports:
        published = public_location + name
        if os.path.isfile(published):
            os.unlink(published)
        if os.path.isfile(source):
            shutil.copyfile(source, published)


def mod_3_chetc(pck):
    ''' Step 3: runs the CHET-C converter over the split files of pck.

    Changes into crosswalker/DDbDP_to_EpiDoc/chetc-py, creates the output
    folder, calls chetwrap.walk() and publishes the report files. The
    working directory is set back to BASE_DIR even when a step fails.
    '''

    l.info(LHEAD % 'CHET-C starting')
    os.chdir("crosswalker/DDbDP_to_EpiDoc/chetc-py") # changes directory to chet-c

    try:
        if pck == 'test':
            chetc_input = "../../../../data/tests/step-02-split/p.test"
            chetc_output = "../../../../data/tests/step-03-chetc/p.test"
            os.mkdir('../../../../data/tests/step-03-chetc', 0o776)
        else:
            chetc_input = "../split/01-transcoded_xml_id/"+pck
            chetc_output = "output/"+pck
            os.mkdir('output/'+pck, 0o776)

        chetwrap.walk(chetc_input, chetc_output, 'ddbdp', pck)

        #Reports handling
        _publish_chetc_reports(pck)
    finally:
        # changes directory back to this program's root
        os.chdir(BASE_DIR)

    l.info(LHEAD % 'CHET-C complete')
|
# --- |
|
|
|
# --- XSLT cleanup |
|
# --- (step 4) |
|
# |
|
# Status: OK |
|
# Date: 12/05/08 |
|
# --- |
|
def mod_4_cleanup(pck, g):
    ''' Step 4: applies xslt_cleanup_s01.xsl to the CHET-C output of pck.

    g == 'y' means the Greek number converter runs afterwards, so the result
    goes to an intermediate folder (xslt_cleanup/output/<pck>/, or
    step-04-cleanup for the test collection). Any other value sends the
    result straight to the final EpiDoc folder (test-output for the test
    collection).
    '''

    l.info(LHEAD % 'XSLT cleanup starting')

    # the stylesheet name and the lib level given to walk() are relative
    # to the xslt-cleanup folder
    os.chdir("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup")

    testing = (pck == 'test')
    gnc_follows = (g == 'y')

    if testing and gnc_follows:
        clean_input = "../../../../data/tests/step-03-chetc/p.test/"
        os.mkdir('../../../../data/tests/step-04-cleanup', 0o776)
        clean_output = "../../../../data/tests/step-04-cleanup/p.test"
    elif testing:
        clean_input = "../../../../data/tests/step-03-chetc/p.test/"
        clean_output = "../../../../data/tests/test-output/p.test"
        os.mkdir('../../../../data/tests/test-output', 0o776)
    elif gnc_follows:
        clean_input = "../chetc-py/output/%s/" % pck
        clean_output = "output/%s/" % pck
        for folder, to_create in (("output", "output/"), (clean_output, clean_output)):
            if not os.path.isdir(folder):
                os.mkdir(to_create, 0o776)
    else:
        clean_input = "../chetc-py/output/%s/" % pck
        clean_output = "../../../../data/DDB_EpiDoc_XML/%s/" % pck

    walk(clean_input, clean_output, 'xslt_cleanup_s01.xsl', 3, 'xsl2', 'xml', None)

    os.chdir(BASE_DIR) # back to this program's root

    l.info(LHEAD % 'XSLT cleanup complete')
|
# --- |
|
|
|
# --- Greek Number Converter (GNC) |
|
# --- (step 5) |
|
# |
|
# Status: OK |
|
# Date: 12/05/08 |
|
# --- |
|
def mod_5_gnc(pck):
    ''' Step 5: runs the Greek number converter over the cleanup output.

    The DTD, XML and stylesheet declarations of the input files are
    commented out for the transformation and switched back on afterwards.
    They are switched back on, and the working directory is set back to
    BASE_DIR, even when the transformation fails, so a failed run does not
    leave the input files altered.
    '''

    l.info(LHEAD % 'Greek number converter starting')

    os.chdir('greek_num_converter') # changes directory to GNC

    try:
        if pck == 'test':
            gnc_input = "../../data/tests/step-04-cleanup/p.test/"
            gnc_output = "../../data/tests/test-output/p.test/"
        else:
            gnc_input = "../crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck+"/"
            gnc_output = "../../data/DDB_EpiDoc_XML/"+pck+"/"

        try:
            walk_dt(gnc_input, 'off')
            walk(gnc_input, gnc_output, 'grc_num_converter.xsl', 1, 'xsl2', 'xml', None)
        finally:
            walk_dt(gnc_input, 'on')
    finally:
        os.chdir(BASE_DIR) # changes directory back to this program's root

    l.info(LHEAD % 'Greek Number Converter complete')
|
|
|
# --- HTML transformation and publication handling |
|
# --- (step 6) |
|
# |
|
# Status: OK |
|
# Date: 12/05/08 |
|
# --- |
|
def mod_6_html(pck, h):
    ''' Step 6: produces the HTML version of pck and optionally publishes it.

    h -- 'y' transforms the EpiDoc files to HTML with start-edition.xsl;
         'yw' does the same and then copies HTML and XML to
         /var/www/epiduke/<curDate>/ (plus, for real collections, the XML to
         /usr/local/epiduke/data/DDB_EpiDoc_XML/<pck>/); any value that does
         not start with 'y' only removes the old test HTML, if pck is 'test'.

    The per-collection folders are copied for real collections only; the
    test collection publishes its test-html and test-output folders instead.
    The working directory is set back to BASE_DIR even when a step fails.
    '''

    l.info(LHEAD % 'Publication starting')

    is_test = (pck == 'test')

    if is_test:
        shutil.rmtree('../data/tests/test-html', 1)
    elif h == 'y':
        shutil.rmtree("../data/tests/run-html/"+pck, 1)

    if h.startswith('y'):

        l.info ('starting HTML transformations')

        os.chdir('epidoc_xslt') # changes directory to epidoc_xslt

        try:
            if is_test:
                html_input = "../../data/tests/test-output/p.test/"
                html_output = "../../data/tests/test-html/p.test/"
                os.mkdir('../../data/tests/test-html', 0o776)
            else:
                html_input = "../../data/DDB_EpiDoc_XML/"+pck+"/"
                html_output = "../../data/tests/run-html/"+pck+"/"

            walk(html_input, html_output, 'start-edition.xsl', 1, 'xsl1', 'html', None)

            l.info('HTML transformations complete')

            if h == 'yw':

                l.info ('file copy process starting')

                dateDir = '/var/www/epiduke/'+curDate
                htmlDir = dateDir+'/html'
                xmlDir = dateDir+'/xml'
                for folder in (dateDir, htmlDir, xmlDir):
                    if not os.path.isdir(folder):
                        os.mkdir(folder, 0o776)

                if is_test:
                    copy_tree('../../data/tests/test-html', htmlDir)
                    copy_tree('../../data/tests/test-output', xmlDir)
                else:
                    copy_tree('../../data/tests/run-html/'+pck+'/', htmlDir+'/'+pck+'/')
                    copy_tree('../../data/DDB_EpiDoc_XML/'+pck+'/', xmlDir+'/'+pck+'/')

                    #copies collection into batch5 and data for whole process running
                    shutil.rmtree("/usr/local/epiduke/data/DDB_EpiDoc_XML/"+pck, 1)
                    copy_tree('../../data/DDB_EpiDoc_XML/'+pck+'/', "/usr/local/epiduke/data/DDB_EpiDoc_XML/"+pck+"/")

                l.info ('file copy process complete')

                # Copy HTML translation into the current dated directory
                copy_tree('/usr/local/epiduke/data/tests/run-html/trans/', htmlDir+'/trans/')
        finally:
            os.chdir(BASE_DIR) # changes directory back to this program's root

    l.info (LHEAD % 'Publication complete')
|
# --- |
|
|
|
|
|
# ------------------------ |
|
# Section 3: MAIN FUNCTION |
|
# ------------------------ |
|
# Calls all the modules requested |
|
|
|
def _remove_file(path):
    ''' Deletes path; a failure is logged as a warning instead of raised. '''
    try:
        os.unlink(path)
    except os.error:
        l.warning("failed: os.unlink('%s')" % path)


def run(pck, to, fro, mods, keep):
    ''' Runs the requested modules, in pipeline order, for one collection.

    pck  -- collection name, or 'test' for the test data set
    to   -- 1: mods[0] names the last module to run (all earlier ones run too)
    fro  -- 1: mods[0] names the first module to run (all later ones run too)
    mods -- list of module names, or ['all']
    keep -- 0 removes the intermediate files once all modules have run

    Before a module runs, the output of its previous run is removed. When
    'htmlw' is selected 'html' is dropped, because 'htmlw' already performs
    the HTML transformation.
    '''

    if keep == 0:
        l.info('Intermediate files will be removed at the end of the process')

    #Modules, in the order in which they run
    names = ["transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw"]
    active = set()
    requested = mods[0]

    #Determine active modules and activates them.
    # "Up to": the named module and all preceding ones
    if to == 1 and requested in names:
        active.update(names[:names.index(requested) + 1])
    # "From": the named module and all following ones
    if fro == 1:
        if requested in names:
            active.update(names[names.index(requested):])
    # One or more (also reached for "up to", where it changes nothing for a
    # module name and selects everything for 'all')
    elif requested == 'all':
        active.update(names)
    else:
        active.update(given for given in mods if given in names)

    #If htmlw is on, html is not included in the process
    if 'htmlw' in active:
        active.discard('html')

    is_test = (pck == 'test')
    transcoded_xml = "../data/DDB_transcoded_XML/ddbdp."+pck+".xml"
    if is_test:
        trans_dir = "../data/tests/step-01-trans"
        split_dir = "../data/tests/step-02-split"
        chetc_dir = "../data/tests/step-03-chetc"
        html_dir = "../data/tests/test-html"
    else:
        trans_dir = None
        split_dir = "crosswalker/DDbDP_to_EpiDoc/split/01-transcoded_xml_id/"+pck
        chetc_dir = "crosswalker/DDbDP_to_EpiDoc/chetc-py/output/"+pck
        html_dir = "../data/tests/run-html/"+pck

    #Run modules

    #TRANSCODER
    if 'transcoder' in active:
        if is_test:
            shutil.rmtree(trans_dir, True)
        else:
            _remove_file(transcoded_xml)
        mod_1_trans(pck)

    #SPLITTER
    if 'splitter' in active:
        shutil.rmtree(split_dir, True)
        mod_2_split(pck)

    #CHETC
    if 'chetc' in active:
        shutil.rmtree(chetc_dir, True)
        mod_3_chetc(pck)

    #CLEANUP
    if 'cleanup' in active:
        if is_test:
            shutil.rmtree("../data/tests/test-output", True)
            shutil.rmtree("../data/tests/step-04-cleanup", True) #used when GNC is on
        else:
            shutil.rmtree("../data/DDB_EpiDoc_XML/"+pck, True)
        #if gnc is on, cleanup writes to an intermediate folder
        if 'gnc' in active:
            g = 'y'
            shutil.rmtree("crosswalker/DDbDP_to_EpiDoc/xslt_cleanup/output/"+pck, True)
        else:
            g = 'n'
        mod_4_cleanup(pck, g)

    #GNC
    if 'gnc' in active:
        mod_5_gnc(pck)

    #HTML
    if 'html' in active:
        shutil.rmtree(html_dir, True)
        mod_6_html(pck, 'y')

    #HTMLW
    if 'htmlw' in active:
        if is_test:
            shutil.rmtree(html_dir, True)
        mod_6_html(pck, 'yw')

    # If intermediate files must be removed...
    if keep == 0:
        if is_test:
            shutil.rmtree(trans_dir, True)
        else:
            _remove_file(transcoded_xml)
            # only the (currently disabled) pre transcoder writes this file,
            # so its absence is not worth a warning
            pre_transcoded_xml = "osx/xslt/output/ddbdp."+pck+".xml"
            if os.path.isfile(pre_transcoded_xml):
                _remove_file(pre_transcoded_xml)

        shutil.rmtree(split_dir, True)
        shutil.rmtree(chetc_dir, True)
        shutil.rmtree(html_dir, True)

    #Change permissions on unix machines
    #Change group to epiduke and make the files group writable.

    #~ os.system("chgrp -R -f epiduke /usr/local/epiduke/*")
    #~ os.system("chgrp -R -f epiduke /var/www/epiduke/*")
    #~ os.system("chmod -R -f g+w /usr/local/epiduke/*")
    #~ os.system("chmod -R -f g+w /var/www/epiduke/*")
    #COMMENTED OUT AND MOVED TO RUNNER.SH (RV 14/05/08)

    l.info (LHEAD % ('Runner complete for collection: ' + pck))
|
|
|
# --- |
|
# --- |
|
|
|
# --------------------- |
|
# Section 0: runner.py |
|
# --------------------- |
|
# Manages the options and calls run() |
|
# To do: |
|
# - Subversion? |
|
# - Log |
|
|
|
if __name__ == "__main__":

    #configure logger
    l.basicConfig(level=l.DEBUG)

    #BASE DIRECTORY (the module functions change back to it after each step)
    BASE_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
    l.info ("BASE_DIR is %s" % BASE_DIR)

    #INPUT:
    root_input = '../data/DDB_TEI_XML/'

    #MODULES DICTIONARY
    mod_list = [ "transcoder", "splitter", "chetc", "cleanup", "gnc", "html", "htmlw" ]

    #CURRENT DATE (names the dated publication folders)
    curDate = str(datetime.date.today())

    #USAGE
    usage = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
Specify one collection or a list of collections separated by commas.\n
Specify 'to' and ONE module to run the process up to that module
or specify a list of modules separated by commas.\n
Specify -r to remove intermediate files at the end of the process\n
State the port, it must begin with '999' and be a number\n
python runner.py [help | h | ?] of for further information\n\n
'''

    #HELP
    help_text = '''\nRunner usage:\npython runner.py [col[,col] | all] [[to | from] mod | [mod[,mod] | all] [-r] [port]\n
Specify one collection or a list of collections separated by commas.\n
Specify 'to' and ONE module to run the process up to that module
or specify a list of modules separated by commas.\n
Specify -r to remove intermediate files at the end of the process\n
State the port, it must begin with '999' and be a number\n
python runner.py [help | h | ?] of for this help\n
Available modules:\n
transcoder Runs the transcoder (Java)
splitter Runs splitter (XLST)
chetc Runs CHET-C converter (Python)
cleanup Runs CHET-C cleanup (XSLT)
gnc Runs Greek Number Converter (XSLT)
html Runs the HTML output (XSLT)
htmlw Runs the HTML output and copies XML\n
To run one or more modules:
python runner.py [pck[,pck] | all] [mod[,mod]] [port]\n
To run the process up to a module:
python runner.py [pck[,pck] | all] to [mod] [port]\n
To run the process starting from a module:
python runner.py [pck[,pck] | all] from [mod] [port]\n
To run the complete process:
python runner.py [pck[,pck] | all] all [port]\n
'''

    # Checking arguments... (the length is tested first so that a call
    # without arguments prints the usage instead of failing)
    if len(sys.argv) > 1 and sys.argv[1] in ('help', 'h', '?'):
        print(help_text)
        sys.exit()

    if len(sys.argv) > 6 or len(sys.argv) < 3:
        print(usage)
        sys.exit()

    #(PORT) Managing the last argument
    PORT = sys.argv[-1]
    if not (PORT.startswith('999') and PORT.isdigit() and int(PORT) <= 65535):
        print(usage)
        sys.exit()

    PORT = int(PORT)

    #COLLECTIONS LIST
    pck_list = []
    for root, dirs, files in os.walk(root_input):
        for f in files:
            m = re.search(r'^ddbdp\.(.*?)\.xml$', f)
            if m:
                pck_list.append(m.group(1))

    if len(pck_list) == 0:
        l.critical("Runner cannot find the collections.\nPlease check that the collections are in %s" % root_input)
        sys.exit()

    #(COLLECTIONS) Managing the first argument
    COLLECTIONS = sys.argv[1]
    given_col_list = COLLECTIONS.split(',')

    #Checking spelling
    l.info('Checking Collections...')
    for g in given_col_list:
        if g == 'all':
            if len(given_col_list) == 1:
                l.info('Runner is about to process all the collections...')
            else:
                l.critical("Runner cannot determine the collections to be processed.\nReceived: %s" % given_col_list)
                sys.exit()
        elif g == 'test':
            l.info ("%s collection found..." % g)
        elif g not in pck_list:
            l.critical ("%s is not present" % g)
            sys.exit()
        else:
            l.info ("%s collection found..." % g)

    #(MODULES) Managing the second and third arguments

    # Checking if the process has to be run "up to" or "from" a certain module
    # or if it has to run certain modules only.
    to = 0
    fro = 0
    direction = sys.argv[2].lower()
    if direction in ('to', 'from'):
        if direction == 'to':
            to = 1
        else:
            fro = 1
        # script, collections, to|from, module, [-r], port
        if len(sys.argv) in (5, 6):
            module_index = 3
        else:
            l.critical("Runner cannot find the modules specified because '%s' was specified, but the number of arguments was unexpected" % direction)
            sys.exit()
    else:
        module_index = 2
    MODULES = sys.argv[module_index]

    given_mod_list = MODULES.split(',')

    # Checking Spelling
    l.info("Checking modules")
    if (to == 1 or fro == 1) and len(given_mod_list) > 1:
        l.critical("Runner cannot determine the modules to be processed.\nReceived: %s" % given_mod_list)
        sys.exit()
    for g in given_mod_list:
        if g == 'all':
            if len(given_mod_list) == 1:
                l.info ('Runner is about to run the full process...')
            else:
                l.critical( "Runner cannot determine the modules to be processed.\nReceived: %s" % given_mod_list)
                sys.exit()
        elif g not in mod_list:
            l.critical("%s is not a module." % g)
            sys.exit()
        else:
            l.info( "%s module found..." % g)

    #(INTERMEDIATE) Managing the optional argument just before the port
    # if not specified, keep the intermediate files
    keep = 1
    INTERMED = ''
    flag_index = len(sys.argv) - 2
    if flag_index > module_index:
        INTERMED = sys.argv[flag_index]

    #removes the intermediate files if requested
    if INTERMED == '-r':
        keep = 0

    #CALLING THE MAIN FUNCTION

    # if the list of collections given is 'all', run all the collections retrieved in '../data/DDB_TEI_XML/' (stored in pck_list)
    if given_col_list[0] == 'all':
        given_col_list = pck_list
        l.info ("all collections (%s in number) will be processed" % len(given_col_list))

    argtup = (','.join(given_col_list), to, fro, ','.join(given_mod_list),keep)
    l.debug("arguments: to runner.py as follows:\n\tgiven_col_list = %s\n\tto = %s\n\tfro = %s\n\tgiven_mod_list = %s\n\tkeep = %s" % argtup)

    for pck in given_col_list:
        run(pck, to, fro, given_mod_list, keep)
|
|
|
|
|
|
|
|
|
# ---- |
|
# END |
|
# ---- |