Created
October 7, 2014 01:04
-
-
Save zimeon/fc1c2b3db3f0e7febc86 to your computer and use it in GitHub Desktop.
Code for arXiv resync experiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Update script for arXiv experiment on resync.library.cornell.edu | |
It is assumed that the files for abs and src will have been | |
updated before the script is run. Then the sequence is: | |
- build a resourcelist in /arxiv/YYYY-MM-DD/resourcelist.xml etc | |
- build a changelist in /arxiv/YYYY-MM-DD/changelist.xml etc | |
- copy resourcelist and changelist to /arxiv/resourcelist.xml, /arxiv/changelist.xml etc | |
Everything is logged to /arxiv/YYYY-MM-DD/log.txt | |
Designed to be run Monday through Friday in the early hours of the morning. | |
""" | |
import os.path | |
import datetime | |
import subprocess | |
import sys | |
import logging | |
import optparse | |
import ConfigParser | |
def run(cmd, shell=False):
    """Run a command with its stderr redirected to its stdout.

    Args:
        cmd: argument list, or a single command string when shell is True.
        shell: passed straight through to subprocess.call.

    Returns:
        The exit status of the command, as returned by subprocess.call.
    """
    # stderr=subprocess.STDOUT is what actually performs the redirect that
    # this function has always documented.
    return subprocess.call(cmd, shell=shell, stderr=subprocess.STDOUT)
# Get name of config file to read
p = optparse.OptionParser(
    description='daily_update.py script',
    usage='usage: %prog --config=config_file',
    add_help_option=True)
p.add_option('--config', type=str, action='store',
             help='Mandatory name of config file')
(opts, args) = p.parse_args()
if opts.config is None:
    # A missing mandatory option is a usage error: p.error() prints the usage
    # line and this message to stderr and exits with status 2, rather than
    # dumping a traceback from a generic Exception.
    p.error("Must specify --config")
##### config
config = ConfigParser.ConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# did parse; fail here with a clear message instead of a later NoSectionError.
if not config.read(opts.config):
    raise IOError("Cannot read config file %s" % opts.config)
mydir = config.get('resync', 'mydir')
client = config.get('resync', 'client')
paths = config.get('resync', 'paths')
capsdir = config.get('resync', 'capsdir')
capsurl = config.get('resync', 'capsurl')
# mapping and exclude are whitespace-separated lists. split() with no argument
# drops empty items, so an empty setting or doubled spaces do not turn into
# empty-string command line arguments (e.g. a stray "--exclude ''").
mapping = config.get('resync', 'mapping').split()
exclude = config.get('resync', 'exclude').split()
# Log to stdout with a timestamp on each line
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)-15s %(message)s',
                    stream=sys.stdout)
##### Find today and date of files from last run
# Read the clock once so that `today` and the backwards scan start from the
# same date even if the script is started right around midnight.
d = datetime.date.today()
today = d.isoformat()
logging.info("Today is %s" % (today))
archive_dirs = []   # earlier dates (newest first) that have a resourcelist.xml
previous = None     # the most recent of those dates, or None if there is none
# Step back one day at a time over the preceding 100 days
for _ in range(100):
    d -= datetime.timedelta(1)
    tprevious = d.isoformat()
    # isfile() can only succeed if the dated directory itself exists
    if os.path.isfile(os.path.join(capsdir, tprevious, "resourcelist.xml")):
        archive_dirs.append(tprevious)
        if previous is None:
            # get just the first one back
            previous = tprevious
logging.info("Previous run: %s" % (previous))
logging.info("Archives: %s" % (' '.join(archive_dirs)))
##### Record capabilities as we go
capabilitlylist_link = capsurl + '/capabilitylist.xml'
capl = {}   # capability name -> URL, filled in as each document is produced

##### Build a resourcelist in capsdir/YYYY-MM-DD/resourcelist.xml etc
today_dir = os.path.join(capsdir, today)
if not os.path.isdir(today_dir):
    os.mkdir(today_dir)
resourcelist_filename = os.path.join(today_dir, 'resourcelist.xml')
logfile = os.path.join(today_dir, 'log.txt')

# Assemble the client command line piece by piece
cmd = [client, '-v']
cmd += ['--logger', '--logfile', logfile]
cmd += ['--checksum', '--resourcelist']
cmd += ['--outfile', resourcelist_filename]
cmd += ['--paths', paths]
cmd += ['--capabilitylist-link', capabilitlylist_link]
# One --exclude option per configured exclude entry
for pattern in exclude:
    cmd.extend(['--exclude', pattern])
# The mapping entries go last, as bare arguments
cmd.extend(mapping)

logging.info("Writing resourcelist with:" + ' '.join(cmd))
run(cmd)
capl['resourcelist'] = capsurl + '/resourcelist.xml'
##### Build a changelist in capsdir/YYYY-MM-DD/changelist.xml etc
# NOTE(review): the changelist capability is recorded even when there is no
# previous run and therefore no changelist is written below -- confirm that
# this is intended.
capl['changelist'] = capsurl + '/changelist.xml'
if previous is not None:
    # Compare the previous run's resourcelist with the one for today
    previous_resourcelist_filename = os.path.join(
        capsdir, previous, 'resourcelist.xml')
    changelist_filename = os.path.join(capsdir, today, 'changelist.xml')
    cmd = [client, '-v']
    cmd += ['--logger', '--logfile', logfile]
    cmd += ['--checksum', '--changelist']
    cmd += ['--reference', previous_resourcelist_filename]
    cmd += ['--newreference', resourcelist_filename]
    cmd += ['--outfile', changelist_filename]
    cmd += ['--capabilitylist-link', capabilitlylist_link]
    # The mapping entries go last, as bare arguments
    cmd.extend(mapping)
    logging.info("Writing changelist with:" + ' '.join(cmd))
    run(cmd)
##### Copy resourcelistindexes for resourcelist and changelist to
##### /arxiv/resourcelist.xml, /arxiv/changelist.xml etc
logging.info("Copying files into root also")
# Commands are given as argument lists and run without a shell, so a capsdir
# containing spaces or shell metacharacters is passed through literally
# instead of being re-parsed by the shell.
run(['rm', os.path.join(capsdir, 'resourcelist.xml')])
run(['cp', '-p', os.path.join(capsdir, today, 'resourcelist.xml'), capsdir])
# The root changelist is always removed, but only replaced when there was a
# previous run to compare against.
run(['rm', os.path.join(capsdir, 'changelist.xml')])
if previous is not None:
    run(['cp', '-p', os.path.join(capsdir, today, 'changelist.xml'), capsdir])
##### Write out capabilities
# Serialise the recorded capabilities as name=url pairs joined by commas
caplstrs = [name + '=' + capl[name] for name in capl]
capls = ','.join(caplstrs)
capl_filename = os.path.join(capsdir, 'capabilitylist.xml')
cmd = [client, '-v']
cmd += ['--capabilitylist', capls]
cmd += ['--outfile', capl_filename]
cmd += ['--describedby', 'http://resync.library.cornell.edu/']
cmd += ['--sourcedescription-link',
        'http://resync.library.cornell.edu/.well-known/resourcesync']
run(cmd)
logging.info("Done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment