Skip to content

Instantly share code, notes, and snippets.

@zimeon
Created October 7, 2014 01:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zimeon/fc1c2b3db3f0e7febc86 to your computer and use it in GitHub Desktop.
Save zimeon/fc1c2b3db3f0e7febc86 to your computer and use it in GitHub Desktop.
Code for arXiv resync experiment
#!/usr/bin/env python
"""Update script for arXiv experiment on resync.library.cornell.edu
It is assumed that the files for abs and src will have been
updated before the script is run. The sequence is:
- build a resourcelist in /arxiv/YYYY-MM-DD/resourcelist.xml etc
- build a changelist in /arxiv/YYYY-MM-DD/changelist.xml etc
- copy resourcelist and changelist to /arxiv/resourcelist.xml,
  /arxiv/changelist.xml etc
Everything is logged to /arxiv/YYYY-MM-DD/log.txt
Designed to be run Monday through Friday in the early hours of the morning.
"""
import os.path
import datetime
import subprocess
import sys
import logging
import optparse
import ConfigParser
def run(cmd, shell=False):
    """Run a command with its stderr redirected to stdout.

    Arguments:
    cmd -- the command: an argument list, or a single string when
        shell is True
    shell -- passed straight through to subprocess.call; when True the
        command string is interpreted by the shell

    Returns the exit status reported by subprocess.call so that callers
    can detect failures if they wish (existing callers ignore it).
    """
    # stderr=subprocess.STDOUT makes the child's error output follow its
    # standard output, which is what this helper has always documented.
    return subprocess.call(cmd, shell=shell, stderr=subprocess.STDOUT)
# Get name of config file to read
p = optparse.OptionParser(
    description='daily_update.py script',
    usage='usage: %prog --config=config_file',
    add_help_option=True)
p.add_option('--config', type=str, action='store',
             help='Mandatory name of config file')
(opts, args) = p.parse_args()
if opts.config is None:
    # Report a usage mistake through the option parser (usage line plus
    # message, exit status 2) rather than with an exception traceback.
    p.error("Must specify --config")

##### config
config = ConfigParser.ConfigParser()
# read() skips files it cannot open and returns the list of files it
# actually parsed; an empty list means the config file was not read.
if not config.read(opts.config):
    p.error("Cannot read config file %s" % opts.config)
mydir = config.get('resync', 'mydir')
client = config.get('resync', 'client')
paths = config.get('resync', 'paths')
capsdir = config.get('resync', 'capsdir')
capsurl = config.get('resync', 'capsurl')
# split() with no argument splits on runs of whitespace and never yields
# empty strings, so an empty value or doubled spaces in the config file
# do not turn into empty command-line arguments later on.
mapping = config.get('resync', 'mapping').split()
exclude = config.get('resync', 'exclude').split()

# Log to stdout with a timestamp in front of every message
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)-15s %(message)s',
                    stream=sys.stdout)
##### Work out today's date and find the dated directories of earlier runs
today = datetime.date.today().isoformat()
logging.info("Today is %s" % today)

# Step back one day at a time over the last 100 days. A day counts as an
# earlier run when capsdir holds a directory named with its ISO date and
# that directory contains a resourcelist.xml file.
start = datetime.date.today()
archive_dirs = []
previous = None
for days_back in range(1, 101):
    candidate = (start - datetime.timedelta(days_back)).isoformat()
    candidate_dir = os.path.join(capsdir, candidate)
    if not os.path.isdir(candidate_dir):
        continue
    if not os.path.isfile(os.path.join(candidate_dir, "resourcelist.xml")):
        continue
    archive_dirs.append(candidate)
    if previous is None:
        # The first hit while walking backwards is the most recent run
        previous = candidate
logging.info("Previous run: %s" % previous)
logging.info("Archives: %s" % ' '.join(archive_dirs))
##### Capabilities are collected in capl as each document is produced
capabilitlylist_link = capsurl + '/capabilitylist.xml'
capl = {}

##### Build a resourcelist in /arxiv/YYYY-MM-DD/resourcelist.xml etc
# Today's dated directory holds the resourcelist and the client log file;
# create it if this is the first run of the day.
today_dir = os.path.join(capsdir, today)
if not os.path.isdir(today_dir):
    os.mkdir(today_dir)
resourcelist_filename = os.path.join(today_dir, 'resourcelist.xml')
logfile = os.path.join(today_dir, 'log.txt')

# Client invocation: fixed options first, then one --exclude pair per
# exclude entry, then the mapping arguments at the end.
cmd = [client, '-v',
       '--logger', '--logfile', logfile,
       '--checksum',
       '--resourcelist',
       '--outfile', resourcelist_filename,
       '--paths', paths,
       '--capabilitylist-link', capabilitlylist_link]
for pattern in exclude:
    cmd.extend(['--exclude', pattern])
cmd.extend(mapping)
logging.info("Writing resourcelist with:" + ' '.join(cmd))
run(cmd)
capl['resourcelist'] = capsurl + '/resourcelist.xml'
##### Build a changelist in capsurl/YYYY-MM-DD/changelist.xml etc
# The changelist URL is recorded whether or not a previous run was found;
# the changelist itself is only written when there is a previous
# resourcelist to compare against.
capl['changelist'] = capsurl + '/changelist.xml'
if previous is not None:
    previous_resourcelist_filename = os.path.join(
        capsdir, previous, 'resourcelist.xml')
    changelist_filename = os.path.join(capsdir, today, 'changelist.xml')
    # Compare the previous run's resourcelist (--reference) with the one
    # just written for today (--newreference).
    cmd = [client, '-v',
           '--logger', '--logfile', logfile,
           '--checksum',
           '--changelist',
           '--reference', previous_resourcelist_filename,
           '--newreference', resourcelist_filename,
           '--outfile', changelist_filename,
           '--capabilitylist-link', capabilitlylist_link]
    cmd.extend(mapping)
    logging.info("Writing changelist with:" + ' '.join(cmd))
    run(cmd)
##### Copy today's resourcelist and changelist into the root of capsdir,
##### i.e. /arxiv/resourcelist.xml, /arxiv/changelist.xml etc
logging.info("Copying files into root also")
# Commands are given as argument lists and run without a shell, so paths
# containing spaces or shell metacharacters reach rm/cp unchanged.
run(['rm', os.path.join(capsdir, 'resourcelist.xml')])
run(['cp', '-p', os.path.join(capsdir, today, 'resourcelist.xml'), capsdir])
# The root changelist is always removed; it is only replaced when a
# previous run exists.
run(['rm', os.path.join(capsdir, 'changelist.xml')])
if previous is not None:
    run(['cp', '-p', os.path.join(capsdir, today, 'changelist.xml'), capsdir])
##### Write the capability list from the capabilities recorded in capl
# Each capability becomes "name=url"; the client takes them as a single
# comma-separated argument after --capabilitylist.
caplstrs = ['='.join(pair) for pair in capl.items()]
capls = ','.join(caplstrs)
capl_filename = os.path.join(capsdir, 'capabilitylist.xml')
cmd = [client, '-v',
       '--capabilitylist', capls,
       '--outfile', capl_filename,
       '--describedby', 'http://resync.library.cornell.edu/',
       '--sourcedescription-link',
       'http://resync.library.cornell.edu/.well-known/resourcesync']
run(cmd)
logging.info("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment