Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code for arXiv resync experiment
#!/usr/bin/env python
"""Update script for arXiv experiment on resync.library.cornell.edu
It is assumed that the files for abs and src will have been
updated before the script is run. The sequence is:
- build a resourcelist in /arxiv/YYYY-MM-DD/resourcelist.xml etc
- build a changelist in /arxiv/YYYY-MM-DD/changelist.xml etc
- copy resourcelist and changelist to /arxiv/resourcelist.xml,
  /arxiv/changelist.xml etc
Everything is logged to /arxiv/YYYY-MM-DD/log.txt
Designed to be run Monday through Friday in the early hours of the morning.
"""
import os.path
import datetime
import subprocess
import sys
import logging
import optparse
import ConfigParser
def run(cmd, shell=False):
    """Run cmd with subprocess.call, sending its stderr to stdout.

    cmd is an argument list, or a single command string when shell is True.
    The child's stderr is redirected to its stdout so that everything the
    command prints ends up in one stream.

    Returns the exit status reported by subprocess.call, so callers may
    check for failure; callers that ignore the return value are unaffected.
    """
    return subprocess.call(cmd, shell=shell, stderr=subprocess.STDOUT)
# Get name of config file to read from the command line
p = optparse.OptionParser(
    description='daily_update.py script',
    usage='usage: %prog --config=config_file',
    add_help_option=True)
p.add_option('--config', type=str, action='store',
             help='Mandatory name of config file')
(opts, args) = p.parse_args()
if opts.config is None:
    # --config is mandatory. p.error() prints the usage line and this
    # message to stderr and exits with status 2, rather than showing a
    # traceback for what is a command-line mistake.
    p.error("Must specify --config")
##### config
# All settings come from the [resync] section of the file named by --config.
config = ConfigParser.ConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed; check it so a bad path is reported as such instead of as a
# missing [resync] section on the first get() below.
if not config.read(opts.config):
    raise IOError("Cannot read config file %s" % (opts.config))
mydir = config.get('resync', 'mydir')
client = config.get('resync', 'client')
paths = config.get('resync', 'paths')
capsdir = config.get('resync', 'capsdir')
capsurl = config.get('resync', 'capsurl')
# mapping and exclude are whitespace-separated lists. split() with no argument
# yields [] for an empty value and ignores repeated spaces, whereas split(' ')
# would produce empty strings that later become empty command-line arguments.
mapping = config.get('resync', 'mapping').split()
exclude = config.get('resync', 'exclude').split()
# Log to stdout with a timestamp prefix on every message
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s', stream=sys.stdout)
##### Find today and date of files from last run
today = datetime.date.today().isoformat()
logging.info("Today is %s" % (today))
# Step back one day at a time over the 100 days before today, newest first,
# keeping every date whose directory under capsdir holds a resourcelist.xml.
d = datetime.date.today()
one_day = datetime.timedelta(1)
archive_dirs = []
for n in range(0, 100):
    d = d - one_day
    tprevious = d.isoformat()
    candidate_dir = os.path.join(capsdir, tprevious)
    if not os.path.isdir(candidate_dir):
        continue
    if not os.path.isfile(os.path.join(candidate_dir, "resourcelist.xml")):
        continue
    archive_dirs.append(tprevious)
# archive_dirs is ordered newest first, so its first entry is the most recent
# earlier run; previous stays None when no earlier run was found.
previous = archive_dirs[0] if archive_dirs else None
logging.info("Previous run: %s" % (previous))
logging.info("Archives: %s" % (' '.join(archive_dirs)))
##### Record capabilities as we go
# capl maps capability name -> URI; it is filled in as each document is built
capabilitlylist_link = capsurl + '/capabilitylist.xml'
capl = {}
##### Build a resourcelist in /arxiv/YYYY-MM-DD/resourcelist.xml etc
# Today's output directory is created on demand; the resourcelist and the
# client's log file both live inside it.
today_dir = os.path.join(capsdir, today)
if not os.path.isdir(today_dir):
    os.mkdir(today_dir)
resourcelist_filename = os.path.join(today_dir, 'resourcelist.xml')
logfile = os.path.join(today_dir, 'log.txt')
cmd = [client, '-v']
cmd += ['--logger', '--logfile', logfile]
cmd += ['--checksum', '--resourcelist']
cmd += ['--outfile', resourcelist_filename]
cmd += ['--paths', paths]
cmd += ['--capabilitylist-link', capabilitlylist_link]
# One --exclude option per configured pattern, then the mapping arguments
for e in exclude:
    cmd.extend(['--exclude', e])
cmd.extend(mapping)
logging.info("Writing resourcelist with:" + ' '.join(cmd))
run(cmd)
capl['resourcelist'] = capsurl + '/resourcelist.xml'
##### Build a changelist in capsdir/YYYY-MM-DD/changelist.xml etc
# NOTE(review): the changelist capability is recorded even when no previous
# run exists and therefore no changelist is written - confirm this is intended.
capl['changelist'] = capsurl + '/changelist.xml'
if previous is not None:
    # Compare the previous run's resourcelist against the one just written
    previous_resourcelist_filename = os.path.join(capsdir, previous, 'resourcelist.xml')
    changelist_filename = os.path.join(capsdir, today, 'changelist.xml')
    cmd = [client, '-v']
    cmd += ['--logger', '--logfile', logfile]
    cmd += ['--checksum', '--changelist']
    cmd += ['--reference', previous_resourcelist_filename]
    cmd += ['--newreference', resourcelist_filename]
    cmd += ['--outfile', changelist_filename]
    cmd += ['--capabilitylist-link', capabilitlylist_link]
    cmd.extend(mapping)
    logging.info("Writing changelist with:" + ' '.join(cmd))
    run(cmd)
##### Copy today's resourcelist and changelist to the top level of capsdir,
##### i.e. /arxiv/resourcelist.xml, /arxiv/changelist.xml etc
logging.info("Copying files into root also")
# Commands are given as argument lists (no shell), so paths from the config
# file are passed through verbatim even if they contain spaces or shell
# metacharacters. rm -f keeps a first run, where the top-level files do not
# exist yet, from reporting an error.
run(['rm', '-f', os.path.join(capsdir, 'resourcelist.xml')])
run(['cp', '-p', os.path.join(capsdir, today, 'resourcelist.xml'), capsdir])
# The old top-level changelist is always removed; a new one is copied in only
# when a changelist was built, i.e. when a previous run was found.
run(['rm', '-f', os.path.join(capsdir, 'changelist.xml')])
if previous is not None:
    run(['cp', '-p', os.path.join(capsdir, today, 'changelist.xml'), capsdir])
##### Write out capabilities
# Flatten capl into the comma-separated name=uri form given to --capabilitylist
caplstrs = ['='.join((name, capl[name])) for name in capl.keys()]
capls = ','.join(caplstrs)
capl_filename = os.path.join(capsdir, 'capabilitylist.xml')
cmd = [client, '-v']
cmd += ['--capabilitylist', capls]
cmd += ['--outfile', capl_filename]
cmd += ['--describedby', 'http://resync.library.cornell.edu/']
cmd += ['--sourcedescription-link', 'http://resync.library.cornell.edu/.well-known/resourcesync']
run(cmd)
logging.info("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.