Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Last active December 11, 2015 11:58
Show Gist options
  • Save lawlesst/4597432 to your computer and use it in GitHub Desktop.
Save lawlesst/4597432 to your computer and use it in GitHub Desktop.
Jython script to scan MARC records exported from III for updates.
"""
Scans III exports and finds records updated within a
particular time period.
Command line options:
-f file - path to the MARC file to scan
-d days - number of days to go back and look for updates
-o output - directory to store the MARC records.
-s start - start date for updates, e.g. 02-01-10
-e end - end date for updates, e.g. 02-01-10
"""
import os
import optparse
import sys
from datetime import date, timedelta
import setup_solrmarc
#java jazz
from java.lang import RuntimeException
from java.io import InputStream
from java.io import File, FileInputStream, ByteArrayOutputStream, FileOutputStream
from org.marc4j import MarcStreamReader, MarcPermissiveStreamReader, MarcStreamWriter
from org.solrmarc.tools import MarcUtils
#Default local directory to store written MARC records. Pass in -o to change.
#main() falls back to this value when it is not given an output_dir keyword.
#The path is relative, so it is resolved against the current working directory.
OUTPUT_DIR = 'to_load'
def convert_date(datestr):
    """Convert a III ``MM-DD-YY`` date string into a ``datetime.date``.

    III only reports two digit year numbers, so the century is inferred:
    a two digit year greater than the current two digit year is placed in
    the 1900s, any other two digit year in the 2000s.  A year of 100 or
    more is assumed to be complete already and is used unchanged.

    Returns None instead of raising when the value is empty, does not
    have three dash-separated numeric parts, or is not a real calendar
    date.
    """
    from datetime import date
    if not datestr:
        return None
    dgroups = datestr.split('-')
    try:
        month = int(dgroups[0])
        day = int(dgroups[1])
        year = int(dgroups[2])
    except (ValueError, IndexError):
        #ValueError: a part is blank or not a number.
        #IndexError: fewer than three parts were supplied.
        return None
    if year < 100:
        this_year = date.today().year - 2000
        #If the year integer is greater than the current year then,
        #pad with 1900.
        if year > this_year:
            year = 1900 + year
        else:
            year = 2000 + year
    try:
        return date(year, month, day)
    except (ValueError, OverflowError):
        #Out of range month/day/year, e.g. 02-30-10.
        return None
def main(mfile=None,
         days=1,
         **kwargs):
    """Copy the updated records of a MARC file into a new MARC file.

    Every record of `mfile` is read and the first 907$b value is parsed
    with convert_date() as the record's last update date.  Matching
    records are written to `<output dir>/updates_<input file name>` and a
    summary is printed to stderr.

    mfile - path of the MARC file to scan.
    days - number of days to look back from today; used unless both
        `start` and `end` are given.
    Keyword arguments:
    start, end - inclusive date range in the format convert_date()
        accepts; only used when both are supplied.
    output_dir - directory for the output file, OUTPUT_DIR when None.

    Raises ValueError when `start` or `end` cannot be parsed.  Records
    without a usable 907$b date are skipped and counted rather than
    aborting the scan.
    """
    start = kwargs.get('start')
    end = kwargs.get('end')
    #Are we doing a number of days check? A start/end pair overrides it.
    days_check = not (start and end)
    if days_check:
        cutoff = date.today() - timedelta(days=days)
        print>>sys.stderr, "Scanning records for updates since %s." % cutoff
    else:
        start_date = convert_date(start)
        end_date = convert_date(end)
        #Fail up front; a None bound cannot be compared with a date.
        if start_date is None or end_date is None:
            raise ValueError(
                "Could not parse start/end dates %r and %r." % (start, end))
        print>>sys.stderr, "Scanning records for updates between and including %s and %s." % (start_date, end_date)
    reader = setup_solrmarc.read_file(mfile)
    #get the filename from input path for output
    ofilename = os.path.basename(mfile)
    odir = kwargs.get('output_dir')
    if odir is None:
        odir = OUTPUT_DIR
    out_file = File('%s/updates_%s' % (odir, ofilename))
    #file output stream
    fop = FileOutputStream(out_file)
    writer = MarcStreamWriter(fop, "UTF-8")
    written_records = 0
    skipped_records = 0
    try:
        while reader.hasNext():
            record = reader.next()
            update_values = MarcUtils.getFieldList(record, "907b").toArray()
            last_update = None
            if len(update_values) > 0:
                last_update = convert_date(update_values[0])
            if last_update is None:
                #No 907$b, or one that is not a date: nothing to compare.
                skipped_records += 1
                continue
            if days_check:
                is_update = last_update >= cutoff
            else:
                is_update = start_date <= last_update <= end_date
            if is_update:
                writer.write(record)
                written_records += 1
    finally:
        #Close MARC file handle even when a record fails mid-scan.
        writer.close()
    if skipped_records:
        print>>sys.stderr, '%d records skipped, no usable update date.' % skipped_records
    print>>sys.stderr, '%d updated records written to %s' % (written_records, out_file)
if __name__ == "__main__":
    #Command line entry point: parse the options and hand them to main().
    p = optparse.OptionParser()
    p.add_option('--file', '-f',
                 help="Pass in the file name.")
    #type='int' lets optparse reject a non-numeric value with a usage
    #message instead of a traceback.
    p.add_option('--days',
                 '-d',
                 type='int',
                 default=1,
                 help='Pass in the number of days of updates to '
                      'pull from the catalog.',
                 )
    p.add_option('--output',
                 '-o',
                 default=None,
                 help='Pass in the output directory.'
                 )
    p.add_option('--start',
                 '-s',
                 default=None,
                 help='e.g. 02-01-10. Pass in the start date.'
                 )
    p.add_option('--end',
                 '-e',
                 default=None,
                 help='e.g. 02-01-10. Pass in the end date.'
                 )
    options, arguments = p.parse_args()
    #Without a file there is nothing to scan; exit with the usage text.
    if not options.file:
        p.error("A MARC file is required; pass it in with -f/--file.")
    mfile = os.path.realpath(options.file)
    days = options.days
    main(mfile=mfile,
         output_dir=options.output,
         days=days,
         start=options.start,
         end=options.end)
import sys
import os
#Directory containing this module; the jar paths below are relative to it.
BASE = os.path.dirname(__file__)
#Add jars - you could also just put these on your CLASSPATH.
#'lib/SolrMarc.jar' is an alternative location for the SolrMarc jar.
jars = [
    'lib/VuFindIndexer.jar',
    'dist/SolrMarc.jar',
    'lib/marc4j-2.5.1.beta.jar',
    'lib/log4j-1.2.5.jar',
    'lib/normalizer_solrmarc.jar',
]
#Append every jar to sys.path, in the order listed above.
sys.path.extend([os.path.join(BASE, jar) for jar in jars])
from org.marc4j import MarcStreamReader, MarcPermissiveStreamReader, MarcStreamWriter
from org.marc4j import ErrorHandler
from org.solrmarc.index import SolrIndexer, VuFindIndexer
from org.solrmarc.tools import MarcUtils, CallNumUtils, SolrMarcIndexerException
from com.solrmarc.icu.text import Normalizer
#from org.solrmarc.marc import MarcPermissiveStreamReader
#from org.solrmarc import *
#from org.solrmarc.tools import SolrUtils
#Add java class
from java.lang import RuntimeException
from java.io import InputStream
from java.io import File, FileInputStream, ByteArrayOutputStream, FileOutputStream
def read_file(infile):
    """Open the MARC file at `infile` and return a marc4j reader over it.

    The file is wrapped in a java.io.FileInputStream and handed to
    MarcPermissiveStreamReader.  The stream is not closed here and is not
    returned separately, so it stays open for as long as the reader uses
    it.
    """
    f = FileInputStream(infile)
    #lifted from https://github.com/billdueber/marc_marc4j/blob/master/lib/marc/marc4j/reader.rb
    #NOTE(review): the arguments after the stream look like permissive=True,
    #convertToUTF8=False and a "BESTGUESS" default encoding - confirm
    #against the marc4j version in lib/.
    return MarcPermissiveStreamReader(f, True, False, "BESTGUESS")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment