Skip to content

Instantly share code, notes, and snippets.

@jermnelson
Created March 11, 2015 16:09
Show Gist options
  • Save jermnelson/b1d908044f02032d2953 to your computer and use it in GitHub Desktop.
Save jermnelson/b1d908044f02032d2953 to your computer and use it in GitHub Desktop.
Command-line script for converting MARC21 to Dublin Core XML
"""Python script for converting MARC21 to Dublin Core XML
Usage:
Command-line
$ mkdir output_xml
$ cp marc2dc.py output_xml/.
$ cd output_xml
$ python marc2dc.py --marc E:\Research\test-marc-file.mrc
"""
# Module metadata, using the conventional double-underscore dunder names.
__author__ = "Jeremy Nelson"
__license__ = "GPLv3"
import argparse
import datetime
import pymarc
import urllib.request
from lxml import etree
# URL of the loc.gov XSLT stylesheet that run() downloads and compiles in
# order to transform MARCXML records into Dublin Core RDF XML.
MARC_DC_URL = 'http://www.loc.gov/standards/marcxml/xslt/MARC21slim2RDFDC.xsl'
class FileResolver(etree.Resolver):
    """Resolver that answers every lookup by treating the URL as a filename.

    Adapted from Stack Overflow:
    http://stackoverflow.com/questions/8831941/lxml-and-xsl-document-function
    """

    def resolve(self, url, pubid, context):
        """Delegate *url* to ``resolve_filename``; *pubid* is not used."""
        resolved = self.resolve_filename(url, context)
        return resolved
def run(marc21_file):
    """Convert every record in a MARC21 file to a Dublin Core XML file.

    Each record is serialized to MARCXML, transformed with the XSLT
    stylesheet downloaded from ``MARC_DC_URL``, and the result is written
    to disk using a relative filename: ``dc-<001 value>.xml`` when the
    record has an 001 field, otherwise ``dc-marc-<record index>.xml``.

    Args:
        marc21_file: Path to the binary MARC21 file to read.
    """
    # Context managers guarantee that the MARC file and the HTTP response
    # are closed even if parsing or transforming a record raises.
    with open(marc21_file, 'rb') as marc_fo:
        reader = pymarc.MARCReader(marc_fo, to_unicode=True)
        parser = etree.XMLParser()
        parser.resolvers.add(FileResolver())
        with urllib.request.urlopen(MARC_DC_URL) as xslt_response:
            marc2dc_xslt = etree.parse(xslt_response, parser=parser)
        transform = etree.XSLT(marc2dc_xslt)
        for i, record in enumerate(reader):
            # Convert to MARC XML
            record_xml = etree.XML(
                pymarc.record_to_xml(record, namespace=True),
                parser=parser)
            # Transform to Dublin Core RDF XML
            dc_xml = transform(record_xml)
            # Name the output after the 001 field when the record has one;
            # otherwise fall back to the record's position in the file.
            if '001' in record:
                dc_filename = "dc-{}.xml".format(record['001'].data)
            else:
                dc_filename = "dc-marc-{}.xml".format(i)
            # The file is only written, never read back, so 'w' suffices.
            with open(dc_filename, 'w') as dc_file:
                dc_file.write(etree.tostring(dc_xml).decode())
if __name__ == '__main__':
    # Script entry point: parse the MARC path, run the conversion, and
    # report start/finish timestamps plus the elapsed time.
    arg_parser = argparse.ArgumentParser()
    # The path is mandatory: without it run() would be handed None and fail
    # inside open() with an unhelpful TypeError.
    arg_parser.add_argument(
        '--marc',
        required=True,
        help='Full Path to MARC21 file')
    args = arg_parser.parse_args()
    # Timezone-aware UTC timestamps (datetime.utcnow() is deprecated).
    start = datetime.datetime.now(datetime.timezone.utc)
    print("Starting MARC21 to Dublin Core at {}".format(start.isoformat()))
    run(args.marc)
    end = datetime.datetime.now(datetime.timezone.utc)
    # total_seconds() covers the whole interval; the .seconds attribute
    # drops whole days and sub-second precision.
    print("Finished MARC21 to Dublin Core at {}, total time={} seconds".format(
        end.isoformat(),
        (end - start).total_seconds()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment