Created
September 11, 2013 18:40
-
-
Save gravesm/6527922 to your computer and use it in GitHub Desktop.
MarcXML record to Solr add doc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
########################
#
# @TODO: The Location field in the current Solr index has both libRecord and
#   mapRecord - should there be both or just libRecord?
#
########################
from lxml import etree | |
import sys | |
import re | |
# Namespace URI of MARCXML ("MARC21 slim") documents.
MARCNS = "http://www.loc.gov/MARC21/slim"

# Prefix-to-namespace map handed to every XPath evaluation in this module.
NSMAP = {"marc": MARCNS}
########################
# Regular expression to extract the parts of a coordinate string from an 034
# field.
#
#   $1 = Hemisphere: could be any of +, -, N, S, E, W, or None
#   $2 = Degrees
#   $4 = Minutes
#   $6 = Seconds
########################
# Raw string so that \d and \. reach the regex engine untouched instead of
# being treated as (invalid) string escapes.
coordreg = re.compile(
    r"^([NSEW+-])?(\d{3}(\.\d*)?)(\d{2}(\.\d*)?)?(\d{2}(\.\d*)?)?",
    re.IGNORECASE)


class CoordinateError(Exception):
    """Raised when a value cannot be parsed as a coordinate string."""


def convert(coordinate):
    """Convert a variety of coordinate formats into decimal degrees.

    The string must start with an optional hemisphere marker (one of
    N, S, E, W in either case, or + / -) followed by three degree digits.
    Two minute digits and two second digits may follow; whichever component
    comes last may carry a decimal fraction.

    Args:
        coordinate: the coordinate string, e.g. "W0793015" or "-079.5".

    Returns:
        The coordinate as a float; negative for W, S and "-".

    Raises:
        CoordinateError: if ``coordinate`` is not a string or does not start
            with a recognisable coordinate.
    """
    try:
        match = coordreg.match(coordinate)
    except TypeError:
        # Non-string input (e.g. None) is reported the same way as
        # unparseable text so callers only need to catch one exception.
        match = None
    if match is None:
        raise CoordinateError(
            "cannot parse coordinate: {0!r}".format(coordinate))

    hemisphere, degrees, _, minutes, _, seconds, _ = match.groups()
    decimal = (float(degrees)
               + float(minutes or 0) / 60
               + float(seconds or 0) / 3600)
    if hemisphere and hemisphere in "WSws-":
        decimal = -decimal
    return decimal
########################
# Field functions
#
# A field function takes the set of nodes returned by an XPath expression and
# returns an iterable of tuples of the form (SolrFieldName, Value).
########################
def abstract(nodes):
    """Yield a single ("Abstract", text) tuple built from all given nodes.

    The $a subfield texts of each node are joined with spaces, and the
    per-node strings are then joined with spaces into one value.
    """
    subfield_a = "marc:subfield[@code='a']/text()"
    notes = (" ".join(node.xpath(subfield_a, namespaces=NSMAP))
             for node in nodes)
    yield ("Abstract", " ".join(notes))
def bbox(nodes):
    """Build the bounding-box Solr fields from the first given node.

    Subfields d, e, f and g of ``nodes[0]`` are read as the west, east,
    north and south edges respectively and converted to decimal degrees
    with ``convert``.

    Args:
        nodes: non-empty sequence of elements; only the first is used.

    Returns:
        A list of (field, value) tuples for MinX, MaxX, MinY, MaxY, CenterX,
        CenterY, Area, HalfHeight and HalfWidth (values as strings), or an
        empty tuple when any of the four subfields is missing or cannot be
        converted.
    """
    path = "marc:subfield[@code='{0}']/text()"
    raw = []
    for code in "defg":
        values = nodes[0].xpath(path.format(code), namespaces=NSMAP)
        if not values:
            # The selecting XPath only guarantees $d; an incomplete box is
            # skipped instead of raising IndexError.
            return ()
        raw.append(values[0])

    try:
        w, e, n, s = [convert(value) for value in raw]
    except CoordinateError:
        # couldn't get a coordinate out of this
        return ()

    width, height = abs(e - w), abs(n - s)
    # NOTE(review): the centre is MinX/MinY plus half the extent, which is
    # only the true centre when w <= e and s <= n - confirm how boxes that
    # cross the antimeridian should be handled.
    fields = [
        ("MinX", w),
        ("MaxX", e),
        ("MinY", s),
        ("MaxY", n),
        ("CenterX", w + width / 2),
        ("CenterY", s + height / 2),
        ("Area", width * height),
        ("HalfHeight", height / 2),
        ("HalfWidth", width / 2),
    ]
    # A list of pairs (rather than a dict) keeps the output order stable on
    # every Python version.
    return [(name, str(value)) for name, value in fields]
def contentDate(nodes):
    """Build the ContentDate field from characters 7-10 of the first node.

    Args:
        nodes: non-empty sequence of elements; only ``nodes[0].text`` is
            read.

    Returns:
        ``[("ContentDate", "YYYY-01-01T00:00:00Z")]`` when the four
        characters at positions 7-10 are all ASCII digits, otherwise an
        empty tuple.
    """
    control = nodes[0].text or ""
    year = control[7:11]
    # int() would also accept values such as " 198", "19" or "+199" and
    # produce a malformed timestamp, so require exactly four digits.
    if len(year) != 4 or not all(ch in "0123456789" for ch in year):
        # not a date
        return ()
    return [("ContentDate", "{0}-01-01T00:00:00Z".format(year))]
def layerDisplayName(nodes):
    """Yield ("LayerDisplayName", text) from the first node.

    The value is the text of every subfield of ``nodes[0]`` joined with
    single spaces.
    """
    title_parts = nodes[0].xpath("marc:subfield/text()", namespaces=NSMAP)
    yield ("LayerDisplayName", " ".join(title_parts))
def layerId(nodes):
    """Return the LayerId, Location and Name fields for the first node.

    All three values are derived from ``nodes[0].text``: the id prefixed
    with "MIT.", a JSON object pointing at the library item URL, and the
    bare id.
    """
    record_id = nodes[0].text
    location = '{{"libRecord": "http://library.mit.edu/item/{0}"}}'.format(
        record_id)
    return [
        ("LayerId", "MIT.{0}".format(record_id)),
        ("Location", location),
        ("Name", record_id),
    ]
def placeKeywordsMulti(nodes):
    """Yield one ("PlaceKeywordsSort", keyword) tuple per distinct place.

    Keywords are the $z subfield texts of every node with trailing
    ":;,. " characters stripped. Duplicates are dropped and keywords are
    yielded in first-seen order, so the output is the same on every run
    (iterating a plain set gives an arbitrary order). Values that are empty
    after stripping are skipped.
    """
    seen = set()
    for node in nodes:
        places = node.xpath("marc:subfield[@code='z']/text()",
                            namespaces=NSMAP)
        for place in places:
            keyword = place.rstrip(":;,. ")
            if keyword and keyword not in seen:
                seen.add(keyword)
                yield ("PlaceKeywordsSort", keyword)
def placeKeywords(nodes):
    """Return [("PlaceKeywords", text)] with all place keywords in one string.

    The keywords come from ``placeKeywordsMulti`` and are joined with
    single spaces.
    """
    values = [value for _, value in placeKeywordsMulti(nodes)]
    return [("PlaceKeywords", " ".join(values))]
def publisher(nodes):
    """Return [("Publisher", name)] from the first node's first $b subfield.

    Trailing commas are removed from the name. An empty tuple is returned
    when the first node has no $b text.
    """
    names = nodes[0].xpath("marc:subfield[@code='b']/text()",
                           namespaces=NSMAP)
    if not names:
        return ()
    return [("Publisher", names[0].rstrip(","))]
def themeKeywordsMulti(nodes):
    """Yield one ("ThemeKeywordsSort", keyword) tuple per distinct theme.

    Keywords are the $a subfield texts of every node with trailing
    ":;,. " characters stripped. Duplicates are dropped and keywords are
    yielded in first-seen order, so the output is the same on every run
    (iterating a plain set gives an arbitrary order). Values that are empty
    after stripping are skipped.
    """
    seen = set()
    for node in nodes:
        themes = node.xpath("marc:subfield[@code='a']/text()",
                            namespaces=NSMAP)
        for theme in themes:
            keyword = theme.rstrip(":;,. ")
            if keyword and keyword not in seen:
                seen.add(keyword)
                yield ("ThemeKeywordsSort", keyword)
def themeKeywords(nodes):
    """Return [("ThemeKeywords", text)] with all theme keywords in one string.

    The keywords come from ``themeKeywordsMulti`` and are joined with
    single spaces.
    """
    values = [value for _, value in themeKeywordsMulti(nodes)]
    return [("ThemeKeywords", " ".join(values))]
### End field functions ###

########################
# XPath expressions
#
# Map each field function to the XPath expression that selects its nodeset.
########################
# (field function, XPath) pairs. Each expression is evaluated relative to a
# record element and its result is handed to the paired function. Tags are
# compared as quoted strings throughout; the 245 entry previously used an
# unquoted numeric comparison, unlike every other entry.
xpaths = [
    (abstract, "marc:datafield[@tag='500'][marc:subfield[@code='a']]"),
    (bbox, "marc:datafield[@tag='034'][marc:subfield[@code='d']]"),
    (contentDate, "marc:controlfield[@tag='008']"),
    (layerDisplayName, "marc:datafield[@tag='245']"),
    (layerId, "marc:controlfield[@tag='001']"),
    (placeKeywords, "marc:datafield[@tag='650'][marc:subfield[@code='z']]"),
    (placeKeywordsMulti, "marc:datafield[@tag='650'][marc:subfield[@code='z']]"),
    (publisher, "marc:datafield[@tag='260'][marc:subfield[@code='b']]"),
    (themeKeywords, "marc:datafield[@tag='650'][marc:subfield[@code='a']]"),
    (themeKeywordsMulti, "marc:datafield[@tag='650'][marc:subfield[@code='a']]"),
]

# Compile once at import time so parse() does not re-parse the expressions
# for every record.
compiled_xpaths = [(handler, etree.XPath(expression, namespaces=NSMAP))
                   for handler, expression in xpaths]
def parse(record):
    """Parses a MARCXML record into a list of Solr (field, value) tuples.

    Args:
        record: the record element the compiled XPath expressions are
            evaluated against.

    Returns:
        A list of (field, value) tuples: five constant fields first, then
        the output of every field function whose XPath selected at least
        one node, in ``compiled_xpaths`` order.
    """
    # Set values which are the same for all lib records
    doc = [
        ("Access", "Public"),
        ("Availability", "Offline"),
        ("DataType", "LibraryRecord"),
        ("GeoReferenced", "False"),
        ("Institution", "MIT"),
    ]
    for handler, xpath in compiled_xpaths:
        nodes = xpath(record)
        if nodes:
            # Field functions are only called with a non-empty nodeset.
            doc.extend(handler(nodes))
    return doc
def main():
    """Convert the MARCXML file named on the command line to a Solr add doc.

    Reads the file given as the first argument, turns every marc:record
    into a <doc> of <field name="..."> elements using ``parse``, and prints
    the resulting <add> document to standard output. Exits with a usage
    message when no file argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {0} MARCXML_FILE".format(sys.argv[0]))

    # Binary mode lets the XML parser decode the file itself; the context
    # manager closes the file even when parsing fails.
    with open(sys.argv[1], "rb") as fp:
        marc = etree.parse(fp)

    solr = etree.Element("add")
    for record in marc.xpath("//marc:record", namespaces=NSMAP):
        doc = etree.SubElement(solr, "doc")
        for name, value in parse(record):
            field = etree.SubElement(doc, "field", name=name)
            field.text = value

    # tostring() returns ASCII-encoded bytes by default; decoding keeps the
    # printed output free of a b'...' wrapper on Python 3 while remaining
    # valid on Python 2.
    print(etree.tostring(solr).decode("ascii"))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment