Skip to content

Instantly share code, notes, and snippets.

@gravesm
Created September 11, 2013 18:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gravesm/6527922 to your computer and use it in GitHub Desktop.
Save gravesm/6527922 to your computer and use it in GitHub Desktop.
MarcXML record to Solr add doc
#!/usr/bin/env python
########################
#
# @TODO: Location field in current Solr index has both libRecord and mapRecord -
# should there be both or just libRecord?
#
########################
from lxml import etree
import sys
import re
# Namespace URI of the MARC elements queried throughout this file.
MARCNS = "http://www.loc.gov/MARC21/slim"

# Prefix -> namespace mapping handed to every XPath call ("marc:" prefix).
NSMAP = dict(marc=MARCNS)
########################
# Regular expression to extract parts of a coordinate string from the 034 field
#
# Group 1 = Hemisphere: any of +, -, N, S, E, W (case-insensitive), or None
# Group 2 = Degrees: exactly three digits, optionally followed by a fraction
# Group 4 = Minutes: two digits, optionally followed by a fraction, or None
# Group 6 = Seconds: two digits, optionally followed by a fraction, or None
#
# Groups 3, 5 and 7 hold only the fractional part (".ddd") of the group that
# precedes them. The pattern is anchored at the start only, so any trailing
# text after the matched portion is ignored.
########################
# Raw string: "\d" and "\." are not valid string escapes, so a plain literal
# triggers invalid-escape warnings on current Python versions.
coordreg = re.compile(
    r"^([NSEW+-])?(\d{3}(\.\d*)?)(\d{2}(\.\d*)?)?(\d{2}(\.\d*)?)?",
    re.IGNORECASE)
class CoordinateError(Exception):
    """Raised by convert() when a string cannot be parsed as a coordinate."""


def convert(coordinate):
    """Convert a variety of coordinate formats into decimal degrees.

    The string is matched against ``coordreg``: an optional hemisphere/sign
    character, three degree digits, then optional two-digit minutes and
    seconds, each of which may carry a decimal fraction.

    Returns the coordinate as a float; the value is negated when the
    hemisphere character is W, S (either case) or "-".

    Raises CoordinateError when the string does not match ``coordreg``.
    """
    match = coordreg.search(coordinate)
    if match is None:
        # Explicit check instead of catching AttributeError from None.groups(),
        # and include the offending value so bad records can be located.
        raise CoordinateError(
            "cannot parse coordinate: {0!r}".format(coordinate))
    hemisphere, degrees, _, minutes, _, seconds, _ = match.groups()
    # Minutes and seconds are optional groups; treat a missing one as zero.
    decimal = (float(degrees)
               + float(minutes or 0) / 60
               + float(seconds or 0) / 3600)
    if hemisphere and hemisphere in "WSws-":
        decimal = -decimal
    return decimal
########################
# Field functions
#
# A field function takes the list of nodes matched by an XPath expression and
# returns an iterable (list, tuple or generator) of tuples of the form
# (SolrFieldName, Value).
########################
def abstract(nodes):
    """Yield a single ("Abstract", text) tuple built from all given nodes.

    For each node the text of its subfield 'a' children is joined with
    spaces; the per-node strings are then joined with spaces as well.
    """
    query = "marc:subfield[@code='a']/text()"
    per_node = [" ".join(node.xpath(query, namespaces=NSMAP))
                for node in nodes]
    yield ("Abstract", " ".join(per_node))
def bbox(nodes):
    """Build bounding-box fields from the first node's d/e/f/g subfields.

    Subfields d, e, f and g are read as the west, east, north and south
    edges respectively and converted to decimal degrees with convert().

    Returns a list of (field, value) tuples for MinX, MaxX, MinY, MaxY,
    CenterX, CenterY, Area, HalfHeight and HalfWidth, with all values as
    strings. Returns an empty tuple when any of the four subfields is
    missing or cannot be converted.
    """
    path = "marc:subfield[@code='{0}']/text()"
    raw = []
    for code in "defg":
        values = nodes[0].xpath(path.format(code), namespaces=NSMAP)
        if not values:
            # The selecting xpath only guarantees subfield 'd'; without all
            # four edges there is no box, so skip instead of raising
            # IndexError and aborting the whole record.
            return ()
        raw.append(values[0])
    try:
        w, e, n, s = [convert(x) for x in raw]
    except CoordinateError:
        # couldn't get a coordinate out of this
        return ()
    width, height = abs(e - w), abs(n - s)
    # A literal list keeps the field order fixed rather than depending on
    # dict iteration order.
    return [
        ("MinX", str(w)),
        ("MaxX", str(e)),
        ("MinY", str(s)),
        ("MaxY", str(n)),
        ("CenterX", str(w + width / 2)),
        ("CenterY", str(s + height / 2)),
        ("Area", str(width * height)),
        ("HalfHeight", str(height / 2)),
        ("HalfWidth", str(width / 2)),
    ]
def contentDate(nodes):
    """Build the ContentDate field from characters 7-10 of the first node.

    Returns [("ContentDate", "<YYYY>-01-01T00:00:00Z")] when those four
    characters are all ASCII digits, otherwise an empty tuple.
    """
    control = nodes[0].text or ""  # an empty element has text None
    year = control[7:11]
    # int() would also accept values such as "19", " 199" or "+199", which
    # then produce a malformed timestamp; require exactly four digits.
    if len(year) == 4 and all(ch in "0123456789" for ch in year):
        return [("ContentDate", "{0}-01-01T00:00:00Z".format(year))]
    # not a date
    return ()
def layerDisplayName(nodes):
    """Yield one ("LayerDisplayName", text) tuple for the first node.

    The text is every subfield text node of nodes[0], joined with spaces.
    """
    pieces = nodes[0].xpath("marc:subfield/text()", namespaces=NSMAP)
    yield ("LayerDisplayName", " ".join(pieces))
def layerId(nodes):
    """Build the LayerId, Location and Name fields from the first node's text.

    Returns a list of three (field, value) tuples. Location is a JSON object
    string with a single "libRecord" key holding the item URL.
    """
    import json  # local import: only this function needs it

    rid = nodes[0].text
    # Serialise with json.dumps so a quote or backslash in the identifier
    # cannot produce invalid JSON; for ordinary ids the output is identical
    # to the previous hand-formatted string.
    location = json.dumps(
        {"libRecord": "http://library.mit.edu/item/{0}".format(rid)})
    return [
        ("LayerId", "MIT.{0}".format(rid)),
        ("Location", location),
        ("Name", rid),
    ]
def placeKeywordsMulti(nodes):
    """Yield a ("PlaceKeywordsSort", keyword) tuple per distinct keyword.

    Keywords are the subfield 'z' texts of every node with trailing
    ":", ";", ",", "." and space characters stripped. Duplicates are
    dropped and keywords are yielded in first-seen order, so the output is
    reproducible rather than following set iteration order.
    """
    seen = set()
    for node in nodes:
        places = node.xpath("marc:subfield[@code='z']/text()",
                            namespaces=NSMAP)
        for place in places:
            keyword = place.rstrip(":;,. ")
            if keyword not in seen:
                seen.add(keyword)
                yield ("PlaceKeywordsSort", keyword)
def placeKeywords(nodes):
    """Return [("PlaceKeywords", text)] for the given nodes.

    The text is every value produced by placeKeywordsMulti(nodes), joined
    with single spaces.
    """
    joined = " ".join(value for _, value in placeKeywordsMulti(nodes))
    return [("PlaceKeywords", joined)]
def publisher(nodes):
    """Return [("Publisher", name)] from the first node's first subfield 'b'.

    Trailing commas are stripped from the name. Returns an empty tuple when
    the first node has no subfield 'b' text.
    """
    names = nodes[0].xpath("marc:subfield[@code='b']/text()",
                           namespaces=NSMAP)
    if not names:
        return ()
    return [("Publisher", names[0].rstrip(","))]
def themeKeywordsMulti(nodes):
    """Yield a ("ThemeKeywordsSort", keyword) tuple per distinct keyword.

    Keywords are the subfield 'a' texts of every node with trailing
    ":", ";", ",", "." and space characters stripped. Duplicates are
    dropped and keywords are yielded in first-seen order, so the output is
    reproducible rather than following set iteration order.
    """
    seen = set()
    for node in nodes:
        themes = node.xpath("marc:subfield[@code='a']/text()",
                            namespaces=NSMAP)
        for theme in themes:
            keyword = theme.rstrip(":;,. ")
            if keyword not in seen:
                seen.add(keyword)
                yield ("ThemeKeywordsSort", keyword)
def themeKeywords(nodes):
    """Return [("ThemeKeywords", text)] for the given nodes.

    The text is every value produced by themeKeywordsMulti(nodes), joined
    with single spaces.
    """
    joined = " ".join(value for _, value in themeKeywordsMulti(nodes))
    return [("ThemeKeywords", joined)]
### End field functions ###
########################
# Xpath expressions
#
# Map a nodeset to a field function. Each entry pairs a field function with
# the XPath (evaluated relative to a record element) that selects the nodes
# passed to it. A function is only called when its XPath matches something.
########################
xpaths = [
    (abstract, "marc:datafield[@tag='500'][marc:subfield[@code='a']]"),
    (bbox, "marc:datafield[@tag='034'][marc:subfield[@code='d']]"),
    (contentDate, "marc:controlfield[@tag='008']"),
    # Tag quoted like every other entry so it is compared as a string
    # rather than being converted to a number.
    (layerDisplayName, "marc:datafield[@tag='245']"),
    (layerId, "marc:controlfield[@tag='001']"),
    (placeKeywords, "marc:datafield[@tag='650'][marc:subfield[@code='z']]"),
    (placeKeywordsMulti, "marc:datafield[@tag='650'][marc:subfield[@code='z']]"),
    (publisher, "marc:datafield[@tag='260'][marc:subfield[@code='b']]"),
    (themeKeywords, "marc:datafield[@tag='650'][marc:subfield[@code='a']]"),
    (themeKeywordsMulti, "marc:datafield[@tag='650'][marc:subfield[@code='a']]"),
]

# Compile each expression once at import time; parse() reuses them for
# every record.
compiled_xpaths = [
    (handler, etree.XPath(expression, namespaces=NSMAP))
    for handler, expression in xpaths
]
def parse(record):
    """Parses a MARCXML record into a list of Solr (field, value) tuples.

    The list starts with the fields that are identical for every library
    record, followed by the output of each field function in
    ``compiled_xpaths`` whose XPath selects at least one node in ``record``.
    """
    # Values which are the same for all lib records
    doc = [
        ("Access", "Public"),
        ("Availability", "Offline"),
        ("DataType", "LibraryRecord"),
        ("GeoReferenced", "False"),
        ("Institution", "MIT"),
    ]
    for handler, xpath in compiled_xpaths:
        nodes = xpath(record)
        if nodes:
            # extend() consumes lists, tuples and generators alike, without
            # building a throwaway list just for its side effect.
            doc.extend(handler(nodes))
    return doc
def main():
    """Convert the MARCXML file named on the command line to a Solr add doc.

    Reads the file given as the first command-line argument, builds an
    <add> element containing one <doc> per marc:record (with one <field>
    per tuple returned by parse()), and prints the serialised XML to stdout.
    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("usage: {0} MARCXML_FILE".format(sys.argv[0]))
    # Binary mode lets the XML parser handle the document encoding itself;
    # the with-block closes the file even if parsing fails.
    with open(sys.argv[1], "rb") as fp:
        marc = etree.parse(fp)
    solr = etree.Element("add")
    for record in marc.xpath("//marc:record", namespaces=NSMAP):
        doc = etree.SubElement(solr, "doc")
        for name, value in parse(record):
            field = etree.SubElement(doc, "field", name=name)
            field.text = value
    # tostring() returns ASCII-encoded bytes by default (non-ASCII characters
    # become character references); decode so the output is the same text
    # under both the print statement and the print function.
    print(etree.tostring(solr).decode("ascii"))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment