Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Created April 29, 2016 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/353709fd54245c49f6f62bb509f262a9 to your computer and use it in GitHub Desktop.
Save thisismattmiller/353709fd54245c49f6f62bb509f262a9 to your computer and use it in GitHub Desktop.
Shared collection conversion v1
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString
def pretty_print(data):
    """Return *data* re-serialized as indented XML without blank lines.

    ``data`` is an XML document in any form accepted by
    ``xml.dom.minidom.parseString``. The document is pretty-printed with a
    two-space indent, then every line that is empty or whitespace-only is
    dropped before the remaining lines are joined with newlines.
    """
    pretty = parseString(data).toprettyxml(indent=' ' * 2)
    return '\n'.join(line for line in pretty.split('\n') if line.strip())
def remove_namespace(doc, namespace):
    """Remove namespace in the passed document in place.

    Every element reachable from ``doc`` (``doc`` itself included) whose tag
    starts with ``{namespace}`` has that prefix stripped from its tag.
    Elements qualified with a different namespace are left untouched.
    Nothing is returned; ``doc`` is modified directly.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.getiterator() was removed in Python 3.9; iter() walks the
    # element and all of its descendants in the same document order.
    for elem in doc.iter():
        # Comment / processing-instruction nodes carry a non-string tag,
        # so skip anything that cannot have a namespace prefix.
        if isinstance(elem.tag, str) and elem.tag.startswith(ns):
            elem.tag = elem.tag[nsl:]
# Input file to convert.
metadata = 'RecapSharedCirc2a.xml'
tree = ET.parse(metadata)
root = tree.getroot()

# Strip the MARC21 slim namespace from every tag so the fields below can be
# matched and re-emitted under plain tag names.
remove_namespace(root, 'http://www.loc.gov/MARC21/slim')

# Root of the converted output; one <bibRecord> is added per input record.
bibRecords = ET.Element("bibRecords")

marc_ns = 'http://www.loc.gov/MARC21/slim'

for record in root:
    # Sort each child of the input record into one of three buckets.
    bib_fields = ET.Element('record')
    holding_fields = ET.Element('record')
    item_fields = []
    # Placeholder date, used unless a 945 subfield 'k' overrides it.
    update_date = "12-09-2014 12:00"

    for field in record:
        tag = field.get('tag')
        if tag == '852':
            # the 'holdings'
            holding_fields.append(field)
        elif tag in ('876', '900'):
            # the item-level fields
            item_fields.append(field)
        else:
            # everything else (including children without a tag attribute)
            # stays with the bib
            bib_fields.append(field)
            if tag == '945':
                # a 945 subfield 'k' supplies the update date; the last one
                # encountered wins
                for subfield in field:
                    if subfield.get('code') == 'k':
                        update_date = subfield.text

    # All gathered item fields go into a single <record> inside a namespaced
    # <collection>.
    # NOTE(review): the source's indentation was lost; this assumes the item
    # record is added to its collection once, after every item field has
    # been gathered -- confirm against the intended output.
    item_record = ET.Element('record')
    item_record.extend(item_fields)
    item_collection = ET.Element('collection')
    item_collection.set('xmlns', marc_ns)
    item_collection.append(item_record)

    bib_record = ET.SubElement(bibRecords, 'bibRecord')

    # <bib>: owningInstitutionId, lastUpdatedDate, then the bib content
    bib = ET.SubElement(bib_record, "bib")
    ET.SubElement(bib, 'owningInstitutionId').text = 'NY'
    ET.SubElement(bib, 'lastUpdatedDate').text = update_date
    bib_content = ET.SubElement(bib, 'content')
    bib_collection = ET.SubElement(bib_content, 'collection')
    bib_collection.set('xmlns', marc_ns)
    bib_collection.append(bib_fields)

    # <holdings>/<holding>: lastUpdatedDate, a placeholder holdings id, the
    # holding content, then the items
    holdings = ET.SubElement(bib_record, "holdings")
    holding = ET.SubElement(holdings, "holding")
    ET.SubElement(holding, "lastUpdatedDate").text = update_date
    ET.SubElement(holding, "owningInstitutionHoldingsId").text = "?"
    holding_content = ET.SubElement(holding, "content")
    holding_collection = ET.SubElement(holding_content, "collection")
    holding_collection.set('xmlns', marc_ns)
    holding_collection.append(holding_fields)

    items = ET.SubElement(holding, "items")
    ET.SubElement(items, "content").append(item_collection)
# The pretty-printed document starts with an XML declaration that names no
# encoding, which XML readers interpret as UTF-8. Write the file as UTF-8
# explicitly instead of relying on the platform's default text encoding,
# which can fail (or disagree with the declaration) on non-ASCII data.
with open('output.xml', 'w', encoding='utf-8') as f:
    f.write(pretty_print(ET.tostring(bibRecords, 'utf-8')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment