Created
April 29, 2016 13:57
-
-
Save thisismattmiller/353709fd54245c49f6f62bb509f262a9 to your computer and use it in GitHub Desktop.
Shared collection conversion v1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
from xml.dom.minidom import parseString | |
def pretty_print(data):
    """Return *data* re-serialized as indented XML without blank lines.

    ``data`` is an XML document that ``xml.dom.minidom.parseString`` can
    parse. It is pretty-printed with a two-space indent, then every line
    that is empty or contains only whitespace is dropped, and the remaining
    lines are joined with a single newline (no trailing newline).
    """
    pretty = parseString(data).toprettyxml(indent=' ' * 2)
    # toprettyxml() emits whitespace-only lines for whitespace text nodes in
    # the source document; filter those out so the output stays compact.
    return '\n'.join(line for line in pretty.split('\n') if line.strip())
def remove_namespace(doc, namespace):
    """Remove namespace in the passed document in place.

    Walks ``doc`` and all of its descendants and, for every element whose
    tag starts with ``{namespace}``, rewrites the tag to the bare local
    name. Elements in other namespaces are left untouched. Returns ``None``;
    the tree is modified in place.
    """
    ns = u'{%s}' % namespace
    nsl = len(ns)
    # Element.getiterator() was removed in Python 3.9; iter() is its
    # direct replacement and walks the same nodes in the same order.
    for elem in doc.iter():
        tag = elem.tag
        # Comment / processing-instruction nodes carry a callable as their
        # tag instead of a string, so they have no prefix to strip.
        if callable(tag):
            continue
        if tag.startswith(ns):
            elem.tag = tag[nsl:]
# MARC21 "slim" XML namespace: stripped from the parsed input tags and then
# re-declared (as a plain xmlns attribute) on every <collection> written out.
MARC_NAMESPACE = 'http://www.loc.gov/MARC21/slim'

metadata = 'RecapSharedCirc2a.xml'
tree = ET.parse(metadata)
root = tree.getroot()

# remove the namespace prefix on all the elements
remove_namespace(root, MARC_NAMESPACE)

bibRecords = ET.Element("bibRecords")

# loop through each marc:record
for record in root:
    # this will be the new record/holding/items components
    newBibRecord = ET.Element('record')
    newHolding = ET.Element('record')

    # the new items could be repeatable (?) so make the collection here and
    # add them all in at the end of the elements
    newItems = ET.Element('collection')
    newItems.set('xmlns', MARC_NAMESPACE)
    newItemsList = []

    # update date used when the record carries no usable 945 $k value
    updateDate = "12-09-2014 12:00"

    # gather the data we need: route every field of the record into the
    # holding, the item list, or the bib record according to its tag
    for aTag in record:
        tag = aTag.get('tag')  # None when the element has no tag attribute

        if tag == '852':
            # the 'holdings'
            newHolding.append(aTag)
        elif tag in ('876', '900'):
            # the items collection
            newItemsList.append(aTag)
        else:
            # all the rest of the bib fields
            newBibRecord.append(aTag)

        # grab the date if we can to add it in the update date (?)
        if tag == '945':
            for aSubfield in aTag:
                subfieldText = aSubfield.text
                # An empty $k would otherwise replace the default date with
                # None and produce an empty <lastUpdatedDate/>.
                if (aSubfield.get('code') == 'k'
                        and subfieldText and subfieldText.strip()):
                    updateDate = subfieldText

    # add in all the items we gathered, wrapped in a single <record>
    newitem = ET.Element('record')
    newitem.extend(newItemsList)
    newItems.append(newitem)

    # now we have all the components for each bibRecord, make that.

    # make a new bib
    bibRecord = ET.SubElement(bibRecords, 'bibRecord')
    bib = ET.SubElement(bibRecord, "bib")

    # add in the owningInstitutionId and lastUpdatedDate
    owningInstitutionId = ET.SubElement(bib, 'owningInstitutionId')
    owningInstitutionId.text = 'NY'
    lastUpdatedDate = ET.SubElement(bib, 'lastUpdatedDate')
    lastUpdatedDate.text = updateDate

    bibContent = ET.SubElement(bib, 'content')
    bibCollection = ET.SubElement(bibContent, 'collection')
    bibCollection.set('xmlns', MARC_NAMESPACE)
    bibCollection.append(newBibRecord)

    # the holding: update date, holdings id, the 852 content, then the items
    holdings = ET.SubElement(bibRecord, "holdings")
    holding = ET.SubElement(holdings, "holding")
    holdingLastUpdatedDate = ET.SubElement(holding, "lastUpdatedDate")
    holdingLastUpdatedDate.text = updateDate
    holdingOwningInstitutionHoldingsId = ET.SubElement(holding, "owningInstitutionHoldingsId")
    # NOTE(review): the literal "?" looks like a placeholder - confirm what
    # identifier belongs here.
    holdingOwningInstitutionHoldingsId.text = "?"

    holdingContent = ET.SubElement(holding, "content")
    holdingCollection = ET.SubElement(holdingContent, "collection")
    holdingCollection.set('xmlns', MARC_NAMESPACE)
    holdingCollection.append(newHolding)

    items = ET.SubElement(holding, "items")
    itemContent = ET.SubElement(items, "content")
    itemContent.append(newItems)

# Write as UTF-8 explicitly so non-ASCII characters in the MARC data do not
# depend on the platform's default text encoding.
with open('output.xml', 'w', encoding='utf-8') as f:
    f.write(pretty_print(ET.tostring(bibRecords, 'utf-8')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment