Skip to content

Instantly share code, notes, and snippets.

@thisismattmiller
Last active May 24, 2016 22:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thisismattmiller/e25f718047e95a2908b0b890d237d12c to your computer and use it in GitHub Desktop.
Save thisismattmiller/e25f718047e95a2908b0b890d237d12c to your computer and use it in GitHub Desktop.
Convert MARC XML from Sierra export table into Shared Collection format for the sample 10K data load

To run the script:

  • You need python3.4+
  • Download the recap_sample_convert.py
  • run the script and pass the filename: python3.4 recap_sample_convert.py example.xml
  • example.xml should be the data file exported from MARC Edit

It will create the converted file in the same directory as the input, named after the input file with "_converted.xml" appended (for example, example.xml becomes example.xml_converted.xml).

import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString
import sys, time
# Customer code mappings from the google doc.
# Key: the item agency code that convert() reads from 876$s.
# Value: [collection group written to 900$a, customer code written to 900$b].
customerCodes = {
    "43":  ["Private", "XX"],
    "209": ["Private", "XX"],
    "210": ["Private", "JS"],
    "211": ["Shared", "GN"],
    "212": ["Shared", "JN"],
    "213": ["Private", "JO"],
    "214": ["Shared", "NA"],
    "215": ["Shared", "NB"],
    "216": ["Private", "ND"],
    "217": ["Private", "NH"],
    "218": ["Private", "NL"],
    "219": ["Private", "NN"],
    "220": ["Private", "NO"],
    "221": ["Private", "NP"],
    "222": ["Private", "NQ"],
    "223": ["Private", "NR"],
    "224": ["Private", "NS"],
    "225": ["Private", "NU"],
    "226": ["Private", "NV"],
    "227": ["Shared", "NW"],
    "228": ["Private", "NX"],
    "230": ["Private", "NZ"],
}
def pretty_print(data):
    """Return the XML document *data* re-indented for readability.

    *data* is parsed with minidom, serialised with a two-space indent, and
    every whitespace-only line in the result is dropped.
    """
    indented = parseString(data).toprettyxml(indent='  ')
    return '\n'.join(row for row in indented.split('\n') if row.strip())
def remove_namespace(doc, namespace):
    """Remove namespace in the passed document in place.

    Every element under (and including) *doc* whose tag starts with the
    ElementTree-qualified form ``{namespace}`` has that prefix stripped, so
    ``{http://www.loc.gov/MARC21/slim}record`` becomes ``record``.

    Args:
        doc: the root ``Element`` to rewrite; it is modified in place.
        namespace: the namespace URI to strip (without the braces).

    Returns:
        None.
    """
    prefix = '{%s}' % namespace
    prefix_len = len(prefix)
    # Element.getiterator() was removed in Python 3.9; iter() is the
    # supported way to walk the element and all of its descendants.
    for elem in doc.iter():
        # Comment / processing-instruction nodes carry a non-string tag.
        if isinstance(elem.tag, str) and elem.tag.startswith(prefix):
            elem.tag = elem.tag[prefix_len:]
_MARC_SLIM_NS = 'http://www.loc.gov/MARC21/slim'


def _has_tag(element, tag):
    """Return True when *element* has a ``tag`` attribute equal to *tag*."""
    return element.attrib.get('tag') == tag


def _coded_subfields(field):
    """Yield ``(code, subfield)`` for each child of *field* that has a ``code`` attribute."""
    for sub in field:
        if 'code' in sub.attrib:
            yield sub.attrib['code'], sub


def _new_datafield(tag, parent=None):
    """Create a ``datafield`` with both indicators "0", attached to *parent* if given."""
    if parent is None:
        field = ET.Element('datafield')
    else:
        field = ET.SubElement(parent, 'datafield')
    field.attrib['tag'] = tag
    field.attrib['ind1'] = '0'
    field.attrib['ind2'] = '0'
    return field


def _add_subfield(field, code, text):
    """Append a ``subfield`` with the given *code* and *text* to *field*."""
    sub = ET.SubElement(field, 'subfield')
    sub.attrib['code'] = code
    sub.text = text
    return sub


def _new_collection(parent):
    """Append ``content/collection`` (declared in the MARC slim namespace) to *parent*."""
    content = ET.SubElement(parent, 'content')
    collection = ET.SubElement(content, 'collection')
    collection.set('xmlns', _MARC_SLIM_NS)
    return collection


def _bib_call_number(record):
    """Return the 952$h text of *record*, or False when no 952$h exists."""
    call_number = False
    for field in record:
        if _has_tag(field, '952'):
            for code, sub in _coded_subfields(field):
                if code == 'h':
                    call_number = sub.text
                    break
    return call_number


def _collect_852(record):
    """Gather per-item data from the 852 fields of *record*.

    Returns a dict keyed by the 852$a item number.  Each value is a dict
    with the 852 ``'b'`` (location), ``'h'`` (call number) and ``'3'``
    texts; ``False`` marks a subfield that was not present.  A missing
    call number falls back to the bib-level 952$h.  An 852 without a
    usable $a is reported and skipped.
    """
    items = {}
    for field in record:
        if not _has_tag(field, '852'):
            continue
        item_number = False
        for code, sub in _coded_subfields(field):
            if code == 'a' and sub.text:
                item_number = sub.text
                items[item_number] = {'b': False, 'h': False, '3': False}
        if item_number is False:
            # Without an item number the field cannot be tied to an 876.
            print("ERROR! 852 field without a $a item number, skipping it")
            continue
        info = items[item_number]
        for code, sub in _coded_subfields(field):
            if code in info:
                info[code] = sub.text
        if info['h'] is False:
            # try to use the bib call number
            info['h'] = _bib_call_number(record)
        if info['h'] is False:
            print("ERROR! No callnumber found for this one:", info)
        if item_number.find('.i') == -1:
            print("No iNumber", item_number, item_number.find('.i'))
            print(items)
    return items


def _collect_876(record):
    """Gather the 876 ``$o``, ``$s`` and ``$y`` values of *record*, keyed by 876$a.

    Present values are stored as ``str``; ``False`` marks a subfield that
    was not present.  An 876 without a usable $a is ignored here (it is
    reported when the item itself is built).
    """
    items = {}
    for field in record:
        if not _has_tag(field, '876'):
            continue
        item_number = False
        for code, sub in _coded_subfields(field):
            if code == 'a' and sub.text:
                item_number = sub.text
                items[item_number] = {'o': False, 's': False, 'y': False}
        if item_number is False:
            continue
        info = items[item_number]
        for code, sub in _coded_subfields(field):
            if code in info:
                info[code] = str(sub.text)
    return items


def _use_restriction(holding_info, item_info):
    """Apply the use-restriction rules to one item.

    Returns ``(876$h, 900$a, 900$b)`` for the last rule that matches, or
    ``None`` when no rule matches.  The rules are checked in sequence and
    a later match overrides an earlier one.
    """
    # A missing/empty 852$b can never match a location prefix.
    location = holding_info['b'] if isinstance(holding_info['b'], str) else ''
    call_number = str(holding_info['h']).lower()
    opac_message = item_info['o']
    agency = item_info['s']
    item_type = item_info['y']

    result = None
    # 852$b Location starts with "rc2" (rc2=not restricted)
    if location.startswith('rc2'):
        # 876$o "2" (Advanced Request offsite), 876$s "214" (NA=Shared),
        # 876$y "55" (MaRLI)
        if opac_message == '2' and agency == '214' and item_type == '55':
            result = ('Circulates', 'Shared', 'NA')
        # 852$h has no "*q", 876$o "u", 876$s "214"
        if '*q' not in call_number and opac_message == 'u' and agency == '214':
            result = ('Supervised use', 'Shared', 'NA')
        # 876$o "2", 876$s "214", 876$y "2" or "3"
        if opac_message == '2' and agency == '214' and item_type in ('2', '3'):
            result = ('Use on site', 'Shared', 'NA')
    # 852$b ITEM Location starts with "rc"
    if location.startswith('rc'):
        # 852$h has no "*o", 876$s "220" (NO=Private), 876$y "2"
        if '*o' not in call_number and agency == '220' and item_type == '2':
            result = ('Supervised Use', 'Private', 'NO')
    return result


def _append_holding_852(holding_record, call_number, locations):
    """Append the synthetic holdings 852 ($b first known location, $h call number)."""
    field = _new_datafield('852', holding_record)
    if locations:
        _add_subfield(field, 'b', locations[0])
    _add_subfield(field, 'h', call_number)
    return field


def _append_items(holding, pairs):
    """Append ``items/content/collection`` to *holding*, one ``record`` per (876, 900) pair."""
    items = ET.SubElement(holding, 'items')
    collection = _new_collection(items)
    for field876, field900 in pairs:
        item = ET.SubElement(collection, 'record')
        item.append(field876)
        item.append(field900)


def convert(filename):
    """Convert a MARC XML export into the bibRecords/bib/holdings/items layout.

    For every record under the root of *filename*:

    * 852 and 876 fields are paired by their ``$a`` item number and turned
      into one item record each (a new 876 plus a 900 carrying the
      collection group and customer code);
    * an 866 supplies the holdings id (``$y``) and holdings statement
      (``$a``); when there is no holdings id, one holding is generated per
      distinct call number;
    * every other element that has a ``tag`` attribute is copied into the
      bib record.

    Progress problems (missing subfields, unmapped customer codes, items
    that cannot be paired) are printed to stdout.  The result is written,
    UTF-8 encoded and pretty printed, to ``filename + '_converted.xml'``.

    Args:
        filename: path of the MARC XML file to convert.

    Returns:
        None.
    """
    root = ET.parse(filename).getroot()
    # remove the namespace prefix on all the elements
    remove_namespace(root, _MARC_SLIM_NS)

    bibRecords = ET.Element("bibRecords")
    unmapped_codes = []        # [876$s, item number] for the first sighting of each unknown code
    unmapped_seen = set()

    # loop through each marc:record
    for record in root:
        new_bib_record = ET.Element('record')
        new_holding_866 = None   # 866 built from the source record, if any
        holdings_id = False
        item_groups = {}         # str(call number) -> list of [new 876, new 900]

        items_852 = _collect_852(record)
        items_876 = _collect_876(record)

        for field in record:
            # the 'holdings'
            if _has_tag(field, '866'):
                for code, sub in _coded_subfields(field):
                    if code == 'y':
                        holdings_id = sub.text
                    if code == 'a':
                        new_holding_866 = _new_datafield('866')
                        new_holding_866.append(sub)

            # the items collection
            if _has_tag(field, '876'):
                subdata = {code: str(sub.text) for code, sub in _coded_subfields(field)}
                item_number = subdata.get('a')
                if item_number not in items_852 or item_number not in items_876:
                    # The item needs both its 852 and 876 data; without a
                    # shared $a there is nothing to build it from.
                    print("ERROR! Skipping 876 item without a matching 852$a/876$a:", item_number)
                    continue
                holding_info = items_852[item_number]
                item_info = items_876[item_number]

                # collection group / customer code from the item agency
                a900 = False
                b900 = False
                agency = item_info['s']
                if agency in customerCodes:
                    a900 = customerCodes[agency][0]
                    b900 = customerCodes[agency][1]
                elif agency not in unmapped_seen:
                    unmapped_codes.append([agency, item_number])
                    unmapped_seen.add(agency)

                restriction = _use_restriction(holding_info, item_info)
                if restriction is not None:
                    h876, a900, b900 = restriction
                else:
                    # Did not fit the use-restriction rules: the group is
                    # forced to Private while any mapped $b is kept.
                    h876 = 'In Library Use'
                    a900 = 'Private'

                # make this new 876
                new876 = _new_datafield('876')
                _add_subfield(new876, 'h', h876)
                _add_subfield(new876, 'a', subdata['a'])
                for code in ('j', 'p', 't'):
                    if code in subdata:
                        _add_subfield(new876, code, subdata[code])
                    else:
                        print(item_number, "No 876$" + code)
                # the item-level $3 comes from the matching 852
                if holding_info['3']:
                    _add_subfield(new876, '3', holding_info['3'])
                else:
                    print(item_number, "No 876$3")

                # new 900
                new900 = _new_datafield('900')
                _add_subfield(new900, 'a', a900)
                if b900:
                    _add_subfield(new900, 'b', b900)

                group_key = str(holding_info['h'])
                item_groups.setdefault(group_key, []).append([new876, new900])

            # all the rest of the bib fields (not the 852 item / 866 holdings)
            # NOTE(review): elements without a tag attribute (e.g. a leader)
            # are not copied, as before - confirm that is intended.
            elif field.attrib.get('tag') not in (None, '852', '866'):
                new_bib_record.append(field)

        # group the 852 $3 texts and $b locations by call number
        holdings_text = {}
        holdings_locations = {}
        for info in items_852.values():
            key = str(info['h'])
            texts = holdings_text.setdefault(key, [])
            if info['3']:
                texts.append(info['3'])
            locations = holdings_locations.setdefault(key, [])
            if info['b']:
                locations.append(info['b'])

        # now we have all the components for each bibRecord, make that.
        bib_record = ET.SubElement(bibRecords, 'bibRecord')
        bib = ET.SubElement(bib_record, "bib")
        owning_institution = ET.SubElement(bib, 'owningInstitutionId')
        owning_institution.text = 'NYPL'
        _new_collection(bib).append(new_bib_record)

        holdings = ET.SubElement(bib_record, "holdings")
        if holdings_id:
            # one real holding that owns every item of the record
            holding = ET.SubElement(holdings, "holding")
            holding_id_el = ET.SubElement(holding, "owningInstitutionHoldingsId")
            holding_id_el.text = holdings_id
            holding_record = ET.SubElement(_new_collection(holding), "record")

            if item_groups:
                # the 852 is taken from the first call-number group
                first_key = next(iter(item_groups))
                _append_holding_852(holding_record, first_key, holdings_locations[first_key])

            # the 866
            if new_holding_866 is not None:
                holding_record.append(new_holding_866)
            else:
                # there was no aggregated holdings text in the 866 so make our own
                all_text = [text for key in item_groups for text in holdings_text[key]]
                fake866 = _new_datafield('866', holding_record)
                _add_subfield(fake866, 'a', ", ".join(all_text))

            _append_items(holding, [pair for key in item_groups for pair in item_groups[key]])
        else:
            # there is no holdings ID:
            # add in a holdings for each different set of items with call numbers
            for key, pairs in item_groups.items():
                holding = ET.SubElement(holdings, "holding")
                ET.SubElement(holding, "owningInstitutionHoldingsId")
                fake_record = ET.SubElement(_new_collection(holding), "record")
                _append_holding_852(fake_record, key, holdings_locations[key])
                fake866 = _new_datafield('866', fake_record)
                _add_subfield(fake866, 'a', ", ".join(holdings_text[key]))
                _append_items(holding, pairs)

    if unmapped_codes:
        print("These customer codes were not found in the mapping")
        print(unmapped_codes)

    # The serialised document is UTF-8, so the file must be written as
    # UTF-8 regardless of the platform's default encoding.
    with open(filename + '_converted.xml', 'w', encoding='utf-8') as f:
        f.write(pretty_print(ET.tostring(bibRecords, 'utf-8')))
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the file to convert.
    if len(sys.argv) == 2:
        convert(sys.argv[1])
    else:
        print('Pass the name of the file to convert, example: "python3.4 recap_sample_convert.py marcEditOutput.xml"')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment