Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjlassila/73bec8543347efd0b977e50c33094b56 to your computer and use it in GitHub Desktop.
Save mjlassila/73bec8543347efd0b977e50c33094b56 to your computer and use it in GitHub Desktop.
marc map.txt
# vim:set ft=perl:ts=4:sw=4
#
# Ref.: http://librecat.org/Catmandu/
# https://github.com/LibreCat/Catmandu/wiki/Example%20Fix%20Script
# https://github.com/scriptotek/simplemarcparser/blob/master/src/BibliographicRecord.php
# For ElasticSearch 2.0
# See https://github.com/LibreCat/Catmandu-Store-Elasticsearch/commit/63795416d2585eab7af1d5263f5823b4cae94251
# <s>Note that we use _identifier over _id to cover deleted records which do not have _id</s>
# UPDATE: When importing from a MARC dump, we don't have OAI IDs, so use the simple _ids instead.
# Record filtering: when the record carries an _identifier, it must start
# with the expected OAI prefix. For some reason we also get about 18000
# records with a wrong identifier as duplicates; those are rejected here.
# Records without an _identifier skip this check entirely.
if exists('_identifier')
    # Previously: unless all_match(_identifier,'^oai:ubo.bibsys.no')
    unless all_match('_identifier', '^oai:urm_publish')
        reject()
    end
end

# Fix _id for deleted records: strip the OAI prefix, then keep the value
# under oai_id instead of _id.
# Previously: replace_all(_id,"oai:ubo.bibsys.no:","")
replace_all('_id', "oai:urm_publish:", "")
move_field('_id', 'oai_id')
#if exists(_id)
#remove_field(_id)

# Note: We need to store both the OAI record ID ('oai:ubo.bibsys.no:990000380754702204')
# and the bibliographic record ID ('990000310544702204'), since for deleted records
# we will only get the OAI record ID, and it is quite useful to be able to match
# that with the original record ID.

# Set the key 'timestamp' to the current time (unix timestamp).
timestamp('timestamp')
# IDs
# mms_id holds field 001; any_id collects 001, 020$a, 020$e and 035 values.
set_array('any_id')
marc_map('001', 'mms_id')
marc_map('001', 'any_id.$append')
marc_map('020a', 'any_id.$append')
marc_map('020e', 'any_id.$append') # Alma MARC
marc_map('035', 'any_id.$append')
# Remove the parenthesis prefix, keeping only what follows the ')'.
replace_all('any_id.*', '^\(.*\)(.*)$', '$1')

# LCCN
marc_map('010a', 'lccn')

# ISBN/ISSN
# Values are appended joined with '==', then the whole list is joined and
# split again on '==' so that each entry ends up as its own array element.
marc_map('020a', 'isbn.$append', join:'==')
marc_map('020e', 'isbn.$append', join:'==') # Alma MARC
marc_map('022a', 'issn.$append', join:'==')

join_field('isbn', '==')
split_field('isbn', '==')

join_field('issn', '==')
split_field('issn', '==')

# Keep only the leading run of digits, 'x'/'X' and hyphens of each value.
replace_all('isbn.*', '^([0-9xX-]+).*$', '$1')
replace_all('issn.*', '^([0-9xX-]+).*', '$1')
# Material
# Flag the record as a book when leader position 6 matches 'a' and
# leader position 7 matches one of a, c, d, m.
if marc_match('LDR/6', 'a')
    if marc_match('LDR/7', '[acdm]')
        set_field('material', 'book')
    end
end

# Material designation: first two characters of field 007, translated
# through the field_007.csv lookup table ('Unknown' when there is no entry).
marc_map('007/0-1', 'mat_desig')
lookup('mat_desig', 'field_007.csv', default:'Unknown')

# Language
# Taken from 008 positions 35-37; a value made up of non-word characters
# is replaced by 'und' (undefined).
marc_map('008_/35-37', 'lang')
if all_match('lang', '\W+')
    set_field('lang', 'und') # undefined
end

# Title
marc_spec('245$a$b', 'title', join:' ')
# Drop square brackets around bracketed text, then trailing punctuation.
replace_all('title', '\[(.*)\]', '$1')
replace_all('title', '((\s+\W\s*)+|\.)$', '')

# Sort key: title without non-word characters, cut to 50 characters, lowercased.
copy_field('title', 'title_sort')
replace_all('title_sort', '\W+', '')
substring('title_sort', 0, 50)
downcase('title_sort')

# NOTE(review): source and target are the same path here; this looks like a
# no-op left over from an earlier field name - confirm before removing.
copy_field('title', 'title')

marc_map('246', 'title_remainder', join:' ')

# Short title: 245$a only, with the same trailing-punctuation cleanup.
marc_map('245a', 'title_short')
replace_all('title_short', '((\s+\W\s*)+|\.)$', '')
#- Authors / contributors
marc_map('100ab', 'author.$append', join:' ')
marc_map('700ab', 'author.$append', join:' ')
# 720 is skipped when 'type' matches phd, master or bachelor.
# NOTE(review): 'type' is not set anywhere in the visible part of this
# script - confirm where it is expected to come from.
unless all_match('type', 'phd|master|bachelor')
    marc_map('720ab', 'author.$append', join:' ')
end
# author_names()
# NOTE(review): source and target are the same path; looks like a no-op.
copy_field('author', 'author')

#- Imprint
marc_map('260a', 'place_of_publication')
marc_map('260b', 'publisher')

# Year from 008 positions 7-10. A value consisting of four of the
# characters u, ^, ? or - is dropped; remaining non-digits become '0'.
marc_map('008_/7-10', '008_year')
if all_match('008_year', '[u^?-]{4}')
    remove_field('008_year')
end
replace_all('008_year', '\D', '0')
# Need to check that the value is not empty, because greater_than will
# crash if given an empty string.
if all_match('008_year', '^.+$')
    if greater_than('008_year', 2025)
        remove_field('008_year')
    end
end

# Year from 260$c with everything but digits removed (c., [], etc.).
marc_map('260c', 'pub_year')
replace_all('pub_year', '\D', '')
# Same emptiness guard as above. An implausible pub_year (> 2025) is
# replaced by 008_year when that one is available.
if all_match('pub_year', '^.+$')
    if greater_than('pub_year', 2025)
        if exists('008_year')
            copy_field('008_year', 'pub_year')
        end
    end
end
#if greater_than('2025','year')
# remove_field('year')
#end

# When 008 position 6 matches 'b', prefix 008_year with a minus sign.
# NOTE(review): presumably this marks B.C. dates - confirm.
if marc_match('008_/6-6', 'b')
    prepend('008_year', '-')
end
#- Edition
marc_map('250a', 'edition')

#- Description
marc_map('300a', 'desc_extend')

#- Summary
marc_map('505a', 'summary.$append', join:"\n")
marc_map('520a', 'summary.$append', join:"\n")

# Initialise the arrays that the per-field loop below appends to.

# Objects for presentations
set_array('subjects')

# Form/genre
set_array('forms')
# NOTE(review): these create nested arrays under 'form', while the subject
# handling further down appends to 'form.$append', 'form(noubomn).$append'
# and 'form(humord).$append' - verify that the paths are meant to differ.
set_array('form.noubomn')
set_array('form.humord')

# The terms -- for search
set_array('sub')
set_array('noubomn')
set_array('noubomr')
set_array('noubojur')
set_array('humord')
set_array('tekord')
set_array('mesh')
set_array('lcsh')

# Classification numbers
set_array('lklass')
set_array('ddc_raw')
set_array('ddc')
do marc_each()
# IDs
# A 035 field matching 'EXLNZ-47BIBSYS_NETWORK' supplies nz_mms_id.
if marc_match('035', 'EXLNZ-47BIBSYS_NETWORK')
    marc_map('035', 'nz_mms_id')
    # Remove the parenthesis prefix, keeping only what follows the ')'.
    replace_all('nz_mms_id', '^\(.*\)(.*)$', '$1')
end
# Classification
# Each branch builds a temporary 'tmp' object for the current field,
# appends a copy of it to 'classifications' and then discards 'tmp'.

# 060 -> system 'nlm'
if marc_has('060')
    marc_map('060a', 'tmp.number')
    set_field('tmp.system', 'nlm')
    copy_field('tmp', 'classifications.$append')
    remove_field('tmp')
end

# 080 -> system 'udc'
if marc_has('080')
    marc_map('080a', 'tmp.number')
    marc_map('0802', 'tmp.edition')
    set_field('tmp.system', 'udc')
    copy_field('tmp', 'classifications.$append')
    remove_field('tmp')
end

# 082 -> system 'ddc'; the number is also kept raw (ddc_raw) and reduced
# to digits and dots (ddc).
if marc_has('082')
    marc_map('082a', 'tmp.number')
    marc_map('082a', 'tmp.number_numeric')
    replace_all('tmp.number_numeric', '[^0-9.]', '')
    marc_map('082q', 'tmp.assigner')
    marc_map('0822', 'tmp.edition')
    set_field('tmp.system', 'ddc')
    copy_field('tmp', 'classifications.$append')
    copy_field('tmp.number', 'ddc_raw.$append')
    copy_field('tmp.number_numeric', 'ddc.$append')
    remove_field('tmp')
end

# 083 -> system 'ddc', handled the same way as 082.
if marc_has('083')
    # TODO: join $a $c with hyphen, and $z... plus repeating numbers
    marc_map('083a', 'tmp.number')
    marc_map('083a', 'tmp.number_numeric')
    replace_all('tmp.number_numeric', '[^0-9.]', '')
    marc_map('083q', 'tmp.assigner')
    marc_map('0832', 'tmp.edition')
    set_field('tmp.system', 'ddc')
    copy_field('tmp', 'classifications.$append')
    copy_field('tmp.number', 'ddc_raw.$append')
    copy_field('tmp.number_numeric', 'ddc.$append')
    remove_field('tmp')
end
# 084 -> other classification; the scheme name comes from subfield $2.
if marc_has('084')
    marc_map('084a', 'tmp.number')
    marc_map('084q', 'tmp.assigner')
    marc_map('0842', 'tmp.system')
    copy_field('tmp', 'classifications.$append')
    # Numbers from the 'utklklass' scheme are additionally collected in lklass.
    # The scheme must be tested on 084$2: inside the enclosing marc_each only
    # the current field is visible, so a test against 6**$2 could never match
    # while an 084 field is being processed.
    if marc_match('0842', 'utklklass')
        copy_field('tmp.number', 'lklass.$append')
    end
    remove_field('tmp')
end
# Subject fields
# For every 6XX field a temporary 'tmp' object is built with the term
# (subfields a, b, v, x, y, z joined with ' : '), the vocabulary ($2) and
# the identifier ($0). It is then filed under forms/form* (655) or
# subjects/sub plus the per-vocabulary arrays (everything else).
if marc_has('6**')
    marc_map('6**abvxyz', 'tmp.term', join: ' : ')
    marc_map('6**2', 'tmp.vocabulary')
    marc_map('6**0', 'tmp.id')

    # The vocabulary may also be given by the second indicator:
    # 0 = LCSH, 2 = MeSH. The bracket part of a MARC path selects fields BY
    # indicator value, so the indicator to test goes inside the brackets.
    # (The earlier form marc_match('6**[,2]', '0') selected fields with second
    # indicator 2 and then matched their *content* against "0".)
    if marc_has('6**[,0]')
        set_field('tmp.vocabulary', 'lcsh')
    end
    if marc_has('6**[,2]')
        set_field('tmp.vocabulary', 'mesh')
    end

    # Remember which tag the term came from (only these four are recorded).
    if marc_has('648')
        set_field('tmp.type', '648')
    end
    if marc_has('650')
        set_field('tmp.type', '650')
    end
    if marc_has('651')
        set_field('tmp.type', '651')
    end
    if marc_has('655')
        set_field('tmp.type', '655')
    end

    # 655: form/genre terms
    if all_equal('tmp.type', '655')
        copy_field('tmp', 'forms.$append')
        # NOTE(review): the targets below ('form.$append', 'form(noubomn)',
        # 'form(humord)') differ from the 'form.noubomn' / 'form.humord'
        # arrays initialised before the loop - verify the intended paths.
        copy_field('tmp.term', 'form.$append')
        if all_equal('tmp.vocabulary', 'noubomn')
            copy_field('tmp.term', 'form(noubomn).$append')
        end
        if all_equal('tmp.vocabulary', 'humord')
            copy_field('tmp.term', 'form(humord).$append')
        end
    end

    # Everything except 655: topical subjects
    unless marc_has('655')
        copy_field('tmp', 'subjects.$append')
        copy_field('tmp.term', 'sub.$append')
        if all_equal('tmp.vocabulary', 'lcsh')
            copy_field('tmp.term', 'lcsh.$append')
        end
        if all_equal('tmp.vocabulary', 'mesh')
            copy_field('tmp.term', 'mesh.$append')
        end
        if all_equal('tmp.vocabulary', 'noubomn')
            copy_field('tmp.term', 'noubomn.$append')
        end
        if all_equal('tmp.vocabulary', 'noubomr')
            copy_field('tmp.term', 'noubomr.$append')
        end
        if all_equal('tmp.vocabulary', 'noubojur')
            copy_field('tmp.term', 'noubojur.$append')
        end
        if all_equal('tmp.vocabulary', 'humord')
            copy_field('tmp.term', 'humord.$append')
        end
        if all_equal('tmp.vocabulary', 'tekord')
            copy_field('tmp.term', 'tekord.$append')
        end
    end

    remove_field('tmp')
end
# Enrichments
# 856: URL ($u) and type ($3), appended to 'enrichments'.
if marc_has('856')
    marc_map('856u', 'tmp.url')
    marc_map('8563', 'tmp.type')
    copy_field('tmp', 'enrichments.$append')
    remove_field('tmp')
end

# Holdings
# 866: description ($a) and public note ($z), appended to 'holdings'.
if marc_has('866')
    marc_map('866a', 'tmp.description')
    marc_map('866z', 'tmp.public_note')
    copy_field('tmp', 'holdings.$append')
    remove_field('tmp')
end

# AVA: one entry per field, appended to 'items'.
# NOTE(review): the subfield-to-key mapping below is taken as-is; the
# meaning of the individual AVA subfields is not verifiable from this file.
if marc_has('AVA')
    # Location and status
    marc_map('AVAb', 'tmp.library')
    marc_map('AVAc', 'tmp.collection')
    marc_map('AVAd', 'tmp.call_number')
    marc_map('AVAe', 'tmp.item_status')
    marc_map('AVAf', 'tmp.item_policy')
    marc_map('AVAt', 'tmp.material_type')
    # Chronology / enumeration
    marc_map('AVAi', 'tmp.chron_i')
    marc_map('AVAj', 'tmp.chron_j')
    marc_map('AVAm', 'tmp.enum_a')
    marc_map('AVAn', 'tmp.enum_b')
    # Item details
    marc_map('AVAp', 'tmp.barcode')
    marc_map('AVAy', 'tmp.fullfilment_note')
    # Dates and acquisition
    marc_map('AVAg', 'tmp.create_date')
    marc_map('AVAh', 'tmp.update_date')
    marc_map('AVAw', 'tmp.due_back_date')
    marc_map('AVAu', 'tmp.po_line_number')
    marc_map('AVAv', 'tmp.receiving_date')
    # Identifiers
    marc_map('AVA0', 'tmp.id')
    marc_map('AVA1', 'tmp.holding_id')
    copy_field('tmp', 'items.$append')
    remove_field('tmp')
end
end
# sort_field('subjects', uniq:1)

# Drop the raw MARC data now that every field has been mapped.
remove_field('record')

# Important to run after we have done all the MARC processing, since it
# will invalidate the MARC JSON.
vacuum()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment