Forked from danmichaelo/gist:d52035c4204cbe2b1c21c717102c3161
Created
November 4, 2019 13:22
-
-
Save mjlassila/73bec8543347efd0b977e50c33094b56 to your computer and use it in GitHub Desktop.
marc map.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim:set ft=perl:ts=4:sw=4 | |
# | |
# Ref.: http://librecat.org/Catmandu/ | |
# https://github.com/LibreCat/Catmandu/wiki/Example%20Fix%20Script | |
# https://github.com/scriptotek/simplemarcparser/blob/master/src/BibliographicRecord.php | |
# For ElasticSearch 2.0 | |
# See https://github.com/LibreCat/Catmandu-Store-Elasticsearch/commit/63795416d2585eab7af1d5263f5823b4cae94251 | |
# <s>Note that we use _identifier over _id to cover deleted records which do not have _id</s> | |
# UPDATE: When importing from a MARC dump, we don't have OAI IDs, so use the simple _ids instead. | |
# Reject harvested records whose OAI identifier lacks the expected prefix.
# For some reason about 18000 records arrive as duplicates with a wrong
# identifier. Records without an _identifier field are not tested here.
# (Prefix used earlier: '^oai:ubo.bibsys.no')
if exists('_identifier')
    unless all_match('_identifier', '^oai:urm_publish')
        reject()
    end
end

# Fix _id for deleted records: strip the OAI prefix (earlier prefix was
# "oai:ubo.bibsys.no:") and keep the result under oai_id instead of _id.
replace_all('_id', "oai:urm_publish:", "")
move_field('_id', 'oai_id')
# Disabled alternative, kept for reference:
#   if exists(_id)
#   remove_field(_id)

# Note: We need to store both the OAI record ID ('oai:ubo.bibsys.no:990000380754702204')
# and the bibliographic record ID ('990000310544702204'), since for deleted records
# we only get the OAI record ID, and it is quite useful to be able to match that
# with the original record ID.

# Store the current time (unix timestamp) under the key 'timestamp'.
timestamp('timestamp')
# IDs
# mms_id holds the 001 value; any_id collects 001, 020 $a, 020 $e and 035,
# in that order.
set_array('any_id')
marc_map('001', 'mms_id')
marc_map('001', 'any_id.$append')
marc_map('020a', 'any_id.$append')
marc_map('020e', 'any_id.$append')    # Alma MARC
marc_map('035', 'any_id.$append')
# Remove a leading "(...)" prefix from every collected identifier.
replace_all('any_id.*', '^\(.*\)(.*)$', '$1')

# LCCN
marc_map('010a', 'lccn')

# ISBN
# Values are joined with '==' and split again so that the result is one flat
# list; afterwards only the leading run of digits, 'x'/'X' and '-' is kept.
marc_map('020a', 'isbn.$append', join:'==')
marc_map('020e', 'isbn.$append', join:'==')    # Alma MARC
join_field('isbn', '==')
split_field('isbn', '==')
replace_all('isbn.*', '^([0-9xX-]+).*$', '$1')

# ISSN (same treatment as ISBN)
marc_map('022a', 'issn.$append', join:'==')
join_field('issn', '==')
split_field('issn', '==')
replace_all('issn.*', '^([0-9xX-]+).*', '$1')
# Material
# Leader position 6 = 'a' combined with position 7 in [acdm] is labelled 'book'.
if marc_match('LDR/6', 'a')
    if marc_match('LDR/7', '[acdm]')
        set_field('material', 'book')
    end
end

# Material designation: 007 positions 0-1, translated through field_007.csv
# ('Unknown' when the code is not listed).
marc_map('007/0-1', 'mat_desig')
lookup('mat_desig', 'field_007.csv', default:'Unknown')

# Language
# 008 positions 35-37; a value containing non-word characters becomes 'und'.
marc_map('008_/35-37', 'lang')
if all_match('lang', '\W+')
    set_field('lang', 'und')    # undefined
end
# Title
# Full title = 245 $a + $b, with square brackets removed and trailing
# punctuation trimmed.
marc_spec('245$a$b', 'title', join:' ')
replace_all('title', '\[(.*)\]', '$1')
replace_all('title', '((\s+\W\s*)+|\.)$', '')

# Sort key: title without non-word characters, cut to 50 characters, lowercased.
copy_field('title', 'title_sort')
replace_all('title_sort', '\W+', '')
substring('title_sort', 0, 50)
downcase('title_sort')

# NOTE(review): copies 'title' onto itself; looks like a leftover - confirm before removing.
copy_field('title', 'title')

# Variant titles (246) and the short title (245 $a only, trailing punctuation trimmed).
marc_map('246', 'title_remainder', join:' ')
marc_map('245a', 'title_short')
replace_all('title_short', '((\s+\W\s*)+|\.)$', '')
#- Authors / contributors
# 100 and 700 ($a $b) are always collected. 720 is added unless 'type' matches
# phd|master|bachelor.
# NOTE(review): 'type' is not set anywhere in the visible script - confirm where it comes from.
marc_map('100ab', 'author.$append', join:' ')
marc_map('700ab', 'author.$append', join:' ')
unless all_match('type', 'phd|master|bachelor')
    marc_map('720ab', 'author.$append', join:' ')
end
# author_names()
# NOTE(review): copies 'author' onto itself; looks like a leftover - confirm before removing.
copy_field('author', 'author')
#- Imprint
marc_map('260a', 'place_of_publication')
marc_map('260b', 'publisher')

# Year from 008 positions 7-10. A value containing four consecutive characters
# out of u ^ ? - is discarded; any remaining non-digit is replaced by 0.
marc_map('008_/7-10', '008_year')
if all_match('008_year', '[u^?-]{4}')
    remove_field('008_year')
end
replace_all('008_year', '\D', '0')
# Guard against empty values: greater_than will crash if given an empty string.
if all_match('008_year', '^.+$')
    if greater_than('008_year', 2025)
        remove_field('008_year')
    end
end

# Year from 260 $c, digits only (drops "c.", "[]", etc.). When it is larger
# than 2025 and an 008 year exists, the 008 year replaces it.
marc_map('260c', 'pub_year')
replace_all('pub_year', '\D', '')
# Guard against empty values: greater_than will crash if given an empty string.
if all_match('pub_year', '^.+$')
    if greater_than('pub_year', 2025)
        if exists('008_year')
            copy_field('008_year', 'pub_year')
        end
    end
end
# Disabled check, kept for reference:
#   if greater_than('2025','year')
#   remove_field('year')
#   end

# 008 position 6 = 'b': prefix the 008 year with a minus sign.
if marc_match('008_/6-6', 'b')
    prepend('008_year', '-')
end
#- Edition (250 $a)
marc_map('250a', 'edition')

#- Description (300 $a)
marc_map('300a', 'desc_extend')

#- Summary: 505 $a followed by 520 $a, repeated subfields joined by newline
marc_map('505a', 'summary.$append', join:"\n")
marc_map('520a', 'summary.$append', join:"\n")
# Pre-create the arrays that the per-field MARC processing appends to.

# Subject objects (term / vocabulary / id / type) -- for presentation.
set_array('subjects')

# Form/genre (655).
# 'forms' receives the full objects and 'form' the plain terms.
# 'form' has to be an array: the 655 handling appends with 'form.$append'.
# It used to be turned into a hash by set_array('form.noubomn') and
# set_array('form.humord'), which made those appends fail silently.
# The per-vocabulary lists are initialised under the literal keys that the
# 655 handling actually writes to: 'form(noubomn)' and 'form(humord)'.
set_array('forms')
set_array('form')
set_array('form(noubomn)')
set_array('form(humord)')

# Subject terms -- for search: all terms, then one list per vocabulary.
set_array('sub')
set_array('noubomn')
set_array('noubomr')
set_array('noubojur')
set_array('humord')
set_array('tekord')
set_array('mesh')
set_array('lcsh')

# Classification numbers: local 'utklklass' numbers, raw and numeric-only DDC.
set_array('lklass')
set_array('ddc_raw')
set_array('ddc')
do marc_each() | |
# IDs
# A 035 field mentioning EXLNZ-47BIBSYS_NETWORK supplies nz_mms_id, with the
# leading "(...)" prefix removed.
if marc_match('035', 'EXLNZ-47BIBSYS_NETWORK')
    marc_map('035', 'nz_mms_id')
    replace_all('nz_mms_id', '^\(.*\)(.*)$', '$1')
end
# Classification
# Each branch builds a temporary object in 'tmp', appends it to
# 'classifications' and removes 'tmp' again.

# 060: NLM classification.
if marc_has('060')
    marc_map('060a', 'tmp.number')
    set_field('tmp.system', 'nlm')
    copy_field('tmp', 'classifications.$append')
    remove_field('tmp')
end

# 080: UDC, edition from $2.
if marc_has('080')
    marc_map('080a', 'tmp.number')
    marc_map('0802', 'tmp.edition')
    set_field('tmp.system', 'udc')
    copy_field('tmp', 'classifications.$append')
    remove_field('tmp')
end

# 082: DDC. The number is also stored as-is in 'ddc_raw' and, reduced to
# digits and dots, in 'ddc'.
if marc_has('082')
    marc_map('082a', 'tmp.number')
    marc_map('082a', 'tmp.number_numeric')
    replace_all('tmp.number_numeric', '[^0-9.]', '')
    marc_map('082q', 'tmp.assigner')
    marc_map('0822', 'tmp.edition')
    set_field('tmp.system', 'ddc')
    copy_field('tmp', 'classifications.$append')
    copy_field('tmp.number', 'ddc_raw.$append')
    copy_field('tmp.number_numeric', 'ddc.$append')
    remove_field('tmp')
end

# 083: additional DDC numbers, handled like 082.
# TODO: join $a $c with hyphen, and $z... plus repeating numbers
if marc_has('083')
    marc_map('083a', 'tmp.number')
    marc_map('083a', 'tmp.number_numeric')
    replace_all('tmp.number_numeric', '[^0-9.]', '')
    marc_map('083q', 'tmp.assigner')
    marc_map('0832', 'tmp.edition')
    set_field('tmp.system', 'ddc')
    copy_field('tmp', 'classifications.$append')
    copy_field('tmp.number', 'ddc_raw.$append')
    copy_field('tmp.number_numeric', 'ddc.$append')
    remove_field('tmp')
end

# 084: other classification, system taken from $2.
if marc_has('084')
    marc_map('084a', 'tmp.number')
    marc_map('084q', 'tmp.assigner')
    marc_map('0842', 'tmp.system')
    copy_field('tmp', 'classifications.$append')
    # Test $2 of the current 084 field. The previous test looked at '6**2',
    # which cannot match while an 084 field is the one being processed, so
    # 'lklass' was never populated.
    if marc_match('0842', 'utklklass')
        copy_field('tmp.number', 'lklass.$append')
    end
    remove_field('tmp')
end
# Subject fields
# Every 6XX field becomes a temporary object (term, vocabulary, id, type).
# 655 goes to the form/genre lists; everything else goes to 'subjects', 'sub'
# and the list named after its vocabulary.
if marc_has('6**')
    marc_map('6**abvxyz', 'tmp.term', join: ' : ')
    marc_map('6**2', 'tmp.vocabulary')
    marc_map('6**0', 'tmp.id')

    # Vocabulary from the second indicator: 0 = LCSH, 2 = MeSH.
    # '[,N]' in a MARC path is a filter on the second indicator, so the test
    # is "does a 6XX field with that indicator exist". The earlier
    # marc_match('6**[,2]', '0') / marc_match('6**[,2]', '2') matched the
    # *content* of ind2=2 fields against "0" / "2" instead.
    if marc_has('6**[,0]')
        set_field('tmp.vocabulary', 'lcsh')
    end
    if marc_has('6**[,2]')
        set_field('tmp.vocabulary', 'mesh')
    end

    # Record the tag for the field types that are distinguished downstream.
    if marc_has('648')
        set_field('tmp.type', '648')
    end
    if marc_has('650')
        set_field('tmp.type', '650')
    end
    if marc_has('651')
        set_field('tmp.type', '651')
    end
    if marc_has('655')
        set_field('tmp.type', '655')
    end

    # Form/genre (655).
    if all_equal('tmp.type', '655')
        copy_field('tmp', 'forms.$append')
        copy_field('tmp.term', 'form.$append')
        if all_equal('tmp.vocabulary', 'noubomn')
            copy_field('tmp.term', 'form(noubomn).$append')
        end
        if all_equal('tmp.vocabulary', 'humord')
            copy_field('tmp.term', 'form(humord).$append')
        end
    end

    # All other subject fields.
    unless marc_has('655')
        copy_field('tmp', 'subjects.$append')
        copy_field('tmp.term', 'sub.$append')
        if all_equal('tmp.vocabulary', 'lcsh')
            copy_field('tmp.term', 'lcsh.$append')
        end
        if all_equal('tmp.vocabulary', 'mesh')
            copy_field('tmp.term', 'mesh.$append')
        end
        if all_equal('tmp.vocabulary', 'noubomn')
            copy_field('tmp.term', 'noubomn.$append')
        end
        if all_equal('tmp.vocabulary', 'noubomr')
            copy_field('tmp.term', 'noubomr.$append')
        end
        if all_equal('tmp.vocabulary', 'noubojur')
            copy_field('tmp.term', 'noubojur.$append')
        end
        if all_equal('tmp.vocabulary', 'humord')
            copy_field('tmp.term', 'humord.$append')
        end
        if all_equal('tmp.vocabulary', 'tekord')
            copy_field('tmp.term', 'tekord.$append')
        end
    end

    remove_field('tmp')
end
# Enrichments
# 856: URL ($u) and type ($3) are appended to 'enrichments'.
if marc_has('856')
    marc_map('856u', 'tmp.url')
    marc_map('8563', 'tmp.type')
    copy_field('tmp', 'enrichments.$append')
    remove_field('tmp')
end

# Holdings
# 866: description ($a) and public note ($z) are appended to 'holdings'.
if marc_has('866')
    marc_map('866a', 'tmp.description')
    marc_map('866z', 'tmp.public_note')
    copy_field('tmp', 'holdings.$append')
    remove_field('tmp')
end

# AVA: one object per field is appended to 'items'.
if marc_has('AVA')
    marc_map('AVAb', 'tmp.library')
    marc_map('AVAc', 'tmp.collection')
    marc_map('AVAd', 'tmp.call_number')
    marc_map('AVAe', 'tmp.item_status')
    marc_map('AVAf', 'tmp.item_policy')
    marc_map('AVAt', 'tmp.material_type')
    marc_map('AVAi', 'tmp.chron_i')
    marc_map('AVAj', 'tmp.chron_j')
    marc_map('AVAm', 'tmp.enum_a')
    marc_map('AVAn', 'tmp.enum_b')
    marc_map('AVAp', 'tmp.barcode')
    marc_map('AVAy', 'tmp.fullfilment_note')
    marc_map('AVAg', 'tmp.create_date')
    marc_map('AVAh', 'tmp.update_date')
    marc_map('AVAw', 'tmp.due_back_date')
    marc_map('AVAu', 'tmp.po_line_number')
    marc_map('AVAv', 'tmp.receiving_date')
    marc_map('AVA0', 'tmp.id')
    marc_map('AVA1', 'tmp.holding_id')
    copy_field('tmp', 'items.$append')
    remove_field('tmp')
end
end | |
# sort_field('subjects', uniq:1)

# Drop the raw MARC data, then strip empty values.
# Important: vacuum() has to run after all MARC processing is done, since it
# will invalidate the MARC JSON.
remove_field('record')
vacuum()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment