mjlassila/gist:73bec8543347efd0b977e50c33094b56

## gistfile1.txt
# vim:set ft=perl:ts=4:sw=4
#
# Ref.: http://librecat.org/Catmandu/
# https://github.com/LibreCat/Catmandu/wiki/Example%20Fix%20Script
# https://github.com/scriptotek/simplemarcparser/blob/master/src/BibliographicRecord.php

# For ElasticSearch 2.0
# See https://github.com/LibreCat/Catmandu-Store-Elasticsearch/commit/63795416d2585eab7af1d5263f5823b4cae94251
# <s>Note that we use _identifier over _id to cover deleted records which do not have _id</s>
# UPDATE: When importing from a MARC dump, we don't have OAI IDs, so use the simple _ids instead.

# Records that carry an _identifier are checked against the expected OAI
# prefix; records without one (e.g. MARC dump imports, see the header note)
# skip this check entirely.
if exists('_identifier')
    # For some reason about 18000 records arrive as duplicates with a wrong
    # identifier. Reject everything that is not published under urm_publish.
    # Previous prefix: '^oai:ubo.bibsys.no'
    unless all_match('_identifier', '^oai:urm_publish')
        reject()
    end
end

# Fix _id for deleted records: strip the OAI prefix, then keep the value
# under oai_id instead of _id.
# Previous prefix: "oai:ubo.bibsys.no:"
replace_all('_id', "oai:urm_publish:", "")
move_field('_id', 'oai_id')

#if exists(_id)
#remove_field(_id)

# Note: We need to store both the OAI record ID ('oai:ubo.bibsys.no:990000380754702204')
# and the bibliographic record ID ('990000310544702204'), since for deleted records
# we only get the OAI record ID. It is quite useful to be able to match that with the
# original record ID.

timestamp('timestamp')  # set the key 'timestamp' to the current time (unix timestamp)

# IDs

    # Control number (001) is kept on its own as mms_id.
    marc_map('001', 'mms_id')

    # any_id collects every identifier we map, in this order:
    # 001, 020$a, 020$e (Alma MARC), then the whole 035 field.
    set_array('any_id')
    marc_map('001',  'any_id.$append')
    marc_map('020a', 'any_id.$append')
    marc_map('020e', 'any_id.$append')
    marc_map('035',  'any_id.$append')

    # Remove a leading "(...)" prefix from each collected value.
    replace_all('any_id.*', '^\(.*\)(.*)$', '$1')

# LCCN

    marc_map('010a', 'lccn')

# ISBN/ISSN

    # ISBN: 020$a plus 020$e (Alma MARC). Repeated subfields are glued with
    # '==' by marc_map; the join/split round trip then flattens everything
    # into one flat list of single values.
    marc_map(020a, isbn.$append, join:'==')
    marc_map(020e, isbn.$append, join:'==')
    join_field(isbn, '==')
    split_field(isbn, '==')
    # Keep only the leading run of digits, x/X and hyphens.
    replace_all(isbn.*, '^([0-9xX-]+).*$', '$1')

    # ISSN: 022$a, flattened and trimmed the same way.
    marc_map(022a, issn.$append, join:'==')
    join_field(issn, '==')
    split_field(issn, '==')
    replace_all(issn.*, '^([0-9xX-]+).*', '$1')

# Material

    # 'book' = leader position 6 is 'a' AND leader position 7 is one of a/c/d/m.
    if marc_match('LDR/6', 'a')
        if marc_match('LDR/7', '[acdm]')
            set_field(material, 'book')
        end
    end

    # First two characters of 007, translated through field_007.csv.
    # Codes missing from the table become 'Unknown'.
    marc_map('007/0-1', mat_desig)
    lookup(mat_desig, 'field_007.csv', default:'Unknown')


# Language

    # 008 positions 35-37; a value made up of non-word characters is
    # replaced by 'und' (undefined).
    marc_map('008_/35-37', lang)
    if all_match(lang, '\W+')
        set_field(lang, 'und')
    end


# Title

    # Full title: 245 $a and $b joined by a space, square brackets unwrapped,
    # trailing punctuation (a final '.' or spaced non-word characters) removed.
    marc_spec('245$a$b', title, join:' ')
    replace_all(title, '\[(.*)\]', '$1')
    replace_all(title, '((\s+\W\s*)+|\.)$', '')

    # Sort key: title without non-word characters, cut to 50 chars, lower-cased.
    copy_field(title, title_sort)
    replace_all(title_sort, '\W+', '')
    substring(title_sort, 0, 50)
    downcase(title_sort)

    # NOTE(review): copying a field onto itself looks like a no-op; kept as is.
    copy_field(title, title)

    # Short title: 245 $a only, with the same trailing-punctuation cleanup.
    marc_map('245a', title_short)
    replace_all(title_short, '((\s+\W\s*)+|\.)$', '')

    # Whole 246 field, subfields joined by a space.
    marc_map('246', title_remainder, join:' ')

#- Authors / contributors

    # 100 and 700 ($a $b joined by a space) always contribute to `author`.
    marc_map(100ab, author.$append, join:' ')
    marc_map(700ab, author.$append, join:' ')

    # 720 is added too, unless `type` matches phd, master or bachelor.
    # NOTE(review): `type` is not set anywhere in this script; presumably it
    # is provided upstream -- confirm.
    unless all_match(type, 'phd|master|bachelor')
        marc_map(720ab, author.$append, join:' ')
    end

    # author_names()
    # NOTE(review): self-copy, looks like a no-op; kept as is.
    copy_field(author, author)

#- Imprint

    marc_map('260a', place_of_publication)
    marc_map('260b', publisher)

    # Year from 008 positions 7-10.
    #  - dropped when it contains a run of four of the characters u ^ ? -
    #  - any remaining non-digit is replaced by '0'
    #  - dropped when the resulting number is greater than 2025
    # NOTE(review): 2025 is a hard-coded upper bound and will need bumping.
    marc_map('008_/7-10', 008_year)
    if all_match(008_year, '[u^?-]{4}')
        remove_field(008_year)
    end
    replace_all(008_year, '\D', '0')
    # Guard for non-empty values first: greater_than will crash if given an
    # empty string.
    if all_match(008_year, '^.+$')
        if greater_than(008_year, 2025)
            remove_field(008_year)
        end
    end

    # Year from 260$c with every non-digit stripped (removes c., [], etc.).
    # If what is left is greater than 2025, use the 008 year instead, when
    # there is one.
    marc_map('260c', pub_year)
    replace_all(pub_year, '\D', '')
    # Same non-empty guard as above before calling greater_than.
    if all_match(pub_year, '^.+$')
        if greater_than(pub_year, 2025)
            if exists(008_year)
                copy_field(008_year, pub_year)
            end
        end
    end

    #if greater_than('2025','year')
    #    remove_field('year')
    #end

    # 008 position 6 equal to 'b': prefix the 008 year with a minus sign.
    # NOTE(review): presumably this marks a B.C. date -- confirm.
    if marc_match('008_/6-6', 'b')
        prepend(008_year, '-')
    end

#- Edition
    marc_map(250a, edition)

#- Description
    marc_map(300a, desc_extend)

#- Summary
    # 505$a and 520$a each append one entry; repeated subfields within a
    # mapping are joined with a newline.
    marc_map(505a, summary.$append, join:"\n")
    marc_map(520a, summary.$append, join:"\n")


# Start every collection filled by the marc_each loop below as an empty array.

# Subject objects, for presentation
set_array(subjects)

# Form/genre objects, plus per-vocabulary form terms
set_array(forms)
set_array(form.noubomn)
set_array(form.humord)

# Plain subject terms, for search: all terms, then one list per vocabulary
set_array(sub)
set_array(noubomn)
set_array(noubomr)
set_array(noubojur)
set_array(humord)
set_array(tekord)
set_array(mesh)
set_array(lcsh)

# Classification numbers
set_array(lklass)
set_array(ddc_raw)
set_array(ddc)

# Walk the MARC record one field at a time. Inside the loop the marc_* fixes
# only see the current field, so every `if marc_has(...)` below selects the
# branch for that field's tag. `tmp` is scratch space: each branch fills it,
# copies it into the output structure and removes it again.
do marc_each()

    # IDs: 035 fields containing the EXLNZ-47BIBSYS_NETWORK marker
    if marc_match('035', 'EXLNZ-47BIBSYS_NETWORK')
        marc_map('035', 'nz_mms_id')
        replace_all('nz_mms_id', '^\(.*\)(.*)$', '$1')  # remove parenthesis prefix
    end

    # Classification

    if marc_has('060')
        marc_map('060a', 'tmp.number')
        set_field('tmp.system', 'nlm')
        copy_field('tmp', 'classifications.$append')
        remove_field('tmp')
    end

    if marc_has('080')
        marc_map('080a', 'tmp.number')
        marc_map('0802', 'tmp.edition')
        set_field('tmp.system', 'udc')
        copy_field('tmp', 'classifications.$append')
        remove_field('tmp')
    end

    if marc_has('082')
        marc_map('082a', 'tmp.number')
        marc_map('082a', 'tmp.number_numeric')
        # number_numeric keeps digits and dots only
        replace_all('tmp.number_numeric', '[^0-9.]', '')
        marc_map('082q', 'tmp.assigner')
        marc_map('0822', 'tmp.edition')
        set_field('tmp.system', 'ddc')
        copy_field('tmp', 'classifications.$append')
        copy_field('tmp.number', 'ddc_raw.$append')
        copy_field('tmp.number_numeric', 'ddc.$append')
        remove_field('tmp')
    end

    if marc_has('083')
        marc_map('083a', 'tmp.number')  # todo: join $a $c with hyphen, and $z... plus repeating numbers
        marc_map('083a', 'tmp.number_numeric')
        replace_all('tmp.number_numeric', '[^0-9.]', '')
        marc_map('083q', 'tmp.assigner')
        marc_map('0832', 'tmp.edition')
        set_field('tmp.system', 'ddc')
        copy_field('tmp', 'classifications.$append')
        copy_field('tmp.number', 'ddc_raw.$append')
        copy_field('tmp.number_numeric', 'ddc.$append')
        remove_field('tmp')
    end

    if marc_has('084')
        marc_map('084a', 'tmp.number')
        marc_map('084q', 'tmp.assigner')
        marc_map('0842', 'tmp.system')
        copy_field('tmp', 'classifications.$append')
        # The scheme of an 084 field is in its own $2. The current field is
        # an 084 here, so a 6XX path could never match and lklass would
        # stay empty.
        if marc_match('0842', 'utklklass')
            copy_field('tmp.number', 'lklass.$append')
        end
        remove_field('tmp')
    end


    # Subject fields
    if marc_has('6**')

        marc_map('6**abvxyz', 'tmp.term', join: ' : ')
        marc_map('6**2', 'tmp.vocabulary')
        marc_map('6**0', 'tmp.id')

        # Vocabulary implied by the second indicator: 0 -> lcsh, 2 -> mesh.
        # `[,N]` in a MARC path filters on the second indicator's value, so
        # the indicator has to be named in the path itself; matching a regex
        # against the field content would test the subfield text instead.
        if marc_has('6**[,0]')
            set_field('tmp.vocabulary', 'lcsh')
        end
        if marc_has('6**[,2]')
            set_field('tmp.vocabulary', 'mesh')
        end

        # Remember the tag for the 6XX fields we distinguish.
        if marc_has('648')
            set_field('tmp.type', '648')
        end
        if marc_has('650')
            set_field('tmp.type', '650')
        end
        if marc_has('651')
            set_field('tmp.type', '651')
        end
        if marc_has('655')
            set_field('tmp.type', '655')
        end

        # 655 goes to the form/genre collections ...
        if all_equal('tmp.type', '655')
            copy_field('tmp', 'forms.$append')
            # NOTE(review): the arrays initialised before this loop are
            # 'form.noubomn' / 'form.humord', while the targets here are
            # 'form' and 'form(noubomn)' / 'form(humord)'. Left unchanged;
            # confirm which paths are intended.
            copy_field('tmp.term', 'form.$append')
            if all_equal('tmp.vocabulary', 'noubomn')
                copy_field('tmp.term', 'form(noubomn).$append')
            end
            if all_equal('tmp.vocabulary', 'humord')
                copy_field('tmp.term', 'form(humord).$append')
            end
        end

        # ... every other 6XX goes to the subject collections.
        unless marc_has('655')
            copy_field('tmp', 'subjects.$append')
            copy_field('tmp.term', 'sub.$append')
            if all_equal('tmp.vocabulary', 'lcsh')
                copy_field('tmp.term', 'lcsh.$append')
            end
            if all_equal('tmp.vocabulary', 'mesh')
                copy_field('tmp.term', 'mesh.$append')
            end
            if all_equal('tmp.vocabulary', 'noubomn')
                copy_field('tmp.term', 'noubomn.$append')
            end
            if all_equal('tmp.vocabulary', 'noubomr')
                copy_field('tmp.term', 'noubomr.$append')
            end
            if all_equal('tmp.vocabulary', 'noubojur')
                copy_field('tmp.term', 'noubojur.$append')
            end
            if all_equal('tmp.vocabulary', 'humord')
                copy_field('tmp.term', 'humord.$append')
            end
            if all_equal('tmp.vocabulary', 'tekord')
                copy_field('tmp.term', 'tekord.$append')
            end
        end

        remove_field('tmp')
    end

    # Enrichments

    if marc_has('856')
        marc_map('856u', 'tmp.url')
        marc_map('8563', 'tmp.type')
        copy_field('tmp', 'enrichments.$append')
        remove_field('tmp')
    end

    # Holdings

    if marc_has('866')
        marc_map('866a', 'tmp.description')
        marc_map('866z', 'tmp.public_note')
        copy_field('tmp', 'holdings.$append')
        remove_field('tmp')
    end

    # Items: one entry per AVA field
    if marc_has('AVA')
        marc_map('AVAb', 'tmp.library')
        marc_map('AVAc', 'tmp.collection')
        marc_map('AVAd', 'tmp.call_number')
        marc_map('AVAe', 'tmp.item_status')
        marc_map('AVAf', 'tmp.item_policy')
        marc_map('AVAt', 'tmp.material_type')

        marc_map('AVAi', 'tmp.chron_i')
        marc_map('AVAj', 'tmp.chron_j')
        marc_map('AVAm', 'tmp.enum_a')
        marc_map('AVAn', 'tmp.enum_b')

        marc_map('AVAp', 'tmp.barcode')
        marc_map('AVAy', 'tmp.fullfilment_note')
        marc_map('AVAg', 'tmp.create_date')
        marc_map('AVAh', 'tmp.update_date')
        marc_map('AVAw', 'tmp.due_back_date')

        marc_map('AVAu', 'tmp.po_line_number')
        marc_map('AVAv', 'tmp.receiving_date')

        marc_map('AVA0', 'tmp.id')
        marc_map('AVA1', 'tmp.holding_id')

        copy_field('tmp', 'items.$append')
        remove_field('tmp')
    end

end

# sort_field('subjects', uniq:1)

# Final cleanup: drop the raw MARC structure and prune empty values.
#
# These two fixes must remain the last statements of the script. Once
# `record` is gone every marc_* fix comes up empty, so running the mapping
# steps again after this point would add nothing, while their set_array()
# calls would reset the collections (any_id, subjects, sub, ddc, ...) and a
# second lookup() on mat_desig would replace the translated value with
# 'Unknown'.
remove_field(record)
vacuum()  # Important to run after we have done all the marc processing, since it will invalidate the MARC JSON