require 'csv'
require 'marc'
require 'set'

### Normalization methods

# Builds the heading string for a field by joining the values of its
# subfields, in order, with a single space between them.
#
# Subfields whose value is nil are skipped. Separators are placed by position
# rather than by looking each subfield up in the list, so a subfield that
# compares equal to the first one is still preceded by a space.
#
# @param field [#subfields] object whose subfields each respond to #value
# @return [String] the joined values with leading/trailing whitespace removed
def get_heading_from_authority_field(field)
  field.subfields.map(&:value).compact.join(' ').strip
end

# Reduces a heading to a compact key for matching against other headings.
#
# Steps, in order: drop invalid byte sequences, apply Unicode normalization,
# remove combining marks (U+0300-U+036F) and square brackets, downcase,
# remove all punctuation and whitespace, and keep at most the first 1024
# characters. The argument itself is never modified.
#
# @param heading [String] the heading text to normalize
# @param normal_form [Symbol] form passed to String#unicode_normalize
# @return [String] the normalized key
def normalize_heading_for_local_search(heading:, normal_form: :nfd)
  heading
    .scrub('')
    .unicode_normalize(normal_form)
    .gsub(/[\u0300-\u036f\[\]]/, '')
    .downcase
    .gsub(/[[:punct:][:space:]]/, '')[0..1023]
end

### Create a hash where the normalized form is the key and the value is a hash with the following elements
### :tag   - 1xx tag from the authority record
### :field - 1xx field from the authority record
### When several records normalize to the same key, the record read last wins.

# Adds the 1xx heading of one authority record to authority_hash.
#
# The record is skipped (nothing is added) when:
# * it has no 1xx field, or its 1xx tag is listed in skip_tags;
# * it has no 008 field, so the filters below cannot be evaluated;
# * skip_subdivisions is true and 008/09 is 'd';
# * require_subject_use is true and 008/15 is not 'a';
# * the heading normalizes to an empty string.
#
# @param authority_hash [Hash] lookup table being built (modified in place)
# @param record [#fields, #[]] authority record
# @param skip_subdivisions [Boolean] skip records whose 008/09 is 'd'
# @param require_subject_use [Boolean] keep only records whose 008/15 is 'a'
# @param skip_tags [Array<String>] 1xx tags that must not be indexed
# @return [void]
def index_authority_record(authority_hash, record, skip_subdivisions:, require_subject_use:, skip_tags: [])
  field = record.fields.find { |candidate| candidate.tag.start_with?('1') }
  return if field.nil? || skip_tags.include?(field.tag)

  control = record['008']
  return if control.nil? || control.value.nil?

  fixed = control.value
  return if skip_subdivisions && fixed[9] == 'd'
  return if require_subject_use && fixed[15] != 'a'

  heading = get_heading_from_authority_field(field)
  normalized = normalize_heading_for_local_search(heading: heading)
  # An empty key carries no heading text and could only produce false matches.
  return if normalized.empty?

  authority_hash[normalized] = { tag: field.tag, field: field }
end

# Reads every MARC file matching glob_pattern and indexes its records.
#
# @param authority_hash [Hash] lookup table being built (modified in place)
# @param glob_pattern [String] pattern handed to Dir.glob
# @param filters [Hash] keyword options forwarded to index_authority_record
# @return [void]
def load_authority_headings(authority_hash, glob_pattern, **filters)
  Dir.glob(glob_pattern).each do |file|
    MARC::Reader.new(file).each do |record|
      index_authority_record(authority_hash, record, **filters)
    end
  end
end

authority_hash = {}

# The load order matters: later sources overwrite earlier ones on key collisions.
load_authority_headings(authority_hash, 'lcnames authority filepath',
                        skip_subdivisions: true, require_subject_use: true)
load_authority_headings(authority_hash, 'authority filepath',
                        skip_subdivisions: true, require_subject_use: false)
load_authority_headings(authority_hash, 'lcsh authority filepath',
                        skip_subdivisions: true, require_subject_use: true, skip_tags: ['155'])
load_authority_headings(authority_hash, 'lcgft authorities filepath',
                        skip_subdivisions: false, require_subject_use: true)

# Normalized forms of the terms listed one per line in the input file.
# File.foreach closes the file once it has been read. A Set removes duplicates
# and keeps the membership test cheap, since include? is called once per 653.
unwanted_terms = Set.new(
  File.foreach('filepath.txt').map { |line| normalize_heading_for_local_search(heading: line.chomp) }
)

# Iterate through each Cotsen MARC record.
# For each 653, look up the normalized form of its $a in authority_hash.
# If there is a match and the term is not in unwanted_terms, replace the 653
# with a 6xx field built from the authority record's 1xx field.
# Write one tab-delimited report line per 653 examined: converted fields show
# the new 6xx field, retained fields leave that column empty.
# Only records with at least one converted field are written to the XML output.

# Returns value with a terminal period appended, unless it is empty or already
# ends with a period, closing parenthesis, question mark, exclamation mark or
# hyphen (so "Inc." does not become "Inc..").
#
# @param value [String]
# @return [String]
def add_terminal_period(value)
  return value if value.empty? || value =~ /[.)?!\-]\z/

  "#{value}."
end

# Builds the 6xx field that replaces a matched 653.
#
# @param authority_match [Hash] entry with :tag (1xx tag) and :field (1xx field)
# @return [MARC::DataField] new field with copied subfields
def build_subject_field(authority_match)
  source = authority_match[:field]
  tag = authority_match[:tag].sub(/\A1/, '6')

  # Per MARC 21, an authority 130 carries its nonfiling-characters count in the
  # second indicator, while a bibliographic 630 carries it in the first.
  indicator1 = tag == '630' ? source.indicator2 : source.indicator1
  new_field = MARC::DataField.new(tag, indicator1, '0')

  source.subfields.each do |subfield|
    new_field.append(MARC::Subfield.new(subfield.code.dup, subfield.value.dup))
  end

  last_subfield = new_field.subfields.last
  if last_subfield && last_subfield.value
    last_subfield.value = add_terminal_period(last_subfield.value)
  end

  # 655 fields get second indicator 7 and name their source in $2.
  if tag == '655'
    new_field.indicator2 = '7'
    new_field.append(MARC::Subfield.new('2', 'lcgft'))
  end

  new_field
end

reader = MARC::XMLReader.new('path', parser: 'magic', ignore_namespace: true)
writer = MARC::XMLWriter.new('path')

begin
  File.open('path', 'w') do |output|
    output.puts "MMS ID\tOriginal 653 Field\tNew 6xx Field"

    reader.each do |record|
      mmsid = record['001'].value
      converted = false

      # select returns a separate array, so deleting from record.fields below
      # does not disturb this iteration.
      record.fields.select { |candidate| candidate.tag == '653' }.each do |field|
        original_653 = field['a']
        new_field = nil

        # A 653 without $a has nothing to look up; it is kept and reported as retained.
        unless original_653.nil?
          normalized = normalize_heading_for_local_search(heading: original_653)
          authority_match = authority_hash[normalized]

          if authority_match && !unwanted_terms.include?(normalized)
            new_field = build_subject_field(authority_match)
            record.append(new_field)
            record.fields.delete(field)
            converted = true
          end
        end

        output.puts "#{mmsid}\t#{original_653}\t#{new_field}"
      end

      writer.write(record) if converted
    end
  end
ensure
  # Close the XML writer even when processing fails part-way.
  writer.close
end
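More about the 008/15 value "b" in authority records (008 position 15, "Heading use – subject added entry"; "b" means the heading is not appropriate for use as a subject added entry):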
![image](https://private-user-images.githubusercontent.com/54282275/330939217-082da02d-276f-4cb5-b093-405ccca0ce32.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MTkyMjk4NDcsIm5iZiI6MTcxOTIyOTU0NywicGF0aCI6Ii81NDI4MjI3NS8zMzA5MzkyMTctMDgyZGEwMmQtMjc2Zi00Y2I1LWIwOTMtNDA1Y2NjYTBjZTMyLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA2MjQlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNjI0VDExNDU0N1omWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTQ3YTg2OTZiZjQ4MzIyYWY0ODE4NWJlZWYyMmM1MWE5ODE3NGQ1NWQyMGFiYjFiZWU0OWRlYjlhNWY4MGQ0OTcmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.6rIOXGcndF7AyD8B7eV-xKaCM4pxcIaGOJRcZ4TEtbo)
![image](https://private-user-images.githubusercontent.com/54282275/330939425-5131a63e-0f89-409e-9512-0f1c97fec755.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MTkyMjk4NDcsIm5iZiI6MTcxOTIyOTU0NywicGF0aCI6Ii81NDI4MjI3NS8zMzA5Mzk0MjUtNTEzMWE2M2UtMGY4OS00MDllLTk1MTItMGYxYzk3ZmVjNzU1LnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA2MjQlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNjI0VDExNDU0N1omWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTJjYzhmZmNmMGQxODVkMzgwZGUwY2JkMTljN2U3ZGI3YWQ3OTBjMjM0YTdkMDUzZTVmZTE0Y2NkZDViMzA0MzUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.VxdMyXlyKMD6if6K6AHL7tSw6yhFyXynoAmhY6bxBDM)
![image](https://private-user-images.githubusercontent.com/54282275/330939628-0031f972-6431-4568-b031-b85fa100b4ed.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MTkyMjk4NDcsIm5iZiI6MTcxOTIyOTU0NywicGF0aCI6Ii81NDI4MjI3NS8zMzA5Mzk2MjgtMDAzMWY5NzItNjQzMS00NTY4LWIwMzEtYjg1ZmExMDBiNGVkLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA2MjQlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNjI0VDExNDU0N1omWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWJlMTUwY2I1MWI2ZmFiNDE5N2EwZDc4N2NlZTRkNDZjZjJkYjBiNzdlZjBkZWEwMzlmNmQwODExMmEzYTRiM2YmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.WP_tthUHlhwUgxPjNqiVKu8q8omQ536VdqnlVFBckBQ)