|
require 'csv' |
|
require 'marc' |
|
|
|
### Normalization methods |
|
|
|
# Builds the heading string for an authority field from its subfield values.
#
# Subfields whose value is nil are skipped; the remaining values are joined
# with a single space and surrounding whitespace is stripped.
#
# @param field [#subfields] field whose subfields each respond to +value+
# @return [String] the space-separated heading text ('' when there are no values)
def get_heading_from_authority_field(field)
  # Joining the values avoids looking each subfield up by equality, which lost
  # the separator before any subfield that was equal to the field's first one.
  field.subfields.map(&:value).compact.join(' ').strip
end
|
|
|
# Reduces a heading to a comparison key used for local matching.
#
# Steps, in order: drop invalid byte sequences, apply Unicode normalization,
# strip combining diacritics (U+0300-U+036F) and square brackets, downcase,
# remove all punctuation and whitespace, then keep the first 1024 characters.
#
# @param heading [String, nil] heading text; nil is treated as an empty heading
# @param normal_form [Symbol] form passed to String#unicode_normalize! (default :nfd)
# @return [String] normalized key of at most 1024 characters; the input is not modified
def normalize_heading_for_local_search(heading:, normal_form: :nfd)
  # to_s turns nil into '' so a missing subfield does not raise; dup keeps the
  # caller's string untouched and makes frozen input safe for the bang methods.
  val = heading.to_s.dup
  val.scrub!('')
  val.unicode_normalize!(normal_form)
  val.gsub!(/[\u0300-\u036f\[\]]/, '')
  val.downcase!
  val.gsub!(/[[:punct:][:space:]]/, '')
  val[0..1023]
end
|
|
|
### Create a hash where the normalized form is the key and the value is a hash with the following elements
### :tag   - 1xx tag from the authority record
### :field - 1xx field from the authority record
###
### Sources are loaded in the order LCGFT, LC names, LCSH. When two authority
### records normalize to the same key, the one loaded later replaces the earlier one.
###
### NOTE(review): the cataloger feedback kept with this script asks that only
### headings whose 008/15 is "a" be used as subjects and that the matching order
### across the three vocabularies be revisited. Neither is applied here --
### confirm the intended rules before changing the matching behavior.

# Adds the first 1xx heading of every record in the files matching +pattern+ to +target+.
#
# Records without any 1xx field are skipped.
#
# @param target [Hash] lookup hash to populate (mutated in place)
# @param pattern [String] glob pattern of MARC authority files
# @param skip_subdivision_records [Boolean] when true, skip records whose 008 has
#   'd' at 0-based position 9 (presumably 008/09 "kind of record" = subdivision
#   record in MARC 21 Authority -- TODO confirm)
# @return [Hash] the same +target+ hash
def load_authority_headings(target, pattern, skip_subdivision_records: false)
  Dir.glob(pattern).each do |file|
    MARC::Reader.new(file).each do |record|
      heading_field = record.fields.find { |candidate| candidate.tag.start_with?('1') }
      next unless heading_field

      if skip_subdivision_records
        fixed_field = record['008']
        next if fixed_field && fixed_field.value.to_s[9] == 'd'
      end

      heading = get_heading_from_authority_field(heading_field)
      normalized = normalize_heading_for_local_search(heading: heading)
      target[normalized] = { tag: heading_field.tag, field: heading_field }
    end
  end
  target
end

authority_hash = {}

load_authority_headings(authority_hash, 'lcgft authority filepath')
load_authority_headings(authority_hash, 'lcnames authority filepath', skip_subdivision_records: true)
load_authority_headings(authority_hash, 'lcsh authority filepath', skip_subdivision_records: true)
|
|
|
# Load the terms that must never be replaced, one per line of the input file,
# and store them in normalized form so they compare against authority_hash keys.
unwanted_terms = []

# The block form closes the file even if normalization raises part-way through.
File.open('filepath.txt', 'r') do |input|
  input.each_line do |line|
    unwanted_terms << normalize_heading_for_local_search(heading: line.chomp)
  end
end

unwanted_terms.uniq!
|
|
|
# Iterate through each Cotsen MARC record.
# For each 653, try to find the normalized form of its $a in the authority_hash.
# If there is a match and the term is not in unwanted_terms, replace the 653 field
# with the proper 6xx field built from the authority record.
# Write out a report row for every 653 (changed or retained) for all records;
# only records with at least one replaced 653 are written to the MARCXML output.

# Builds the 6xx field that replaces a matched 653.
#
# The tag is the authority 1xx tag with its leading "1" changed to "6", the first
# indicator is copied from the authority field and the second indicator is "0".
# Subfields are copied; a 655 additionally gets second indicator "7" and $2 lcgft.
#
# @param authority_match [Hash] authority_hash entry with :tag and :field
# @return [MARC::DataField] the new 6xx field
def build_replacement_field(authority_match)
  source_field = authority_match[:field]
  tag = authority_match[:tag].sub(/\A1/, '6')
  new_field = MARC::DataField.new(tag, source_field.indicator1, '0')

  source_field.subfields.each do |subfield|
    new_field.append(MARC::Subfield.new(subfield.code.dup, subfield.value.dup))
  end

  # Add terminal punctuation to the last heading subfield, unless the heading
  # already ends with a period or a right parenthesis.
  final_subfield = new_field.subfields.last
  final_value = final_subfield && final_subfield.value
  if final_value && !final_value.empty? && !final_value.end_with?('.', ')')
    final_value << '.'
  end

  if new_field.tag == '655'
    new_field.indicator2 = '7'
    new_field.append(MARC::Subfield.new('2', 'lcgft'))
  end

  new_field
end

reader = MARC::XMLReader.new('path', parser: 'magic', ignore_namespace: true)
writer = MARC::XMLWriter.new('path')
output = File.open('path', 'w')

begin
  output.puts "MMS ID\tOriginal 653 Field\tNew 6xx Field"

  reader.each do |record|
    mmsid = record['001'].value
    match = false

    record.fields.select { |candidate| candidate.tag == '653' }.each do |field|
      original_653 = field['a']
      new_field = nil

      # A 653 without $a has nothing to look up; it is reported as retained.
      if original_653
        normalized = normalize_heading_for_local_search(heading: original_653)
        authority_match = authority_hash[normalized]

        if authority_match && !unwanted_terms.include?(normalized)
          match = true
          new_field = build_replacement_field(authority_match)
          record.append(new_field)
          record.fields.delete(field)
        end
      end

      # The third column stays empty when the 653 was retained.
      output.puts "#{mmsid}\t#{original_653}\t#{new_field}"
    end

    writer.write(record) if match
  end
ensure
  # Close both outputs even when a record fails part-way through the run.
  writer.close
  output.close
end
Hi Beck, I have looked at Changed and Unchanged Cotsen 653.gsheet. Great work!
I have the following suggestions for improvement, though I don't know how feasible they are--particularly no. 2. Any that are not feasible can be taken care of manually, although that may not be the most efficient approach. Here are some sample records that illustrate a few issues--
There are three sources of controlled vocabulary: LCGFT; LCNAF; LCSH.
One 653 text string can have more than one match, particularly in the case of format/genre terms. Could we adjust the order of matching as follows? --
The value of the fixed field 008/15 must be “a,” meaning the heading is appropriate for use as a subject added entry
“b” means the heading is not valid for use as a subject
(See Authority record format; Subject Headings Manual H430 page 2)
I understand that the exclusion may be hard to implement. In that case, I will earmark them manually as much as possible.
If a matched heading already ends with a period or a right parenthesis, then no extra period needs to be added.