Beck-Davis/653_lc_terms_matching.md

## 653_lc_terms_matching.md

      
    Raw
  

              653_lc_terms_matching.md
            
          
    These scripts are intended to find exact matches to determine whether LC terms can be added to new 6xx fields programmatically. The code uses methods adapted from pulibrary/authority_control. The methods reduce subject headings in LC authority MARC records and 653 fields from bibliographic MARC records to simple normalized strings, which are then compared for exact matches.
The methods get_heading_from_authority_field and normalize_heading_for_local_search are borrowed from pulibrary/authority_control.
Hash authority_hash is created with the normalized forms of the headings from the authority records.
Array unwanted_terms is created from a txt list of terms in the 653 fields that should be ignored during matching.
While iterating over the 653s in each record, the original field value is stored to be written out in a report.
The 653 is normalized using the normalize_heading_for_local_search method.
If the normalized 653 is found in authority_hash and is not in unwanted_terms, a new 6xx field is constructed from the 1xx field of the authority record.
If the new 6xx field is a 655, the second indicator is changed to '7' and a subfield $2 is appended.
The new 6xx field is appended to the bibliographic record, and the 653 is deleted.
If there is no match between the normalized 653 and the normalized lc heading, the 653 is retained.
The changes and retained 653 fields are written to a report.

  
## 653_lcterms_matching.rb
require 'csv'
require 'marc'

### Normalization methods

# Flattens a MARC field into one heading string.
#
# The values of all subfields are taken in field order, nil values are
# dropped, the rest are joined with a single space, and leading/trailing
# whitespace is stripped from the result.
#
# The separator is decided by position in the joined list rather than by
# looking each subfield up again in field.subfields: an equality-based lookup
# returns the first equal subfield, so a repeated subfield (same code and
# value as the first one) used to be glued on without a space.
#
# field   - any object whose #subfields yields objects responding to #value.
# Returns - the heading as a String ('' when there are no usable values).
def get_heading_from_authority_field(field)
  field.subfields.map(&:value).compact.join(' ').strip
end

# Reduces a heading to the bare comparison key used for exact matching.
#
# Steps, in order: drop invalid byte sequences, apply the requested Unicode
# normalization form, remove combining marks (U+0300..U+036F) and square
# brackets, lowercase, remove all punctuation and whitespace, and keep at
# most the first 1024 characters. The caller's string is never modified.
#
# heading:     - the String to normalize.
# normal_form: - form passed to String#unicode_normalize (default :nfd).
# Returns      - a new normalized String.
def normalize_heading_for_local_search(heading:, normal_form: :nfd)
  decomposed = heading.scrub('').unicode_normalize(normal_form)
  unaccented = decomposed.gsub(/[\u0300-\u036f\[\]]/, '').downcase
  unaccented.gsub(/[[:punct:][:space:]]/, '')[0..1023]
end

### Create a hash where the normalized form of the heading is the key and the
### value is a hash with the following elements
### :tag   - 1xx tag from the authority record
### :field - 1xx field from the authority record
### A heading seen again later (in the same or a later file set) replaces the
### earlier entry.

# Adds one authority record to +index+, keyed by its normalized 1xx heading.
#
# index              - Hash being filled (normalized heading => { tag:, field: }).
# record             - authority record; must respond to #fields and #[].
# skip_subdivisions: - when true, records whose 008 has 'd' at position 9 are
#                      ignored ('d' is the "Subdivision" kind of record in the
#                      MARC 21 authority format).
#
# Records without any 1xx field are ignored, and a missing 008 is treated as
# "not a subdivision", so one incomplete record no longer aborts the load.
# The 010 field is not consulted; the value once read from it was never used.
def index_authority_record(index, record, skip_subdivisions: false)
  fixed_field = record['008']
  return if skip_subdivisions && fixed_field && fixed_field.value[9] == 'd'

  heading_field = record.fields.find { |candidate| candidate.tag[0] == '1' }
  return if heading_field.nil?

  heading = get_heading_from_authority_field(heading_field)
  normalized = normalize_heading_for_local_search(heading: heading)
  index[normalized] = { tag: heading_field.tag, field: heading_field }
end

# Reads every MARC file matching +pattern+ and indexes each record in +index+.
def load_authority_files(index, pattern, skip_subdivisions: false)
  Dir.glob(pattern).each do |file|
    MARC::Reader.new(file).each do |record|
      index_authority_record(index, record, skip_subdivisions: skip_subdivisions)
    end
  end
end

authority_hash = {}

# Load order matters only for headings present in more than one set:
# lcgft first, then lcnames, then lcsh (later sets win).
load_authority_files(authority_hash, 'lcgft authority filepath')
load_authority_files(authority_hash, 'lcnames authority filepath', skip_subdivisions: true)
load_authority_files(authority_hash, 'lcsh authority filepath', skip_subdivisions: true)

# Build the list of normalized terms that must never be converted, one term
# per line of the input file. File.foreach closes the file once the last line
# has been read, so no handle is left open. Duplicates are removed.
unwanted_terms = File.foreach('filepath.txt').map do |line|
  normalize_heading_for_local_search(heading: line.chomp)
end.uniq

# Iterate through each Cotsen MARC record.
# For each 653, try to find the normalized form of its $a in the authority_hash.
# If there is a match (and the term is not in unwanted_terms), replace the 653
# field with the proper 6xx field built from the authority record.
# Write out a report with the changes and the retained fields for all records;
# the third column is empty for a 653 that was retained.
# Only records in which at least one 653 was replaced are written to the XML output.

# Builds the 6xx field that replaces a matched 653.
#
# authority_field - the 1xx field stored in authority_hash.
# Returns         - a new MARC::DataField whose tag is the 1xx tag with the
#                   leading '1' replaced by '6', first indicator copied from
#                   the authority field, second indicator '0', and copies of
#                   all subfields. A 655 instead gets second indicator '7'
#                   and a trailing $2 lcgft.
def build_subject_field(authority_field)
  new_field = MARC::DataField.new(authority_field.tag.sub(/\A1/, '6'), authority_field.indicator1, '0')
  authority_field.subfields.each do |subfield|
    new_field.append(MARC::Subfield.new(subfield.code.dup, subfield.value.dup))
  end

  # Terminal period on the last heading subfield; skipped when the value is
  # missing, empty, or already ends in a period (avoids "Inc..").
  last_subfield = new_field.subfields.last
  last_value = last_subfield && last_subfield.value
  last_value << '.' unless last_value.nil? || last_value.empty? || last_value.end_with?('.')

  if new_field.tag == '655'
    new_field.indicator2 = '7'
    new_field.append(MARC::Subfield.new('2', 'lcgft'))
  end
  new_field
end

reader = MARC::XMLReader.new('path', parser: 'magic', ignore_namespace: true)
writer = MARC::XMLWriter.new('path')
output = File.open('path', 'w')

begin
  output.puts "MMS ID\tOriginal 653 Field\tNew 6xx Field"

  reader.each do |record|
    mmsid = record['001'].value
    match = false

    # select returns a separate array, so deleting from record.fields below
    # does not disturb this iteration.
    record.fields.select { |candidate| candidate.tag == '653' }.each do |field|
      original_653 = field['a']
      new_field = nil

      # A 653 without $a cannot be normalized; it is retained and reported.
      # NOTE(review): only the first $a is compared, and a match deletes the
      # whole 653 — confirm that 653s here never carry several terms.
      unless original_653.nil?
        normalized = normalize_heading_for_local_search(heading: original_653)
        authority_match = authority_hash[normalized]

        if authority_match && !unwanted_terms.include?(normalized)
          match = true
          new_field = build_subject_field(authority_match[:field])
          record.append(new_field)
          record.fields.delete(field)
        end
      end

      output.puts "#{mmsid}\t#{original_653}\t#{new_field}"
    end

    writer.write(record) if match
  end
ensure
  # Close both outputs even when a record raises part-way through.
  writer.close
  output.close
end
	require 'csv'
	require 'marc'

	### Normalization methods

	# Flattens a MARC field into one heading string.
	#
	# The values of all subfields are taken in field order, nil values are
	# dropped, the rest are joined with a single space, and leading/trailing
	# whitespace is stripped from the result.
	#
	# The separator is decided by position in the joined list rather than by
	# looking each subfield up again in field.subfields: an equality-based
	# lookup returns the first equal subfield, so a repeated subfield (same
	# code and value as the first one) used to be glued on without a space.
	#
	# field   - any object whose #subfields yields objects responding to #value.
	# Returns - the heading as a String ('' when there are no usable values).
	def get_heading_from_authority_field(field)
	  field.subfields.map(&:value).compact.join(' ').strip
	end

	# Reduces a heading to the bare comparison key used for exact matching.
	#
	# Steps, in order: drop invalid byte sequences, apply the requested Unicode
	# normalization form, remove combining marks (U+0300..U+036F) and square
	# brackets, lowercase, remove all punctuation and whitespace, and keep at
	# most the first 1024 characters. The caller's string is never modified.
	#
	# heading:     - the String to normalize.
	# normal_form: - form passed to String#unicode_normalize (default :nfd).
	# Returns      - a new normalized String.
	def normalize_heading_for_local_search(heading:, normal_form: :nfd)
	  decomposed = heading.scrub('').unicode_normalize(normal_form)
	  unaccented = decomposed.gsub(/[\u0300-\u036f\[\]]/, '').downcase
	  unaccented.gsub(/[[:punct:][:space:]]/, '')[0..1023]
	end

	### Create a hash where the normalized form of the heading is the key and the
	### value is a hash with the following elements
	### :tag   - 1xx tag from the authority record
	### :field - 1xx field from the authority record
	### A heading seen again later (in the same or a later file set) replaces the
	### earlier entry.

	# Adds one authority record to +index+, keyed by its normalized 1xx heading.
	#
	# index              - Hash being filled (normalized heading => { tag:, field: }).
	# record             - authority record; must respond to #fields and #[].
	# skip_subdivisions: - when true, records whose 008 has 'd' at position 9 are
	#                      ignored ('d' is the "Subdivision" kind of record in the
	#                      MARC 21 authority format).
	#
	# Records without any 1xx field are ignored, and a missing 008 is treated as
	# "not a subdivision", so one incomplete record no longer aborts the load.
	# The 010 field is not consulted; the value once read from it was never used.
	def index_authority_record(index, record, skip_subdivisions: false)
	  fixed_field = record['008']
	  return if skip_subdivisions && fixed_field && fixed_field.value[9] == 'd'

	  heading_field = record.fields.find { |candidate| candidate.tag[0] == '1' }
	  return if heading_field.nil?

	  heading = get_heading_from_authority_field(heading_field)
	  normalized = normalize_heading_for_local_search(heading: heading)
	  index[normalized] = { tag: heading_field.tag, field: heading_field }
	end

	# Reads every MARC file matching +pattern+ and indexes each record in +index+.
	def load_authority_files(index, pattern, skip_subdivisions: false)
	  Dir.glob(pattern).each do |file|
	    MARC::Reader.new(file).each do |record|
	      index_authority_record(index, record, skip_subdivisions: skip_subdivisions)
	    end
	  end
	end

	authority_hash = {}

	# Load order matters only for headings present in more than one set:
	# lcgft first, then lcnames, then lcsh (later sets win).
	load_authority_files(authority_hash, 'lcgft authority filepath')
	load_authority_files(authority_hash, 'lcnames authority filepath', skip_subdivisions: true)
	load_authority_files(authority_hash, 'lcsh authority filepath', skip_subdivisions: true)

	# Build the list of normalized terms that must never be converted, one term
	# per line of the input file. File.foreach closes the file once the last
	# line has been read, so no handle is left open. Duplicates are removed.
	unwanted_terms = File.foreach('filepath.txt').map do |line|
	  normalize_heading_for_local_search(heading: line.chomp)
	end.uniq

	# Iterate through each Cotsen MARC record.
	# For each 653, try to find the normalized form of its $a in the authority_hash.
	# If there is a match (and the term is not in unwanted_terms), replace the 653
	# field with the proper 6xx field built from the authority record.
	# Write out a report with the changes and the retained fields for all records;
	# the third column is empty for a 653 that was retained.
	# Only records in which at least one 653 was replaced are written to the XML output.

	# Builds the 6xx field that replaces a matched 653.
	#
	# authority_field - the 1xx field stored in authority_hash.
	# Returns         - a new MARC::DataField whose tag is the 1xx tag with the
	#                   leading '1' replaced by '6', first indicator copied from
	#                   the authority field, second indicator '0', and copies of
	#                   all subfields. A 655 instead gets second indicator '7'
	#                   and a trailing $2 lcgft.
	def build_subject_field(authority_field)
	  new_field = MARC::DataField.new(authority_field.tag.sub(/\A1/, '6'), authority_field.indicator1, '0')
	  authority_field.subfields.each do |subfield|
	    new_field.append(MARC::Subfield.new(subfield.code.dup, subfield.value.dup))
	  end

	  # Terminal period on the last heading subfield; skipped when the value is
	  # missing, empty, or already ends in a period (avoids "Inc..").
	  last_subfield = new_field.subfields.last
	  last_value = last_subfield && last_subfield.value
	  last_value << '.' unless last_value.nil? || last_value.empty? || last_value.end_with?('.')

	  if new_field.tag == '655'
	    new_field.indicator2 = '7'
	    new_field.append(MARC::Subfield.new('2', 'lcgft'))
	  end
	  new_field
	end

	reader = MARC::XMLReader.new('path', parser: 'magic', ignore_namespace: true)
	writer = MARC::XMLWriter.new('path')
	output = File.open('path', 'w')

	begin
	  output.puts "MMS ID\tOriginal 653 Field\tNew 6xx Field"

	  reader.each do |record|
	    mmsid = record['001'].value
	    match = false

	    # select returns a separate array, so deleting from record.fields below
	    # does not disturb this iteration.
	    record.fields.select { |candidate| candidate.tag == '653' }.each do |field|
	      original_653 = field['a']
	      new_field = nil

	      # A 653 without $a cannot be normalized; it is retained and reported.
	      # NOTE(review): only the first $a is compared, and a match deletes the
	      # whole 653 — confirm that 653s here never carry several terms.
	      unless original_653.nil?
	        normalized = normalize_heading_for_local_search(heading: original_653)
	        authority_match = authority_hash[normalized]

	        if authority_match && !unwanted_terms.include?(normalized)
	          match = true
	          new_field = build_subject_field(authority_match[:field])
	          record.append(new_field)
	          record.fields.delete(field)
	        end
	      end

	      output.puts "#{mmsid}\t#{original_653}\t#{new_field}"
	    end

	    writer.write(record) if match
	  end
	ensure
	  # Close both outputs even when a record raises part-way through.
	  writer.close
	  output.close
	end