Beck-Davis/ISO639-3_546_exclusionary.rb

## ignore_546_phrasing.txt
abstract
abstaract
added
also
additional
accompanying
additional
acknowledgement
alphabet
afterword
appen
appedices
appendix
available
biography
biographical
bibliographical
bibliography
citation
chapter
caption
chiefly
contribute
contribution
contain
commentry
commentary
commentaries
cover
cyrillic or Roman
cyrillic and roman
data
dedication
dictionary
definitions
excerpt
each
essay
epigraph
foreword
forword
given
glossary
glossaries
incorporate
inscription
intertitle
introd
introduction
include
index
indexes
indices
interface
key
legend
list
(Latin)
map
narration
note
noted
original
originally
online
occassional
passage
place
postscript
pref
preface
prefatory
quote
quoted
quotation
reading mark
(Roman)
(roman)
some
synopsis
synopses
summary
summaries
sumarries
subtitle
script
section
selected
search
translated
translation
translate
transliterated
title
table
with

## ISO639-3_546_exclusionary.rb
### Most recent (and so far most effective) script for the ISO 639-3 project.
### Reads a text list of terms that may appear in field 546; a record whose 546
### contains any of them is ineligible for a new 041$a field with ISO 639-3 codes.

require_relative './../lib/marc_cleanup'
require 'nokogiri'
require 'set'
require 'pry'

# Build the lookup of language name => ISO 639-3 code from the tab-separated
# code table. Column 1 (index 0) supplies the code and column 7 (index 6) the
# language name. The block form of File.open closes the handle when done.
name_to_code = {} #New hash containing languages as keys and ISO codes as values
File.open('/Users/davisr/Documents/marc/ISO 639-3/iso-639-3_Code_Tables_20230123/iso-639-3_20230123.tab', 'r') do |table|
  table.gets # discard the header row
  table.each_line do |row|
    columns = row.chomp.split("\t")
    code = columns[0]
    language = columns[6]
    # A row without a name would become a nil key, which cannot be matched
    # meaningfully against 546 text, so leave it out of the lookup.
    next if language.nil? || language.empty?

    name_to_code[language] = code
  end
end

# Single XML writer that receives every updated record from all input files.
# NOTE(review): 'filepath' looks like a placeholder for the real output path - confirm before running.
writer = MARC::XMLWriter.new('filepath')

# Terms that, when present in a 546 field, make the record ineligible.
# File.readlines closes the file itself. Blank lines are dropped because an
# empty term turns into an empty regex alternative, which matches every 546
# value and would therefore exclude every record. Term text is kept verbatim.
ignore_546_phrasing = File.readlines('/Users/davisr/git/marc_cleanup/tasks/ignore_546_phrasing.txt', chomp: true)
                          .reject { |term| term.strip.empty? }

# Builds one case-insensitive pattern matching any exclusion term literally,
# anywhere in a string. Terms are escaped so entries such as "(Latin)" require
# the parentheses instead of acting as a regex group; blank terms are ignored.
def exclusion_regex_546(ignore_546_phrasing)
  terms = ignore_546_phrasing.map(&:to_s).reject { |term| term.strip.empty? }
  # Regexp.union escapes each string; an empty list yields a never-matching pattern.
  Regexp.new(Regexp.union(terms).source, Regexp::IGNORECASE)
end

# Precompiles a [pattern, iso_code] pair per language name. A name matches
# (case-sensitively) only when followed by whitespace, '.', ';' or ','.
def language_patterns_546(name_to_code)
  name_to_code.each_with_object([]) do |(language, iso_code), patterns|
    name = language.to_s
    next if name.empty? # an empty name would match on the trailing character alone

    patterns << [/#{Regexp.escape(name)}[\s.;,]/, iso_code]
  end
end

# Adds an 041 _7 field ($a ISO 639-3 codes, $2 iso639-3) to each record whose
# 546 fields name known languages, then writes the record.
#
# A record is left untouched (and not written) when it already has an 041 with
# second indicator 7 and $2 iso639-3, when any 546 subfield contains an
# exclusion term, or when no language name is found.
#
# @param reader [#each] yields MARC records
# @param name_to_code [Hash] language name => ISO 639-3 code
# @param writer [#write] receives each updated record
# @param ignore_546_phrasing [Array<String>] exclusion terms, matched as
#   case-insensitive literal substrings
def new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
  # Compile once per call rather than once per subfield.
  exclusion = exclusion_regex_546(ignore_546_phrasing)
  language_patterns = language_patterns_546(name_to_code)

  reader.each do |record|
    already_coded = record.fields('041').any? do |field|
      field.indicator2 == '7' && field['2'] == 'iso639-3'
    end
    next if already_coded

    notes = record.fields('546').flat_map do |field|
      field.subfields.map { |subfield| subfield.value.to_s }
    end
    next if notes.any? { |note| exclusion.match?(note) }

    new_field = MARC::DataField.new('041', ' ', '7')
    processed_languages = Set.new
    notes.each do |note|
      language_patterns.each do |pattern, iso_code|
        next unless pattern.match?(note)
        next unless processed_languages.add?(iso_code) # nil when the code was already added

        new_field.append(MARC::Subfield.new('a', iso_code.to_s))
      end
    end
    next unless new_field['a']

    new_field.append(MARC::Subfield.new('2', 'iso639-3'))
    record.append(new_field)
    writer.write(record)
  end
end

# Process every input file matched by the glob, sending all updated records to
# the one shared writer. The ensure clause closes the writer even when a file
# raises part-way through, so the output is still finalized; the exception
# itself still propagates.
begin
  Dir.glob('filepath').each do |file|
    puts File.basename(file)
    reader = MARC::XMLReader.new(file, parser: 'magic', ignore_namespace: true)
    new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
  end
ensure
  writer.close
end
	abstract
	abstaract
	added
	also
	additional
	accompanying
	additional
	acknowledgement
	alphabet
	afterword
	appen
	appedices
	appendix
	available
	biography
	biographical
	bibliographical
	bibliography
	citation
	chapter
	caption
	chiefly
	contribute
	contribution
	contain
	commentry
	commentary
	commentaries
	cover
	cyrillic or Roman
	cyrillic and roman
	data
	dedication
	dictionary
	definitions
	excerpt
	each
	essay
	epigraph
	foreword
	forword
	given
	glossary
	glossaries
	incorporate
	inscription
	intertitle
	introd
	introduction
	include
	index
	indexes
	indices
	interface
	key
	legend
	list
	(Latin)
	map
	narration
	note
	noted
	original
	originally
	online
	occassional
	passage
	place
	postscript
	pref
	preface
	prefatory
	quote
	quoted
	quotation
	reading mark
	(Roman)
	(roman)
	some
	synopsis
	synopses
	summary
	summaries
	sumarries
	subtitle
	script
	section
	selected
	search
	translated
	translation
	translate
	transliterated
	title
	table
	with
	###This code is most recent for ISO 639-3 project and has been the most effective
	###Uses a text list of terms that might appear in the field 546 that will
	###make a record ineligible for new 041$a fields with ISO 639-3 codes

	require_relative './../lib/marc_cleanup'
	require 'nokogiri'
	require 'set'
	require 'pry'

	# Build the lookup of language name => ISO 639-3 code from the tab-separated
	# code table. Column 1 (index 0) supplies the code and column 7 (index 6) the
	# language name. The block form of File.open closes the handle when done.
	name_to_code = {} #New hash containing languages as keys and ISO codes as values
	File.open('/Users/davisr/Documents/marc/ISO 639-3/iso-639-3_Code_Tables_20230123/iso-639-3_20230123.tab', 'r') do |table|
	  table.gets # discard the header row
	  table.each_line do |row|
	    columns = row.chomp.split("\t")
	    code = columns[0]
	    language = columns[6]
	    # A row without a name would become a nil key, which cannot be matched
	    # meaningfully against 546 text, so leave it out of the lookup.
	    next if language.nil? || language.empty?

	    name_to_code[language] = code
	  end
	end

	# Single XML writer that receives every updated record from all input files.
	# NOTE(review): 'filepath' looks like a placeholder for the real output path - confirm before running.
	writer = MARC::XMLWriter.new('filepath')

	# Terms that, when present in a 546 field, make the record ineligible.
	# File.readlines closes the file itself. Blank lines are dropped because an
	# empty term turns into an empty regex alternative, which matches every 546
	# value and would therefore exclude every record. Term text is kept verbatim.
	ignore_546_phrasing = File.readlines('/Users/davisr/git/marc_cleanup/tasks/ignore_546_phrasing.txt', chomp: true)
	                          .reject { |term| term.strip.empty? }

	# Builds one case-insensitive pattern matching any exclusion term literally,
	# anywhere in a string. Terms are escaped so entries such as "(Latin)" require
	# the parentheses instead of acting as a regex group; blank terms are ignored.
	def exclusion_regex_546(ignore_546_phrasing)
	  terms = ignore_546_phrasing.map(&:to_s).reject { |term| term.strip.empty? }
	  # Regexp.union escapes each string; an empty list yields a never-matching pattern.
	  Regexp.new(Regexp.union(terms).source, Regexp::IGNORECASE)
	end

	# Precompiles a [pattern, iso_code] pair per language name. A name matches
	# (case-sensitively) only when followed by whitespace, '.', ';' or ','.
	def language_patterns_546(name_to_code)
	  name_to_code.each_with_object([]) do |(language, iso_code), patterns|
	    name = language.to_s
	    next if name.empty? # an empty name would match on the trailing character alone

	    patterns << [/#{Regexp.escape(name)}[\s.;,]/, iso_code]
	  end
	end

	# Adds an 041 _7 field ($a ISO 639-3 codes, $2 iso639-3) to each record whose
	# 546 fields name known languages, then writes the record.
	#
	# A record is left untouched (and not written) when it already has an 041 with
	# second indicator 7 and $2 iso639-3, when any 546 subfield contains an
	# exclusion term, or when no language name is found.
	#
	# @param reader [#each] yields MARC records
	# @param name_to_code [Hash] language name => ISO 639-3 code
	# @param writer [#write] receives each updated record
	# @param ignore_546_phrasing [Array<String>] exclusion terms, matched as
	#   case-insensitive literal substrings
	def new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
	  # Compile once per call rather than once per subfield.
	  exclusion = exclusion_regex_546(ignore_546_phrasing)
	  language_patterns = language_patterns_546(name_to_code)

	  reader.each do |record|
	    already_coded = record.fields('041').any? do |field|
	      field.indicator2 == '7' && field['2'] == 'iso639-3'
	    end
	    next if already_coded

	    notes = record.fields('546').flat_map do |field|
	      field.subfields.map { |subfield| subfield.value.to_s }
	    end
	    next if notes.any? { |note| exclusion.match?(note) }

	    new_field = MARC::DataField.new('041', ' ', '7')
	    processed_languages = Set.new
	    notes.each do |note|
	      language_patterns.each do |pattern, iso_code|
	        next unless pattern.match?(note)
	        next unless processed_languages.add?(iso_code) # nil when the code was already added

	        new_field.append(MARC::Subfield.new('a', iso_code.to_s))
	      end
	    end
	    next unless new_field['a']

	    new_field.append(MARC::Subfield.new('2', 'iso639-3'))
	    record.append(new_field)
	    writer.write(record)
	  end
	end

	# Process every input file matched by the glob, sending all updated records to
	# the one shared writer. The ensure clause closes the writer even when a file
	# raises part-way through, so the output is still finalized; the exception
	# itself still propagates.
	begin
	  Dir.glob('filepath').each do |file|
	    puts File.basename(file)
	    reader = MARC::XMLReader.new(file, parser: 'magic', ignore_namespace: true)
	    new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
	  end
	ensure
	  writer.close
	end