Created
February 1, 2024 21:19
-
-
Save Beck-Davis/b062506ddf71617e35017a16cd082a4e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
abstract | |
abstaract | |
added | |
also | |
additional | |
accompanying | |
additional | |
acknowledgement | |
alphabet | |
afterword | |
appen | |
appedices | |
appendix | |
available | |
biography | |
biographical | |
bibliographical | |
bibliography | |
citation | |
chapter | |
caption | |
chiefly | |
contribute | |
contribution | |
contain | |
commentry | |
commentary | |
commentaries | |
cover | |
cyrillic or Roman | |
cyrillic and roman | |
data | |
dedication | |
dictionary | |
definitions | |
excerpt | |
each | |
essay | |
epigraph | |
foreword | |
forword | |
given | |
glossary | |
glossaries | |
incorporate | |
inscription | |
intertitle | |
introd | |
introduction | |
include | |
index | |
indexes | |
indices | |
interface | |
key | |
legend | |
list | |
(Latin) | |
map | |
narration | |
note | |
noted | |
original | |
originally | |
online | |
occassional | |
passage | |
place | |
postscript | |
pref | |
preface | |
prefatory | |
quote | |
quoted | |
quotation | |
reading mark | |
(Roman) | |
(roman) | |
some | |
synopsis | |
synopses | |
summary | |
summaries | |
sumarries | |
subtitle | |
script | |
section | |
selected | |
search | |
translated | |
translation | |
translate | |
transliterated | |
title | |
table | |
with |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### This is the most recent version of the code for the ISO 639-3 project and has been the most effective so far.
### It uses a text list of terms that might appear in field 546; a record whose 546 contains any of those terms
### is ineligible for a new 041$a field with ISO 639-3 codes.
require_relative './../lib/marc_cleanup' | |
require 'nokogiri' | |
require 'set' | |
require 'pry' | |
# Hash with language names as keys and ISO 639-3 codes as values, loaded from
# the tab-separated ISO 639-3 code table.
name_to_code = {}
iso_table_path = '/Users/davisr/Documents/marc/ISO 639-3/iso-639-3_Code_Tables_20230123/iso-639-3_20230123.tab'
# File.foreach closes the file when iteration finishes (the previous
# File.open handle was never closed).
File.foreach(iso_table_path, chomp: true).with_index do |line, index|
  next if index.zero? # the first line is discarded (presumably the column-header row)

  columns = line.split("\t")
  code = columns[0]
  language = columns[6]
  # A row without a name would become a nil key, which later interpolates
  # into an empty pattern that matches any separator character.
  next if language.nil? || language.empty?

  name_to_code[language] = code
end
writer = MARC::XMLWriter.new('filepath')
# Terms within a 546 field that make the record ineligible, one per line.
# File.readlines closes the file itself (the previous File.open handle was
# never closed). Blank lines are dropped: an empty term would become an empty
# regex alternative, which matches every string and would reject every record.
ignore_546_phrasing = File.readlines(
  '/Users/davisr/git/marc_cleanup/tasks/ignore_546_phrasing.txt', chomp: true
).reject(&:empty?)
# Builds the case-insensitive regex that marks a 546 value as a noncandidate.
#
# Returns nil when there are no usable terms, so that an empty list means
# "nothing is ignored" rather than producing /\b/i, which matches almost
# every value.
#
# NOTE(review): terms are interpolated as regex source, not escaped, so
# parentheses in a term (e.g. "(Latin)") form a group instead of matching
# literally, and the trailing \b binds only to the final alternative; every
# other term matches anywhere inside a value. This is the pre-existing
# behaviour and is preserved here; confirm it is intended before tightening.
def noncandidate_546_regex(ignore_546_phrasing)
  terms = ignore_546_phrasing.reject { |term| term.nil? || term.empty? }
  return nil if terms.empty?

  /#{terms.join('|')}\b/i
end

# Compiles one [pattern, iso_code] pair per language name.
#
# Names are escaped so that regex metacharacters in a name (for example the
# parentheses in "Modern Greek (1453-)") match literally. A name only counts
# when it is followed by whitespace, a period, a semicolon or a comma.
def language_name_patterns(name_to_code)
  name_to_code.each_with_object([]) do |(language, iso_code), patterns|
    name = language.to_s
    next if name.empty? # an empty name would match any separator on its own

    patterns << [/#{Regexp.escape(name)}[\s\.\;\,]/, iso_code]
  end
end

# Adds a 041 field (second indicator 7, $2 iso639-3) built from the language
# names found in each record's 546 subfields, and writes every record that
# received one.
#
# A record is left untouched and is NOT written when:
# * it already has a 041 with second indicator 7 and $2 iso639-3;
# * any 546 subfield value matches one of the ignore terms; or
# * no language name from +name_to_code+ is found in its 546 subfields.
#
# @param reader [#each] yields the records to examine
# @param name_to_code [Hash] language name => ISO 639-3 code
# @param writer [#write] receives each record that gained a 041 field
# @param ignore_546_phrasing [Array<String>] terms that make a record ineligible
def new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
  # Both depend only on the arguments, so compile them once per call instead
  # of once per subfield (and once per language per subfield).
  noncandidate_regex = noncandidate_546_regex(ignore_546_phrasing)
  language_patterns = language_name_patterns(name_to_code)

  reader.each do |record|
    already_coded = record.fields('041').any? do |field|
      field.indicator2 == '7' && field['2'] == 'iso639-3'
    end
    next if already_coded

    values = record.fields('546').flat_map { |field| field.subfields.map(&:value) }
    # One ignored term anywhere in the 546 fields disqualifies the whole record.
    next if noncandidate_regex && values.any? { |value| noncandidate_regex.match?(value) }

    # Codes in order of first appearance, each language at most once.
    iso_codes = values.flat_map do |value|
      language_patterns.select { |pattern, _| pattern.match?(value) }.map(&:last)
    end.uniq
    next if iso_codes.empty?

    new_field = MARC::DataField.new('041', ' ', '7')
    iso_codes.each { |iso_code| new_field.append(MARC::Subfield.new('a', iso_code.to_s)) }
    new_field.append(MARC::Subfield.new('2', 'iso639-3'))
    record.append(new_field)
    writer.write(record)
  end
end
# Run every file matched by the glob through new_041a_iso, printing each
# file's base name as it is picked up.
begin
  Dir.glob('filepath').each do |file|
    puts File.basename(file)
    reader = MARC::XMLReader.new(file, parser: 'magic', ignore_namespace: true)
    new_041a_iso(reader, name_to_code, writer, ignore_546_phrasing)
  end
ensure
  # Close the writer even when a file fails part-way, so the output written
  # so far is finalized instead of being left with an open handle.
  writer.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment