Created
February 1, 2024 21:11
-
-
Save Beck-Davis/f7d8f42f6b95e1c0dddb2eb30ef0b47e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative './../lib/marc_cleanup'

# Prefix that isbn10_to_13 puts in front of the 9-digit ISBN-10 stem when it
# builds the 13-digit form.
ISBN13PREFIX = '978'.freeze
# Tells whether a field's $c carries the "invalid 020 $a" error note.
#
# @param field [#[]] field-like object whose subfield value is read with field['c']
# @return [Boolean] true when $c contains "Invalid data in 1st $a in ... 020"
def contains_020error_indicator?(field)
  # to_s turns a missing $c (nil) into "" so the result is always true/false.
  field['c'].to_s.match?(/Invalid data in 1st \$a in .* 020/)
end
# Tells whether any 020 field has a $a shaped like "<token> (<qualifier>)",
# i.e. a run of non-space characters, whitespace, and a trailing parenthetical
# such as "0306406152 (pbk.)".
#
# @param record [#fields] record-like object; fields('020') must return the 020 fields
# @return [Boolean] true when the value returned by field['a'] has that shape
#   for at least one 020 field
def field_020_error?(record)
  record.fields('020').any? do |field|
    # A missing $a is treated as an empty string and therefore never matches.
    field['a'].to_s.match?(/^\s*([^\s]+)\s+(\(.*?\))\s*$/)
  end
end
# Splits a trailing parenthetical off each 020 $a and stores it in a new $q.
#
# A $a such as "0306406152 (pbk.)" becomes $a "0306406152" and a $q "(pbk.)"
# (parentheses included) is appended to the same field. $a values that are not
# "digits/hyphens/X followed by one parenthetical" are left untouched.
#
# @param record [#fields] record whose 020 fields are edited in place
# @return [Object] the record that was passed in
def new_020_q(record)
  # X/x is allowed so that ISBN-10s with an "X" check digit are split as well.
  pattern = /^\s*([\dXx\-]+)\s*(\(.*?\))\s*$/
  record.fields('020').each do |f020|
    # Iterate over a snapshot of the $a subfields: appending $q below must not
    # grow the list that is being walked.
    f020.subfields.select { |subfield| subfield.code == 'a' }.each do |subfield|
      isbn_parts = pattern.match(subfield.value)
      next if isbn_parts.nil?

      subfield.value = isbn_parts[1]
      f020.append(MARC::Subfield.new('q', isbn_parts[2]))
    end
  end
  record
end
# Converts an ISBN-10 to its 13-digit form.
#
# @param isbn [String] 9-digit stem, optionally followed by the check digit
#   (hyphens and spaces already removed)
# @return [String, nil] ISBN13PREFIX + stem + ISBN-13 check digit, or nil when
#   the stem is not exactly nine digits or the supplied check digit is wrong
def isbn10_to_13(isbn)
  stem = isbn[0..8]
  return nil unless stem.match?(/\A\d{9}\z/)

  existing_check = isbn[9]
  # checkdigit_10 returns an upper-case "X", so compare case-insensitively.
  return nil if existing_check && existing_check.upcase != checkdigit_10(stem)

  main = ISBN13PREFIX + stem
  main + checkdigit_13(main)
end
# Calculates the ISBN-10 check digit for a stem.
#
# Each character is weighted 10, 9, 8, ... from the left; the check digit is
# the value that brings the weighted sum to a multiple of 11.
#
# @param stem [String] the digits preceding the check digit (nine for an ISBN-10)
# @return [String] "0".."9", or "X" when the check value is 10
def checkdigit_10(stem)
  weighted_sum = stem.each_char.with_index.sum do |digit, index|
    digit.to_i * (10 - index)
  end
  mod = (11 - (weighted_sum % 11)) % 11
  mod == 10 ? 'X' : mod.to_s
end
# Calculates the ISBN-13 check digit for a stem.
#
# Characters at even positions (0, 2, ...) count once and those at odd
# positions count three times; the check digit brings the sum to a multiple
# of 10.
#
# @param stem [String] the digits preceding the check digit (twelve for an ISBN-13)
# @return [String] a single digit "0".."9"
def checkdigit_13(stem)
  weighted_sum = stem.each_char.with_index.sum do |digit, index|
    index.even? ? digit.to_i : digit.to_i * 3
  end
  ((10 - (weighted_sum % 10)) % 10).to_s
end
# Validates an ISBN-13 and returns it with its check digit.
#
# @param raw_isbn [String] 12-digit stem, optionally followed by the check
#   digit (hyphens and spaces already removed)
# @return [String, nil] the 13-digit ISBN, or nil when the stem is not exactly
#   twelve digits or the supplied check digit does not match the computed one
def isbn13_normalize(raw_isbn)
  stem = raw_isbn[0..11]
  return nil unless stem.match?(/\A\d{12}\z/)

  checkdigit = checkdigit_13(stem)
  return nil if raw_isbn[12] && raw_isbn[12] != checkdigit

  stem + checkdigit
end
# Extracts an ISBN from a free-text string and returns it as a 13-digit ISBN.
#
# The input is copied, then cleaned step by step: hyphens, backslashes,
# parenthetical text, anything from a literal "$c" or "$q" onwards and leading
# non-digits are removed before the remaining digits are validated.
#
# @param isbn [String, nil] text that is supposed to contain an ISBN
# @return [String, nil] the 13-digit ISBN, or nil when the input is nil, the
#   cleaned text is not 9, 10, 12 or 13 characters long, or validation by
#   isbn10_to_13 / isbn13_normalize fails
def isbn_normalize(isbn)
  return nil unless isbn

  raw_isbn = isbn.dup
  raw_isbn.delete!('-')
  raw_isbn.delete!('\\')
  raw_isbn.gsub!(/\([^\)]*\)/, '')
  raw_isbn.gsub!(/^(.*)\$c.*$/, '\1')
  raw_isbn.gsub!(/^(.*)\$q.*$/, '\1')
  raw_isbn.gsub!(/^\D+([0-9].*)$/, '\1')
  # Both 978 and 979 start an ISBN-13. Sending a 979 number down the ISBN-10
  # branch would truncate it to ten digits and make it fail validation.
  if raw_isbn =~ /^97[89]/
    raw_isbn.gsub!(/^(97[89][0-9 ]+).*$/, '\1')
    raw_isbn.delete!(' ')
  else
    # Keep the first ten-character ISBN-10 shape (spaces allowed between the
    # groups) and discard whatever follows it.
    raw_isbn.gsub!(/([0-9])\s*([0-9]{4})\s*([0-9]{4})\s*([0-9xX]).*$/, '\1\2\3\4')
  end
  raw_isbn.gsub!(/^([0-9]{9,13}[xX]?)[^0-9xX].*$/, '\1')
  raw_isbn.gsub!(/^([0-9]+?)\D.*$/, '\1')
  # NOTE(review): 7- and 8-digit values are right-padded with zeros to nine
  # digits; confirm that trailing (not leading) zeros are what is intended.
  if raw_isbn.length.between?(7, 8) && raw_isbn =~ /^[0-9]+$/
    raw_isbn = raw_isbn.ljust(9, '0')
  end
  valid_lengths = [9, 10, 12, 13] # ISBN10 and ISBN13 with/out check digits
  return nil unless valid_lengths.include?(raw_isbn.length)

  if raw_isbn.length < 12
    isbn10_to_13(raw_isbn)
  else
    isbn13_normalize(raw_isbn)
  end
end
# Normalizes every 020 $a, recoding the ones that cannot be normalized as $z.
#
# For each $a of each 020 field: when isbn_normalize returns a value, that
# value replaces the subfield's content; otherwise the content is kept and the
# subfield code is changed from 'a' to 'z'.
#
# @param record [#fields] record whose 020 fields are edited in place
# @return [Object] the record that was passed in
def move_invalid_isbn(record)
  record.fields('020').each do |f020|
    f020.subfields.select { |subfield| subfield.code == 'a' }.each do |subfield|
      normalized_isbn = isbn_normalize(subfield.value)
      if normalized_isbn
        subfield.value = normalized_isbn
      else
        subfield.code = 'z'
      end
    end
  end
  record
end
# Runs the full 020 cleanup on one record.
#
# Steps, in order: split parenthetical qualifiers into $q (new_020_q),
# normalize or recode the $a values (move_invalid_isbn), delete every 915
# field flagged by contains_020error_indicator?, and finally delete all 914
# fields when the record no longer has any 915 field.
#
# @param record [Object] record exposing fields, fields(tag) and [](tag);
#   modified in place
# @return [Object] the cleaned record
def cleanup_020(record)
  record = new_020_q(record)
  record = move_invalid_isbn(record)
  record.fields.delete_if { |f| f.tag == '915' && contains_020error_indicator?(f) }
  # Look for a remaining 915 once, before the field list is mutated again,
  # instead of querying the record from inside delete_if.
  record.fields.delete_if { |f| f.tag == '914' } if record['915'].nil?
  record
end
# Script entry point: clean every record of every matching MARC-XML file and
# write out only the records that the cleanup actually changed.
#
# NOTE(review): 'filepath' is a placeholder for both the output file and the
# input glob — presumably these must be set to two different paths; confirm.
writer = MARC::XMLWriter.new('filepath')
begin
  Dir.glob('filepath').each do |file|
    puts File.basename(file)
    reader = MARC::XMLReader.new(file)
    reader.each do |record|
      # Snapshot the textual form so a changed record can be detected afterwards.
      originalrecord = record.to_s.dup
      cleanup_020(record)
      writer.write(record) if originalrecord != record.to_s
    end
  end
ensure
  # Close the writer even when reading or cleaning a record raises.
  writer.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment