Last active
November 15, 2019 03:42
-
-
Save mark-dce/07f861e3472a1b5061df335996fd373b to your computer and use it in GitHub Desktop.
XML and CSV parsing examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true | |
namespace :dce do | |
desc "Yellowback analysis sketchpad & junk drawer" | |
task analyze: :environment do | |
# Parse the sample MARC XML export. The block form of File.open closes the
# file handle as soon as Nokogiri has finished reading it.
marc_xml_path = '/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/DLP_Publishing_Test_20190919.xml'
doc = File.open(marc_xml_path) { |f| Nokogiri::XML(f) }
total_records = doc.xpath("//record").count
# Records that carry an 856 datafield with a code '3' subfield
# (code '3' = volume id, code 'u' = url). The query is evaluated once and
# reused for both the count and the list of IDs.
multi_volume_records = doc.xpath("//datafield[@tag='856']/subfield[@code='3']/ancestor::record")
child_volumes = multi_volume_records.count
# The 001 controlfield text of each of those records.
parent_mmsids = multi_volume_records.map { |n| n.xpath("controlfield[@tag='001']").text }
puts "MARC Records"
puts "Total Records:\t\t #{total_records}"
puts "Muli-volume works:\t #{child_volumes}"
puts "Multi-volume work IDs:\t #{parent_mmsids}"
byebug
shoulders = {} # NOTE(review): not referenced anywhere else in this task
# Show the MMSIDs that have more than one ARK
# Registry of every key handed out by mmsid_index:
#   "<mmsid>.<n>" => { original_row: <row number>, internal_index: <table index> }
@mmsids = {}

# Return a unique key each time an MMSID is seen.
#
# The key is "<mmsid>.<n>", where n is the lowest positive integer not yet
# registered for that MMSID (first sighting => ".1", second => ".2", ...).
# The new key is recorded in @mmsids together with where the row came from.
#
# @param mmsid [String] identifier to index
# @param row_num [Integer] stored under :original_row
# @param table_num [Integer] stored under :internal_index
# @return [String] the newly registered key
def mmsid_index(mmsid, row_num, table_num)
  @mmsids ||= {} # tolerate being called before the registry has been set up
  index = 1
  index += 1 while @mmsids.key?("#{mmsid}.#{index}")
  new_key = "#{mmsid}.#{index}"
  @mmsids[new_key] = { original_row: row_num, internal_index: table_num }
  new_key
end
# Load the pull list spreadsheet; the first line is treated as the header row.
pl = CSV.read('/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/yellowbacks_pull_list_Curate.csv', headers: true)
# with_vols = pl.map.with_index{ |r,i| [i,r['Call Number/MSS Number (if applicable)'],r['ALMA MMSID']] if r["Call Number/MSS Number (if applicable)"].include?('V.')}
# multi_vols = with_vols.select{|r| !r.nil?}
# vol_mmsids = multi_vols.map{|r| r[2]}
# Build one [key, row_num, alma_mmsid, call_no, call_no_suffix, title] entry
# for every row that has an MMSID.
proto_csv = pl.map.with_index do |row, index|
  alma_mmsid = row['ALMA MMSID']
  next unless alma_mmsid # disregard blank rows in the CSV

  # row_num in the original csv: +1 for the header row, +1 because ruby
  # indexes start at 0 while spreadsheet rows start at 1
  row_num = index + 2
  call_no = row['CSV Call Number'] || '.0'
  # trailing ".<digit>" of the call number, defaulting to '.0' when absent
  call_no_suffix = call_no[/(\.\d)$/] || '.0'
  puts call_no_suffix
  title = row['CSV Title']
  key = mmsid_index(alma_mmsid, row_num, index)
  [key, row_num, alma_mmsid, call_no, call_no_suffix, title]
end.compact # drop the nil placeholders that skipped (blank) rows leave behind
byebug
# Index the leading proto rows (slice 0..10) by their original spreadsheet row
# number. compact skips any nil entries left by blank CSV rows, which would
# otherwise raise on row[1].
ph = Hash[proto_csv[0..10].compact.map { |row| [row[1], row] }]
a = pl.map { |r| r['ALMA MMSID'] }
# MMSIDs that appear on more than one pull-list row, sorted. Rows without an
# MMSID are excluded first: a nil among the results cannot be sorted against
# the string IDs.
multi_mmsids = a.compact.group_by(&:itself).select { |_mmsid, rows| rows.length > 1 }.keys.sort
puts "Pull List Records"
puts "Total Records:\t\t #{pl.count}"
puts "Muli-volume works:\t #{multi_mmsids.count}"
puts "Multi-volume work IDs:\t #{multi_mmsids}"
# Find the MMSIDs that appear more than once in the Pull List | |
# Make a list of MMSIDs that have multiple occurrences
# Show the volume numbers in Alma | |
# Show the call numbers in Pull List | |
# Search Alma for records with multiple ARK nodes | |
# multi_arks = doc.xpath("//record[count(datafield[@tag='856'])>1]") | |
# - capture the ID, volumeID, ark | |
# - ID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/controlfield[@tag='001']").text | |
# - Title doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='245']/child::*").text | |
# - VolumeID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='3']").text | |
# - ARK doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='u']").text | |
# One layout shared by the header line and every data line so the columns
# cannot drift apart.
row_format = "%-18s | %-8s | %-30s | %-50s "
puts row_format % ["alma_mmsid","vol_id","vol_ark","title"]
# Every 856 datafield that belongs to a record holding more than one 856.
marc_with_multi_vols = doc.xpath("//record[count(datafield[@tag='856'])>1]/datafield[@tag='856']")
marc_with_multi_vols.each do |datafield_856|
  # Record-level values are looked up through the enclosing record.
  alma_mmsid = datafield_856.xpath("ancestor::record/controlfield[@tag='001']").text
  title = datafield_856.xpath("ancestor::record/datafield[@tag='245']/child::*").text
  # Values taken from this 856 itself: subfield '3' and subfield 'u'.
  volume_id = datafield_856.xpath("subfield[@code='3']").text
  volume_ark = datafield_856.xpath("subfield[@code='u']").text
  puts row_format % [alma_mmsid, volume_id, volume_ark, title]
  # save these value instead of printing them...
end
byebug
###### | |
# | |
# plm = List of all MMSIDs from Pull List that occur more than once
# plm = a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}.map{|r| r[0]}.uniq.sort | |
# alm = List of all MMSIDs from Marc Export that have more than one ark | |
# alm = marc_with_multi_vols.map{|node| node.xpath("ancestor::record/controlfield[@tag='001']").text }.sort.uniq | |
# union = (alm+plm).uniq.sort | |
# no_ark = union-alm | |
# no_pl = union-plm | |
# record in pull list | |
# matches record in alma | |
# doesn't have corresponding alma record | |
# doesn't have corresponding ark | |
# has more arks than expected
# record in alma | |
# matches record in pull list | |
# Read each line in the Pull List | |
# extract the Row #, MMSID, Call No., Call No. suffix, Title
# | |
end | |
end | |
# Multiple arks in Alma, only 1 reference in Pull List | |
# 990019157630302486 - Jonathan and his continent. (Rambles through American society) By Max O'Rell ... and Jack Allyn. Translated by Madame Paul Blouët. | |
# 990019374560302486 - With harp and crown / a novel by Walter Besant and James Rice. | |
# etc. | |
# | |
# mmsid - points to one or more indexed row records, | |
# each indexed row corresponds to one line of the CSV | |
# Acceptance | |
# - [ ] 1 pull list row and 1 marc record with a single ark with matching mmsid, merge them into a single row | |
# - [ ] 1 pull list row and no matching marc record, log an error
# - [ ] 2 pull list rows with the same mmsid and a matching marc record with 2 matching volume ids, merge them into two separate rows | |
# - [ ] 2 pull list rows with the same mmsid and 1 matching ark record, merge the first row, log an error for the second row | |
# - [ ] more than 2 rows similar to 2 rows |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment