@mark-dce
Last active November 15, 2019 03:42
XML and CSV parsing examples
# frozen_string_literal: true

require 'csv'
require 'nokogiri'
namespace :dce do
desc "Yellowback analysis sketchpad & junk drawer"
task analyze: :environment do
f = File.open('/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/DLP_Publishing_Test_20190919.xml')
doc = Nokogiri::XML(f)
total_records = doc.xpath("//record").count
child_volumes = doc.xpath("//datafield[@tag='856']/subfield[@code='3']/ancestor::record").count # code '3' = volume id, code 'u' = url
parent_mmsids = doc.xpath("//datafield[@tag='856']/subfield[@code='3']/ancestor::record").map{|n| n.xpath("controlfield[@tag='001']").text}
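# For reference, the 856 fields matched above look roughly like this
# (illustrative values, not taken from the export):
#   <datafield tag="856" ind1="4" ind2="1">
#     <subfield code="3">Volume 1</subfield>
#     <subfield code="u">https://example.org/ark:/99999/abc123</subfield>
#   </datafield>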
puts "MARC Records"
puts "Total Records:\t\t #{total_records}"
puts "Muli-volume works:\t #{child_volumes}"
puts "Multi-volume work IDs:\t #{parent_mmsids}"
byebug
shoulders = {}
# Show the MMSIDs that have more than one ARK
@mmsids = {}
# Return a unique index key each time an MMSID is seen
def mmsid_index(mmsid, row_num, table_num)
index = 1
while @mmsids["#{mmsid}.#{index}"]
index += 1
end
new_key = "#{mmsid}.#{index}"
@mmsids[new_key] = {original_row: row_num, internal_index: table_num}
return new_key
end
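# Illustrative behaviour (MMSID value borrowed from the notes at the bottom of this file):
#   mmsid_index('990019157630302486', 2, 0) #=> "990019157630302486.1"
#   mmsid_index('990019157630302486', 5, 3) #=> "990019157630302486.2"
# i.e. repeated MMSIDs get ".1", ".2", ... suffixes so every pull list row keeps a unique key.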
pl = CSV.read('/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/yellowbacks_pull_list_Curate.csv', headers: true)
# with_vols = pl.map.with_index{ |r,i| [i,r['Call Number/MSS Number (if applicable)'],r['ALMA MMSID']] if r["Call Number/MSS Number (if applicable)"].include?('V.')}
# multi_vols = with_vols.select{|r| !r.nil?}
# vol_mmsids = multi_vols.map{|r| r[2]}
proto_csv = pl.map.with_index do |row, index|
if row['ALMA MMSID'] # disregard blank rows in the CSV
row_num = index+2 # gives row_num in the original CSV, accounting for the header row and Ruby's 0-based index vs. the spreadsheet's 1-based rows
alma_mmsid = row['ALMA MMSID']
call_no = row['CSV Call Number'] || '.0'
call_no_suffix = call_no[/(\.\d)$/] || '.0'
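# e.g. a call number ending in "V.1" yields the suffix ".1";
# rows without a trailing ".<digit>" (single-volume works) fall back to ".0"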
puts call_no_suffix
title = row['CSV Title']
key = mmsid_index(alma_mmsid, row_num, index)
[key, row_num, alma_mmsid, call_no, call_no_suffix, title ]
end
end
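# Each non-blank entry is [key, row_num, alma_mmsid, call_no, call_no_suffix, title];
# rows without an MMSID come through as nil. Illustrative entry (call number made up,
# MMSID/title borrowed from the notes at the bottom of this file):
#   ["990019374560302486.1", 2, "990019374560302486", "PZ3 .B46 V.1", ".1", "With harp and crown"]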
byebug
ph = Hash[proto_csv[0..10].compact.map{|row| [row[1], row]}] # peek at the first few parsed rows keyed by original CSV row number; compact drops the nil entries from blank rows
a = pl.map{|r| r['ALMA MMSID']}
a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}
multi_mmsids = a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}.map{|r| r[0]}.sort
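# e.g. ["a", "a", "b"].group_by(&:itself).map{|k, v| [k, v.length]} #=> [["a", 2], ["b", 1]]
# selecting counts > 1 and taking the keys leaves only the MMSIDs that repeat in the pull list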
puts "Pull List Records"
puts "Total Records:\t\t #{pl.count}"
puts "Muli-volume works:\t #{multi_mmsids.count}"
puts "Multi-volume work IDs:\t #{multi_mmsids}"
# Find the MMSIDs that appear more than once in the Pull List
# Make a list of MMSIDs that have multiple occurrences
# Show the volume numbers in Alma
# Show the call numbers in Pull List
# Search Alma for records with multiple ARK nodes
# multi_arks = doc.xpath("//record[count(datafield[@tag='856'])>1]")
# - capture the ID, volumeID, ark
# - ID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/controlfield[@tag='001']").text
# - Title doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='245']/child::*").text
# - VolumeID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='3']").text
# - ARK doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='u']").text
puts "%-18s | %-8s | %-30s | %-50s " % ["alma_mmsid","vol_id","vol_ark","title"]
marc_with_multi_vols = doc.xpath("//record[count(datafield[@tag='856'])>1]/datafield[@tag='856']")
marc_with_multi_vols.each do |marc_record|
alma_mmsid = marc_record.xpath("ancestor::record/controlfield[@tag='001']").text
title = marc_record.xpath("ancestor::record/datafield[@tag='245']/child::*").text
volume_id = marc_record.xpath("subfield[@code='3']").text
volume_ark = marc_record.xpath("subfield[@code='u']").text
puts "%-18s | %-8s | %-30s | %-50s " % [alma_mmsid,volume_id,volume_ark,title]
# save these values instead of printing them (see the sketch after this loop)...
end
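# Sketch of the "save instead of print" idea (not wired into the output yet;
# marc_volumes is just a placeholder name):
#   marc_volumes = Hash.new { |h, k| h[k] = [] }
#   marc_with_multi_vols.each do |marc_record|
#     mmsid = marc_record.xpath("ancestor::record/controlfield[@tag='001']").text
#     marc_volumes[mmsid] << {
#       volume_id:  marc_record.xpath("subfield[@code='3']").text,
#       volume_ark: marc_record.xpath("subfield[@code='u']").text
#     }
#   end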
byebug
######
#
# plm = List of all MMSIDs from Pull List that occur more than once
# plm = a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}.map{|r| r[0]}.uniq.sort
# alm = List of all MMSIDs from Marc Export that have more than one ark
# alm = marc_with_multi_vols.map{|node| node.xpath("ancestor::record/controlfield[@tag='001']").text }.sort.uniq
# union = (alm+plm).uniq.sort
# no_ark = union-alm
# no_pl = union-plm
# record in pull list
# matches record in alma
# doesn't have corresponding alma record
# doesn't have corresponding ark
# has more arks than expected
# record in alma
# matches record in pull list
# Read each line in the Pull List
# extract the Row #, MMSID, Call No., Call No. suffix, Title
#
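# Sketch, assuming plm/alm/union/no_ark/no_pl are computed as in the comments above:
#   puts "Repeated on pull list, but <2 ARKs in Alma:\t #{no_ark}"
#   puts "Multiple ARKs in Alma, but not repeated on pull list:\t #{no_pl}"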
end
end
# Multiple arks in Alma, only 1 reference in Pull List
# 990019157630302486 - Jonathan and his continent. (Rambles through American society) By Max O'Rell ... and Jack Allyn. Translated by Madame Paul Blouët.
# 990019374560302486 - With harp and crown / a novel by Walter Besant and James Rice.
# etc.
#
# mmsid - points to one or more indexed row records,
# each indexed row corresponds to one line of the CSV
# Acceptance
# - [ ] 1 pull list row and 1 marc record with a single ark with matching mmsid, merge them into a single row
# - [ ] 1 pull list row and no matching marc record, log an error
# - [ ] 2 pull list rows with the same mmsid and a matching marc record with 2 matching volume ids, merge them into two separate rows
# - [ ] 2 pull list rows with the same mmsid and 1 matching ark record, merge the first row, log an error for the second row
# - [ ] more than 2 pull list rows with the same mmsid, handled the same way as the 2-row cases above
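# Rough sketch of a merge along the lines of the criteria above (method and field
# names are illustrative, not settled; it pairs rows and volumes in order rather than
# matching on volume id / call number suffix):
#
# def merge_rows(pull_rows_for_mmsid, marc_volumes_for_mmsid)
#   if marc_volumes_for_mmsid.empty?
#     pull_rows_for_mmsid.each { |row| warn "No MARC record/ARK for row #{row[:row_num]}" }
#     return []
#   end
#   pull_rows_for_mmsid.zip(marc_volumes_for_mmsid).map do |row, volume|
#     if volume
#       row.merge(volume) # 1 pull list row + 1 matching ark => a single merged row
#     else
#       warn "More pull list rows than ARKs for row #{row[:row_num]}"
#       nil
#     end
#   end.compact
# end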