# mark-dce/mark_analysis_junk_drawer.rake

## mark_analysis_junk_drawer.rake
# frozen_string_literal: true
namespace :dce do
  desc "Yellowback analysis sketchpad & junk drawer"
  task analyze: :environment do
    # Parse the sample MARC XML export. The block form of File.open closes the
    # file handle once the block returns, instead of leaving it open for the
    # rest of the task.
    marc_path = '/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/DLP_Publishing_Test_20190919.xml'
    doc = File.open(marc_path) { |f| Nokogiri::XML(f) }

    total_records = doc.xpath("//record").count
    # Records that carry at least one 856 field with a subfield '3'
    # (code '3' = volume id, code 'u' = url). Queried once, reused twice.
    volume_records = doc.xpath("//datafield[@tag='856']/subfield[@code='3']/ancestor::record")
    child_volumes = volume_records.count
    parent_mmsids = volume_records.map { |record| record.xpath("controlfield[@tag='001']").text }

    puts "MARC Records"
    puts "Total Records:\t\t #{total_records}"
    puts "Muli-volume works:\t #{child_volumes}"
    puts "Multi-volume work IDs:\t #{parent_mmsids}"

    byebug # interactive breakpoint: inspect the MARC summary before continuing

    shoulders = {} # NOTE(review): not referenced anywhere else in the visible task


    # Show the MMSIDs that have more than one ARK

    # Registry of every key issued so far: "<mmsid>.<n>" => where it came from.
    @mmsids = {}

    # Issue the next unused "<mmsid>.<n>" key for an MMSID (n starts at 1 and
    # grows each time the same MMSID is seen) and record its origin.
    #
    # @param mmsid [Object] identifier interpolated into the key
    # @param row_num [Object] stored under :original_row
    # @param table_num [Object] stored under :internal_index
    # @return [String] the newly registered key, e.g. "990.2"
    def mmsid_index(mmsid, row_num, table_num)
      # Lowest positive suffix whose key has not been registered yet.
      suffix = 1.step.find { |n| !@mmsids["#{mmsid}.#{n}"] }
      "#{mmsid}.#{suffix}".tap do |key|
        @mmsids[key] = { original_row: row_num, internal_index: table_num }
      end
    end


    # Load the pull list; headers: true makes each row addressable by column name.
    pl = CSV.read('/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/yellowbacks_pull_list_Curate.csv', headers: true)
    # with_vols = pl.map.with_index{ |r,i| [i,r['Call Number/MSS Number (if applicable)'],r['ALMA MMSID']] if r["Call Number/MSS Number (if applicable)"].include?('V.')}
    # multi_vols = with_vols.select{|r| !r.nil?}
    # vol_mmsids = multi_vols.map{|r| r[2]}

    # One summary array per pull-list row:
    #   [key, row_num, alma_mmsid, call_no, call_no_suffix, title]
    # Rows without an MMSID yield nil, so positions in proto_csv stay aligned
    # with positions in `pl` (the same index is handed to mmsid_index).
    proto_csv = pl.map.with_index do |row, index|
      alma_mmsid = row['ALMA MMSID']
      next unless alma_mmsid # disregard blank rows in the CSV

      # Row number in the original csv: +1 for the header row, +1 because
      # spreadsheets count from 1 while Ruby indexes from 0.
      row_num = index + 2
      call_no = row['CSV Call Number'] || '.0'
      # Trailing ".<digit>" of the call number, or '.0' when there is none.
      call_no_suffix = call_no[/(\.\d)$/] || '.0'
      puts call_no_suffix
      title = row['CSV Title']
      key = mmsid_index(alma_mmsid, row_num, index)
      [key, row_num, alma_mmsid, call_no, call_no_suffix, title]
    end
    byebug # interactive breakpoint: inspect proto_csv

    # Index the first eleven entries by original row number. Blank rows are nil
    # in proto_csv, so they are dropped first; calling row[1] on nil would
    # raise NoMethodError.
    ph = proto_csv[0..10].compact.map { |row| [row[1], row] }.to_h


    # Every 'ALMA MMSID' value in pull-list order; blank rows contribute nil.
    a = pl.map { |r| r['ALMA MMSID'] }
    # MMSIDs that appear on more than one row, sorted. nil is dropped before
    # grouping: two or more blank rows would otherwise put nil in the result,
    # and sorting nil alongside Strings raises ArgumentError.
    multi_mmsids = a.compact
                    .group_by(&:itself)
                    .select { |_mmsid, occurrences| occurrences.length > 1 }
                    .keys
                    .sort
    puts "Pull List Records"
    puts "Total Records:\t\t #{pl.count}"
    puts "Muli-volume works:\t #{multi_mmsids.count}"
    puts "Multi-volume work IDs:\t #{multi_mmsids}"


    # Find the MMSIDs that appear more than once in the Pull List

    # Make a list of MMSIDs that have multiple occurrences
    # Show the volume numbers in Alma
    # Show the call numbers in Pull List

    # Search Alma for records with multiple ARK nodes
    # multi_arks = doc.xpath("//record[count(datafield[@tag='856'])>1]")
    # -  capture the ID, volumeID, ark
    # -  ID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/controlfield[@tag='001']").text
    # -  Title doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='245']/child::*").text
    # -  VolumeID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='3']").text
    # -  ARK doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='u']").text

    # Print a fixed-width table with one line per 856 field belonging to a
    # record that has more than one 856 field.
    row_format = "%-18s | %-8s | %-30s | %-50s "
    puts row_format % ["alma_mmsid", "vol_id", "vol_ark", "title"]

    marc_with_multi_vols = doc.xpath("//record[count(datafield[@tag='856'])>1]/datafield[@tag='856']")
    # XPath for each column, relative to the 856 field, in column order.
    column_paths = [
      "ancestor::record/controlfield[@tag='001']",      # alma_mmsid
      "subfield[@code='3']",                            # volume id
      "subfield[@code='u']",                            # volume ark (url)
      "ancestor::record/datafield[@tag='245']/child::*" # title
    ]
    marc_with_multi_vols.each do |field_856|
      cells = column_paths.map { |path| field_856.xpath(path).text }
      puts row_format % cells
      # save these value instead of printing them...
    end

    byebug # interactive breakpoint: inspect marc_with_multi_vols

    ######
    #
    # plm = List of all MMSIDs from Pull List that occur more than once
    #   plm = a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}.map{|r| r[0]}.uniq.sort
    # alm = List of all MMSIDs from Marc Export that have more than one ark
    #    alm = marc_with_multi_vols.map{|node| node.xpath("ancestor::record/controlfield[@tag='001']").text }.sort.uniq
    # union = (alm+plm).uniq.sort
    # no_ark = union-alm
    # no_pl = union-plm

    # record in pull list
    #   matches record in alma
    #   doesn't have corresponding alma record
    #     doesn't have corresponding ark
    #     has more arks than expected
    # record in alma
    #   matches record in pull list

    # Read each line in the Pull List
    #   extract the Row #, MMSID, Call No., Call No. suffix, Title
    #


  end
end

# Multiple arks in Alma, only 1 reference in Pull List
# 990019157630302486 - Jonathan and his continent. (Rambles through American society) By Max O'Rell ... and Jack Allyn. Translated by Madame Paul Blouët.
# 990019374560302486 - With harp and crown / a novel by Walter Besant and James Rice.
# etc.
#

# mmsid - points to one or more indexed row records,
#  each indexed row corresponds to one line of the CSV


# Acceptance
# - [ ] 1 pull list row and 1 marc record with a single ark with matching mmsid, merge them into a single row
# - [ ] 1 pull list row and no matching marc record, log an error
# - [ ] 2 pull list rows with the same mmsid and a matching marc record with 2 matching volume ids, merge them into two separate rows
# - [ ] 2 pull list rows with the same mmsid and 1 matching ark record, merge the first row, log an error for the second row
# - [ ] more than 2 rows similar to 2 rows
	# frozen_string_literal: true
	namespace :dce do
	desc "Yellowback analysis sketchpad & junk drawer"
	task analyze: :environment do
	# Parse the sample MARC XML export. The block form of File.open closes the
	# file handle once the block returns, instead of leaving it open for the
	# rest of the task.
	marc_path = '/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/DLP_Publishing_Test_20190919.xml'
	doc = File.open(marc_path) { |f| Nokogiri::XML(f) }

	total_records = doc.xpath("//record").count
	# Records that carry at least one 856 field with a subfield '3'
	# (code '3' = volume id, code 'u' = url). Queried once, reused twice.
	volume_records = doc.xpath("//datafield[@tag='856']/subfield[@code='3']/ancestor::record")
	child_volumes = volume_records.count
	parent_mmsids = volume_records.map { |record| record.xpath("controlfield[@tag='001']").text }

	puts "MARC Records"
	puts "Total Records:\t\t #{total_records}"
	puts "Muli-volume works:\t #{child_volumes}"
	puts "Multi-volume work IDs:\t #{parent_mmsids}"

	byebug # interactive breakpoint: inspect the MARC summary before continuing

	shoulders = {} # NOTE(review): not referenced anywhere else in the visible task


	# Show the MMSIDs that have more than one ARK

	# Registry of every key issued so far: "<mmsid>.<n>" => where it came from.
	@mmsids = {}

	# Issue the next unused "<mmsid>.<n>" key for an MMSID (n starts at 1 and
	# grows each time the same MMSID is seen) and record its origin.
	#
	# @param mmsid [Object] identifier interpolated into the key
	# @param row_num [Object] stored under :original_row
	# @param table_num [Object] stored under :internal_index
	# @return [String] the newly registered key, e.g. "990.2"
	def mmsid_index(mmsid, row_num, table_num)
	  # Lowest positive suffix whose key has not been registered yet.
	  suffix = 1.step.find { |n| !@mmsids["#{mmsid}.#{n}"] }
	  "#{mmsid}.#{suffix}".tap do |key|
	    @mmsids[key] = { original_row: row_num, internal_index: table_num }
	  end
	end



	# Load the pull list; headers: true makes each row addressable by column name.
	pl = CSV.read('/Users/mark/Google Drive/DCE Clients/Emory University/2019-01 Repository Migration Development/5. Sample Data/yellowbacks_pull_list_Curate.csv', headers: true)
	# with_vols = pl.map.with_index{ |r,i| [i,r['Call Number/MSS Number (if applicable)'],r['ALMA MMSID']] if r["Call Number/MSS Number (if applicable)"].include?('V.')}
	# multi_vols = with_vols.select{|r| !r.nil?}
	# vol_mmsids = multi_vols.map{|r| r[2]}

	# One summary array per pull-list row:
	#   [key, row_num, alma_mmsid, call_no, call_no_suffix, title]
	# Rows without an MMSID yield nil, so positions in proto_csv stay aligned
	# with positions in `pl` (the same index is handed to mmsid_index).
	proto_csv = pl.map.with_index do |row, index|
	  alma_mmsid = row['ALMA MMSID']
	  next unless alma_mmsid # disregard blank rows in the CSV

	  # Row number in the original csv: +1 for the header row, +1 because
	  # spreadsheets count from 1 while Ruby indexes from 0.
	  row_num = index + 2
	  call_no = row['CSV Call Number'] || '.0'
	  # Trailing ".<digit>" of the call number, or '.0' when there is none.
	  call_no_suffix = call_no[/(\.\d)$/] || '.0'
	  puts call_no_suffix
	  title = row['CSV Title']
	  key = mmsid_index(alma_mmsid, row_num, index)
	  [key, row_num, alma_mmsid, call_no, call_no_suffix, title]
	end
	byebug # interactive breakpoint: inspect proto_csv

	# Index the first eleven entries by original row number. Blank rows are nil
	# in proto_csv, so they are dropped first; calling row[1] on nil would
	# raise NoMethodError.
	ph = proto_csv[0..10].compact.map { |row| [row[1], row] }.to_h


	# Every 'ALMA MMSID' value in pull-list order; blank rows contribute nil.
	a = pl.map { |r| r['ALMA MMSID'] }
	# MMSIDs that appear on more than one row, sorted. nil is dropped before
	# grouping: two or more blank rows would otherwise put nil in the result,
	# and sorting nil alongside Strings raises ArgumentError.
	multi_mmsids = a.compact
	                .group_by(&:itself)
	                .select { |_mmsid, occurrences| occurrences.length > 1 }
	                .keys
	                .sort
	puts "Pull List Records"
	puts "Total Records:\t\t #{pl.count}"
	puts "Muli-volume works:\t #{multi_mmsids.count}"
	puts "Multi-volume work IDs:\t #{multi_mmsids}"


	# Find the MMSIDs that appear more than once in the Pull List

	# Make a list of MMSIDs that have multiple occurrences
	# Show the volume numbers in Alma
	# Show the call numbers in Pull List

	# Search Alma for records with multiple ARK nodes
	# multi_arks = doc.xpath("//record[count(datafield[@tag='856'])>1]")
	# - capture the ID, volumeID, ark
	# - ID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/controlfield[@tag='001']").text
	# - Title doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='245']/child::*").text
	# - VolumeID doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='3']").text
	# - ARK doc.xpath("//record[count(datafield[@tag='856'])>1][1]/datafield[@tag='856'][1]/subfield[@code='u']").text

	# Print a fixed-width table with one line per 856 field belonging to a
	# record that has more than one 856 field.
	row_format = "%-18s | %-8s | %-30s | %-50s "
	puts row_format % ["alma_mmsid", "vol_id", "vol_ark", "title"]

	marc_with_multi_vols = doc.xpath("//record[count(datafield[@tag='856'])>1]/datafield[@tag='856']")
	# XPath for each column, relative to the 856 field, in column order.
	column_paths = [
	  "ancestor::record/controlfield[@tag='001']",      # alma_mmsid
	  "subfield[@code='3']",                            # volume id
	  "subfield[@code='u']",                            # volume ark (url)
	  "ancestor::record/datafield[@tag='245']/child::*" # title
	]
	marc_with_multi_vols.each do |field_856|
	  cells = column_paths.map { |path| field_856.xpath(path).text }
	  puts row_format % cells
	  # save these value instead of printing them...
	end

	byebug # interactive breakpoint: inspect marc_with_multi_vols

	######
	#
	# plm = List of all MMSIDs from Pull List that occur more than once
	#   plm = a.group_by(&:itself).map{|k, v| [k, v.length]}.select{|r| r[1]>1}.map{|r| r[0]}.uniq.sort
	# alm = List of all MMSIDs from Marc Export that have more than one ark
	#    alm = marc_with_multi_vols.map{|node| node.xpath("ancestor::record/controlfield[@tag='001']").text }.sort.uniq
	# union = (alm+plm).uniq.sort
	# no_ark = union-alm
	# no_pl = union-plm

	# record in pull list
	# matches record in alma
	# doesn't have corresponding alma record
	# doesn't have corresponding ark
	# has more arks than expected
	# record in alma
	# matches record in pull list

	# Read each line in the Pull List
	# extract the Row #, MMSID, Call No., Call No. suffix, Title
	#


	end
	end

	# Multiple arks in Alma, only 1 reference in Pull List
	# 990019157630302486 - Jonathan and his continent. (Rambles through American society) By Max O'Rell ... and Jack Allyn. Translated by Madame Paul Blouët.
	# 990019374560302486 - With harp and crown / a novel by Walter Besant and James Rice.
	# etc.
	#

	# mmsid - points to one or more indexed row records,
	# each indexed row corresponds to one line of the CSV


	# Acceptance
	# - [ ] 1 pull list row and 1 marc record with a single ark with matching mmsid, merge them into a single row
	# - [ ] 1 pull list row and no matching marc record, log an error
	# - [ ] 2 pull list rows with the same mmsid and a matching marc record with 2 matching volume ids, merge them into two separate rows
	# - [ ] 2 pull list rows with the same mmsid and 1 matching ark record, merge the first row, log an error for the second row
	# - [ ] more than 2 rows similar to 2 rows