Last active
July 26, 2017 18:33
-
-
Save billdueber/fab7abb6c8df6718db05e03ba8519dac to your computer and use it in GitHub Desktop.
Monkeypatch of SolrEad to try to make indexing faster
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "URI" | |
require 'solr_ead' | |
# Monkeypatch of SolrEad::Indexer intended to speed up component indexing by
# caching values that are recomputed for every component of an EAD file:
#
# * the document-level eadid, collection title and repository, and
# * the display title of each ancestor component.
#
# The caches live in instance variables and are cleared each time
# #components is called, i.e. once per file handed to the indexer.
#
# Helpers used here but not defined in this file (parent_id_list,
# component_children?, ead_to_html and the original #components) are expected
# to come from the solr_ead gem.
class SolrEad::Indexer
  # Builds the additional Solr fields for a single component node.
  #
  # @param node the component node being indexed (presumably a
  #   Nokogiri::XML::Node -- it must respond to #attr, #parent and #xpath)
  # @param addl_fields [Hash] hash to populate; a new one is created by default
  # @return [Hash] +addl_fields+ with the component-level fields filled in
  def additional_component_fields(node, addl_fields = {})
    p_ids        = parent_id_list(node)
    p_unittitles = parent_unittitle_list(node)
    ead_id       = eadid(node)
    parent_id    = node.parent.attr("id")
    collection   = collection(node)
    repository   = repository(node)

    addl_fields["id"] = [ead_id, node.attr("id")].join
    addl_fields[Solrizer.solr_name("ead", :stored_sortable)] = ead_id
    # Top-level components have a parent without an id; skip the field then.
    addl_fields[Solrizer.solr_name("parent", :stored_sortable)] = parent_id unless parent_id.nil?
    addl_fields[Solrizer.solr_name("parent", :displayable)] = p_ids
    addl_fields[Solrizer.solr_name("parent_unittitles", :displayable)] = p_unittitles
    addl_fields[Solrizer.solr_name("parent_unittitles", :searchable)] = p_unittitles
    addl_fields[Solrizer.solr_name("component_level", :type => :integer)] = p_ids.length + 1
    addl_fields[Solrizer.solr_name("component_children", :type => :boolean)] = component_children?(node)
    addl_fields[Solrizer.solr_name("collection", :facetable)] = collection
    addl_fields[Solrizer.solr_name("collection", :displayable)] = collection
    addl_fields[Solrizer.solr_name("repository", :facetable)] = repository
    addl_fields[Solrizer.solr_name("repository", :displayable)] = repository
    addl_fields
  end

  # The three lookups below use absolute (document-rooted) XPaths, so the
  # answer does not depend on which node of the document is passed in. Each
  # value is therefore computed once and reused until #components resets it.

  # @return [String] stripped text of /ead/archdesc/did/repository
  def repository(node)
    @cached_repo ||= node.xpath("/ead/archdesc/did/repository").text.strip
  end

  # @return [String] text of /ead/archdesc/did/unittitle
  def collection(node)
    @cached_collection ||= node.xpath("/ead/archdesc/did/unittitle").text
  end

  # @return [String] text of /ead/eadheader/eadid
  def eadid(node)
    @cached_eadid ||= node.xpath("/ead/eadheader/eadid").text
  end

  # Keep a handle on the unpatched #components. The guard matters if this file
  # is ever loaded twice: aliasing again would point original_components at the
  # patched method below and make it call itself forever.
  unless method_defined?(:original_components) || private_method_defined?(:original_components)
    alias_method :original_components, :components
  end

  # Resets the per-file caches, then delegates to the original #components.
  #
  # @param args arguments forwarded unchanged to the original implementation
  # @return whatever the original #components returns
  def components(*args)
    STDERR.puts "****** Using the monkeypatch on file #{args}"
    @memtitle = nil
    @cached_eadid = nil
    @cached_collection = nil
    @cached_repo = nil
    original_components(*args)
  end

  # Collects the titles of every enclosing <c> element of +node+.
  #
  # @param node the component node whose ancestors are inspected
  # @param results [Array] accumulator the titles are appended to
  # @return [Array<String>] titles ordered from the outermost ancestor inwards
  def parent_unittitle_list(node, results = [])
    # Walk upwards while the parent is still a component element.
    while node.parent.name == "c"
      node = node.parent
      results << get_title(node)
    end
    results.reverse
  end

  # Memoized wrapper around #_get_title.
  #
  # The cache is keyed by the node's object_id. The same key must be used for
  # both the lookup and the store; looking up by the node itself while storing
  # under its object_id would miss on every call and defeat the cache.
  #
  # @param node a component node
  # @return [String] the display title for +node+
  def get_title(node)
    @memtitle ||= {}
    @memtitle.fetch(node.object_id) { |key| @memtitle[key] = _get_title(node) }
  end

  # Computes the display title for a component: its unittitle if present and
  # non-empty, otherwise its unitdate, otherwise a fixed placeholder.
  #
  # @param node a component node
  # @return [String] the title run through ead_to_html, or the placeholder
  def _get_title(node)
    # Evaluated lazily: the unitdate XPath only runs when the unittitle is
    # missing or empty.
    ["./did/unittitle", "./did/unitdate"].each do |path|
      element = node.at_xpath(path)
      return ead_to_html(element.content) if element && !element.content.empty?
    end
    "[No title available]"
  end
end
##### Testing #########
# Only run this code if *this* file is run, not if it's
# required/loaded from another file.
#
# Times how long it takes to build (but not send) the Solr documents for every
# component of one EAD file. The file defaults to the original sample and can
# be overridden by passing a path as the first command-line argument.
if __FILE__ == $0
  # The benchmark calls this method directly on the indexer, so it is made
  # public here.
  class SolrEad::Indexer
    public :om_component_from_node
  end

  require 'benchmark'

  # Set before the indexer is built. The URL is deliberately unreachable
  # because nothing is sent to Solr.
  ENV["SOLR_URL"] = "http://not.gonna.happen/solr"
  filename = ARGV.first || "umich-bhl-87365-2.xml"
  indexer = SolrEad::Indexer.new

  x = Benchmark.measure do
    # Mimic making a solr document, without sending it to solr.
    # The block form of File.open closes the handle once parsing is done.
    # NOTE(review): assumes from_xml parses the IO eagerly -- confirm.
    File.open(filename) { |file| SolrEad::Document.from_xml(file) }

    indexer.components(filename).each_with_index do |node, counter|
      component_doc = indexer.om_component_from_node(node).to_solr(indexer.additional_component_fields(node))
      component_doc.merge!({Solrizer.solr_name("sort", :sortable, :type => :integer) => counter.to_s})
    end
  end

  puts x
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment