Skip to content

Instantly share code, notes, and snippets.

@billdueber
Last active July 26, 2017 18:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save billdueber/fab7abb6c8df6718db05e03ba8519dac to your computer and use it in GitHub Desktop.
Save billdueber/fab7abb6c8df6718db05e03ba8519dac to your computer and use it in GitHub Desktop.
Monkeypatch of SolrEad to try to make indexing faster
require "URI"
require 'solr_ead'
# Monkeypatch of SolrEad::Indexer that tries to make component indexing
# faster by caching values that are the same for every component of a file
# (eadid, collection title, repository) and by memoizing ancestor titles.
#
# Helpers called here but not defined in this file (parent_id_list,
# component_children?, ead_to_html and the pre-patch components method) are
# expected to be supplied by the solr_ead gem.
class SolrEad::Indexer
  # Builds the additional Solr fields for one component node.
  #
  # @param node the component node; must respond to xpath, attr and parent
  # @param addl_fields [Hash] hash to fill in; it is mutated and returned
  # @return [Hash] addl_fields with the component fields added
  def additional_component_fields(node, addl_fields = Hash.new)
    p_ids = parent_id_list(node)
    p_unittitles = parent_unittitle_list(node)
    parent_id = node.parent.attr("id")
    ead = eadid(node)
    coll = collection(node)
    repo = repository(node)

    addl_fields["id"] = [ead, node.attr("id")].join
    addl_fields[Solrizer.solr_name("ead", :stored_sortable)] = ead
    # Only record a parent when the parent element actually carries an id.
    addl_fields[Solrizer.solr_name("parent", :stored_sortable)] = parent_id unless parent_id.nil?
    addl_fields[Solrizer.solr_name("parent", :displayable)] = p_ids
    addl_fields[Solrizer.solr_name("parent_unittitles", :displayable)] = p_unittitles
    addl_fields[Solrizer.solr_name("parent_unittitles", :searchable)] = p_unittitles
    addl_fields[Solrizer.solr_name("component_level", :type => :integer)] = p_ids.length + 1
    addl_fields[Solrizer.solr_name("component_children", :type => :boolean)] = component_children?(node)
    addl_fields[Solrizer.solr_name("collection", :facetable)] = coll
    addl_fields[Solrizer.solr_name("collection", :displayable)] = coll
    addl_fields[Solrizer.solr_name("repository", :facetable)] = repo
    addl_fields[Solrizer.solr_name("repository", :displayable)] = repo
    addl_fields
  end

  # The three lookups below use absolute xpaths, so the result does not
  # depend on which node of the document is passed in. Each value is computed
  # once and reused until #components clears the cache.

  # @return [String] stripped text of /ead/archdesc/did/repository
  def repository(node)
    @cached_repo ||= node.xpath("/ead/archdesc/did/repository").text.strip
  end

  # @return [String] text of /ead/archdesc/did/unittitle
  def collection(node)
    @cached_collection ||= node.xpath("/ead/archdesc/did/unittitle").text
  end

  # @return [String] text of /ead/eadheader/eadid
  def eadid(node)
    @cached_eadid ||= node.xpath("/ead/eadheader/eadid").text
  end

  # Keep a handle on the pre-patch #components. The guard matters when this
  # file is loaded more than once: without it the second load would alias the
  # patched method to :original_components and #components would call itself
  # forever.
  unless method_defined?(:original_components) || private_method_defined?(:original_components)
    alias_method :original_components, :components
  end

  # Resets every per-file cache, then delegates to the original #components.
  #
  # @param args arguments forwarded unchanged to the original method
  # @return whatever the original #components returns
  def components(*args)
    STDERR.puts "****** Using the monkeypatch on file #{args}"
    @memtitle = nil
    @cached_eadid = nil
    @cached_collection = nil
    @cached_repo = nil
    original_components(*args)
  end

  # Collects the titles of the enclosing <c> ancestors of node.
  #
  # @param node the component node to start from
  # @param results [Array] accumulator; titles are appended nearest-first
  # @return [Array] a new array of titles ordered outermost ancestor first
  def parent_unittitle_list(node, results = ::Array.new)
    while node.parent.name == "c"
      node = node.parent
      results << get_title(node)
    end
    results.reverse
  end

  # Memoized wrapper around #_get_title.
  #
  # The cache is keyed by node identity and is looked up with the node
  # itself. The earlier version stored entries under node.object_id but
  # looked them up by node, so it never hit and recomputed the title on
  # every call.
  #
  # @param node the node whose title is wanted
  # @return [String] the title computed by #_get_title
  def get_title(node)
    @memtitle ||= {}.compare_by_identity
    @memtitle.fetch(node) { @memtitle[node] = _get_title(node) }
  end

  # Computes a display title for node: the unittitle if present and
  # non-empty, otherwise the unitdate if present and non-empty, otherwise a
  # fixed placeholder.
  #
  # @param node the node to read ./did/unittitle and ./did/unitdate from
  # @return [String] result of ead_to_html on the chosen content, or
  #   "[No title available]"
  def _get_title(node)
    title = node.at_xpath("./did/unittitle")
    return ead_to_html(title.content) if title && !title.content.empty?

    # Only look for a date when no usable title was found.
    date = node.at_xpath("./did/unitdate")
    return ead_to_html(date.content) if date && !date.content.empty?

    "[No title available]"
  end
end
##### Testing #########
# Only run this code if *this* file is run, not if it's
# required/loaded from another file.
if __FILE__ == $0
  # om_component_from_node is made public so the benchmark below can call it
  # directly on the indexer instance.
  class SolrEad::Indexer
    public :om_component_from_node
  end

  require 'benchmark'

  # Deliberately bogus URL; presumably this keeps the run from contacting a
  # real Solr instance -- nothing below sends documents anywhere.
  ENV["SOLR_URL"] = "http://not.gonna.happen/solr"

  # EAD file to benchmark; an optional first command-line argument overrides
  # the default sample file.
  filename = ARGV.fetch(0, "umich-bhl-87365-2.xml")
  indexer = SolrEad::Indexer.new

  x = Benchmark.measure do
    # Mimic making a solr document, without sending it to solr.
    # The block form of File.open closes the handle once parsing is done;
    # File.new left it open.
    solr_doc = File.open(filename) { |file| SolrEad::Document.from_xml(file) }
    indexer.components(filename).each_with_index do |node, counter|
      solr_doc = indexer.om_component_from_node(node).to_solr(indexer.additional_component_fields(node))
      solr_doc.merge!({Solrizer.solr_name("sort", :sortable, :type => :integer) => counter.to_s})
    end
  end

  puts x
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment