Skip to content

Instantly share code, notes, and snippets.

@billdueber
Last active July 27, 2017 19:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save billdueber/027b6f9c35623889bb2d42c091c6efa3 to your computer and use it in GitHub Desktop.
Save billdueber/027b6f9c35623889bb2d42c091c6efa3 to your computer and use it in GitHub Desktop.
A self-contained (read: monkeypatch) benchmarking program for SolrEad, based on https://github.com/awead/solr_ead/pull/20
require 'benchmark'
require 'uri'
require 'solr_ead'
require 'concurrent'
# A subclass of SolrEad::Indexer with all the speed patches applied, so the
# stock and patched code paths can be benchmarked side by side.
class IndexerWithPatches < SolrEad::Indexer
  # Build the additional Solr fields for a single component node.
  #
  # node        - the component node (must respond to #xpath, #attr, #parent)
  # addl_fields - hash the fields are written into (a new Hash by default)
  #
  # Returns addl_fields.
  def additional_component_fields(node, addl_fields = Hash.new)
    # Clear or create the cache. It is reset on every call because the same
    # indexer instance is reused for several files, and the cached values
    # (eadid, collection, repository) are document-level.
    @cache = {}
    p_ids = parent_id_list(node)
    p_unittitles = parent_unittitle_list(node)
    parent_id = node.parent.attr("id")

    addl_fields["id"] = [eadid(node), node.attr("id")].join
    addl_fields[Solrizer.solr_name("ead", :stored_sortable)] = eadid(node)
    addl_fields[Solrizer.solr_name("parent", :stored_sortable)] = parent_id unless parent_id.nil?
    addl_fields[Solrizer.solr_name("parent", :displayable)] = p_ids
    addl_fields[Solrizer.solr_name("parent_unittitles", :displayable)] = p_unittitles
    addl_fields[Solrizer.solr_name("parent_unittitles", :searchable)] = p_unittitles
    addl_fields[Solrizer.solr_name("component_level", :type => :integer)] = p_ids.length + 1
    addl_fields[Solrizer.solr_name("component_children", :type => :boolean)] = component_children?(node)
    addl_fields[Solrizer.solr_name("collection", :facetable)] = collection(node)
    addl_fields[Solrizer.solr_name("collection", :displayable)] = collection(node)
    addl_fields[Solrizer.solr_name("repository", :facetable)] = repository(node)
    addl_fields[Solrizer.solr_name("repository", :displayable)] = repository(node)
    addl_fields
  end

  # The three lookups below use absolute XPaths, so the result does not depend
  # on which node of the document is passed in. Each one is memoized in @cache,
  # which is created on demand so the helpers also work when called before
  # additional_component_fields.

  # Returns the stripped text of /ead/archdesc/did/repository.
  def repository(node)
    (@cache ||= {})[:repo] ||= node.xpath("/ead/archdesc/did/repository").text.strip
  end

  # Returns the text of /ead/archdesc/did/unittitle.
  def collection(node)
    (@cache ||= {})[:collection] ||= node.xpath("/ead/archdesc/did/unittitle").text
  end

  # Returns the text of /ead/eadheader/eadid.
  def eadid(node)
    (@cache ||= {})[:eadid] ||= node.xpath("/ead/eadheader/eadid").text
  end

  # Collect the titles of every enclosing <c> element, outermost first.
  #
  # node    - the component node to start from
  # results - array the titles are appended to (innermost first) before the
  #           reversed copy is returned
  #
  # Returns a new Array of titles ordered from the outermost ancestor inwards.
  def parent_unittitle_list(node, results = ::Array.new)
    while node.parent.name == "c"
      parent = node.parent
      results << get_title(parent)
      node = parent
    end
    results.reverse
  end

  # Memoized wrapper around _get_title.
  #
  # The previous version stored results under node.object_id but looked them
  # up with the node itself, so the cache never hit and the title was
  # recomputed on every call. The cache is now keyed by the node (compared by
  # identity) and thrown away whenever a node from a different document shows
  # up, which keeps it from growing across files.
  # NOTE(review): relies on node.document returning the same object for every
  # node of one parsed document (true for Nokogiri nodes) -- confirm if another
  # XML library is ever used.
  def get_title(node)
    doc = node.document
    unless doc.equal?(@memtitle_doc)
      @memtitle_doc = doc
      @memtitle = {}.compare_by_identity
    end
    @memtitle.fetch(node) { @memtitle[node] = _get_title(node) }
  end

  # Work out the display title of a component node: the unittitle if it has
  # content, otherwise the unitdate if it has content, otherwise a fixed
  # placeholder string.
  def _get_title(node)
    title = node.at_xpath("./did/unittitle")
    return ead_to_html(title.content) if title && !title.content.empty?

    date = node.at_xpath("./did/unitdate")
    return ead_to_html(date.content) if date && !date.content.empty?

    "[No title available]"
  end
end
# Create a decent mimic of the processes needed to
# create a solr document and the component documents:
# fetch the components of the file once, then build the additional
# fields for each of them. Nothing is sent to Solr.
#
# indexer  - object responding to #components(filename) and
#            #additional_component_fields(component)
# filename - path of the EAD XML file
#
# Returns whatever indexer.components(filename) returned.
def fake_solr_doc(indexer, filename)
  # Fetch the components a single time; the earlier version called
  # indexer.components twice and discarded the first result.
  components = indexer.components(filename)
  # STDERR.puts "Working on #{filename} with #{components.size} components"
  components.each { |component| indexer.additional_component_fields(component) }
end
# No Solr server is ever contacted, but a URL still has to be configured.
ENV["SOLR_URL"] = "http://not.gonna.happen/solr"

# Every command-line argument is taken as the path of an EAD XML file to
# benchmark; bail out with status 1 when none were given.
filenames = ARGV
if filenames.size.zero?
  puts "Need to pass along at least one filename of an EAD XML file to benchmark"
  exit 1
end

# One indexer of each flavour: the stock one first, then the patched one.
stock_indexer, patched_indexer = SolrEad::Indexer.new, IndexerWithPatches.new
# Size of the file at +fname+ in kilobytes (bytes / 1024), as a Float.
def size_in_k(fname)
  bytes = File.size(fname)
  bytes.to_f / 1024
end
# Running totals. A file only counts towards a side's totals when that side
# finished within the timeout.
stock_total = 0
patched_total = 0
stock_size = 0
stock_comp = 0
patched_size = 0
patched_comp = 0

# Seconds to wait for one indexer to finish one file.
TIMEOUT = 240

$stdout.sync = true
puts <<"NOTES"
#{RUBY_DESCRIPTION}
NOTE: We give up after #{TIMEOUT} seconds
Skipping everything where stock had a sub-second processing time
NOTES

FORMAT = "%-25s %10.2f %7d %7s %7s %7s"
puts "%-25s %10s %7s %7s %7s %7s" % ["EAD File", "Size (KB)", "Comps", "Stock", "Patched", "Speedup"]
puts '-' * 68

filenames.each do |fn|
  name = File.basename(fn)
  size = size_in_k(fn)
  comp = stock_indexer.components(fn).size

  # Each run happens inside a promise so the wait can be bounded by TIMEOUT.
  # Promise#value(TIMEOUT) yields nil when no result is available in time.
  stock_p = Concurrent::Promise.execute { Benchmark.realtime { fake_solr_doc(stock_indexer, fn) } }
  stock_v = stock_p.value(TIMEOUT)

  patched_p = Concurrent::Promise.execute { Benchmark.realtime { fake_solr_doc(patched_indexer, fn) } }
  patched_v = patched_p.value(TIMEOUT)

  # A file is left out of the report only when BOTH runs finished in under a
  # second.
  next if stock_v && patched_v && stock_v < 1 && patched_v < 1

  stock = stock_v ? "%7.2f" % stock_v : "***"
  patched = patched_v ? "%7.2f" % patched_v : " - "

  # The speedup needs both timings. This used to test the promise object
  # (always truthy) instead of its value, so a timed-out patched run ended in
  # a TypeError from dividing by nil.
  mul = if stock_v && patched_v
          "%3.0fx" % (stock_v / patched_v)
        else
          " - "
        end

  puts FORMAT % [name, size, comp, stock, patched, mul]

  if stock_v
    stock_total += stock_v
    stock_size += size
    stock_comp += comp
  end

  if patched_v
    patched_total += patched_v
    patched_size += size
    patched_comp += comp
  end
end

puts
puts "%-25s %10.2f %7d %7.2f %7s" % ["Stock Totals", stock_size, stock_comp, stock_total, "--"]
puts "%-25s %10.2f %7d %7s %7.2f" % ["Patched Totals", patched_size, patched_comp, "--", patched_total]
puts
puts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment