Created
August 9, 2018 05:33
-
-
Save billdueber/c7f7de912ae990ee8e4b44f64838c40c to your computer and use it in GitHub Desktop.
Very rough look at parsing speed across marc and marc4j.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A very, *very* imperfect bench, but gives us a rough idea
#
# tl;dr -- marc-binary is a wash, marc-xml is about 3.5 times faster using marc4j
#
# > bundle exec ruby --server bench.rb
# jruby 9.2.0.0 (2.5.0) 2018-05-24 81156a8 Java HotSpot(TM) 64-Bit Server VM 25.112-b16 on 1.8.0_112-b16 +jit [darwin-x86_64]
# Warmup: 45. Runtime: 15
#
# Comparison:
# marc4j-xml: 8197.4 i/s
# marc-binary: 5463.2 i/s - 1.50x slower
# marc4j-binary: 5262.6 i/s - 1.56x slower
# marc-xml: 2315.9 i/s - 3.54x slower
#
# Turning on invokedynamic didn't change the raw speed or the ratios outside of the stdev
require 'marc' | |
require 'traject' | |
require 'benchmark/ips' | |
require 'nokogiri' | |
# Need a way to not run out of records to parse. I just keep
# opening the input file over and over again as needed.
#
# Wraps a reader class so that iteration never ends: each time the reader
# runs out of records the file is re-opened and a fresh reader is built.
# Subclasses customise how the file is opened and which settings the reader
# receives by overriding the private hooks #open_mode and #reader_settings.
class InfiniteBinaryIterator
  include Enumerable

  # Build an external enumerator (usable with +next+) over an endless
  # stream of records.
  #
  # @param klass [Class] reader class; instantiated as +klass.new(io, settings)+
  #   and expected to respond to +each+
  # @param file [String] path of the file to read over and over
  # @return [Enumerator]
  def self.iterator(klass, file)
    new(klass, file).enum_for(:each)
  end

  # @param klass [Class] reader class (see .iterator)
  # @param file [String] path of the file to read over and over
  def initialize(klass, file)
    @klass = klass
    @file = file
  end

  # Yield every record of the file, then start again from the top, forever.
  #
  # @yield [record] each object produced by the reader's +each+
  # @return [Enumerator] when called without a block
  # @raise [ArgumentError] when a complete pass over the file produces no
  #   records; without this check the loop would spin forever and never yield
  def each
    return enum_for(:each) unless block_given?

    loop do
      yielded = false
      File.open(@file, open_mode) do |f|
        @klass.new(f, reader_settings).each do |record|
          yielded = true
          yield record
        end
      end
      raise ArgumentError, "no records found in #{@file}" unless yielded
    end
  end

  private

  # Mode string handed to File.open.
  def open_mode
    'rb'
  end

  # Settings hash handed to the reader; a fresh hash is built for each pass.
  def reader_settings
    {}
  end
end

# Same endless iteration, but the file is opened as UTF-8 text and the reader
# is given the XML source type and the nokogiri XML parser settings.
class InfiniteXMLIterator < InfiniteBinaryIterator
  private

  def open_mode
    'r:utf-8'
  end

  def reader_settings
    { 'marc_source.type' => 'xml', 'marc_reader.xml_parser' => 'nokogiri' }
  end
end
# Input files: the same record set serialised as binary MARC and as MARC-XML.
binary_file = '100_000.marc'
xml_file = '100_000.xml'

# Report label => endless record enumerator, listed in reporting order.
readers = {
  'marc4j-binary' => InfiniteBinaryIterator.iterator(Traject::Marc4JReader, binary_file),
  'marc4j-xml' => InfiniteXMLIterator.iterator(Traject::Marc4JReader, xml_file),
  'marc-binary' => InfiniteBinaryIterator.iterator(Traject::MarcReader, binary_file),
  'marc-xml' => InfiniteXMLIterator.iterator(Traject::MarcReader, xml_file)
}

# Seconds handed to Benchmark.ips for the warmup and measurement phases.
warmup = 45
runtime = 15

puts RUBY_DESCRIPTION
puts "Warmup: #{warmup}. Runtime: #{runtime}\n\n"

Benchmark.ips do |x|
  x.config(:time => runtime, :warmup => warmup)

  # Running total of title sizes; it exists only so that every iteration
  # actually reads something out of the parsed record.
  title_length = 0

  readers.each do |label, records|
    x.report(label) do
      title = records.next['245']
      # NOTE(review): assumes ['245'] returns nil when a record has no 245
      # field; skipping such records keeps one odd record from aborting the run.
      title_length += title.value.size if title
    end
  end

  x.compare!
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment