Skip to content

Instantly share code, notes, and snippets.

@billdueber
Last active October 31, 2017 17:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save billdueber/5804a5afd918d77da3ec6a80cde8d297 to your computer and use it in GitHub Desktop.
Save billdueber/5804a5afd918d77da3ec6a80cde8d297 to your computer and use it in GitHub Desktop.
changed code and the simplistic config used for the benchmark
# Returns a lambda taking (record, accumulator, context) that appends the
# values extracted from +record+ to +accumulator+ and then post-processes them.
#
# NOTE: this is an abridged excerpt. The code that defines `extractor` and
# `translation_map` was removed and is not shown here.
#
# spec    - passed in by the caller; its use is in the deleted code.
# options - Hash of extraction options handed to
#           Marc21.create_post_processing_chain (defaults to {}).
def extract_marc(spec, options = {})
# ... stuff deleted for clarity
# Build the post-processing chain once, outside the lambda, so it is not
# rebuilt for every record.
ppchain = Marc21.create_post_processing_chain(options, translation_map)
lambda do |record, accumulator, context|
accumulator.concat extractor.extract(record)
# Run the accumulated values through the chain and store the result back
# into the same accumulator object.
# Equivalent to `ppchain.inject(accumulator) {|acc, lam| lam.(acc)}`.
accumulator.replace Marc21.apply_extraction_options(accumulator, ppchain)
end
end
module_function :extract_marc
# Post-processing step: keep only the first accumulated value.
# Returns a new array (empty when +acc+ is empty) and leaves +acc+ untouched.
# Deliberately writes nothing to stdout: this step runs for every record, so
# debug output here floods the console and distorts benchmark timings.
ONLY_FIRST = ->(acc) { acc.first(1) }
# Post-processing step: run every accumulated value through
# Marc21.trim_punctuation and return the results as a new array.
# Deliberately writes nothing to stdout: this step runs for every record, so
# debug output here floods the console and distorts benchmark timings.
TRIM_PUNCT = ->(acc) { acc.map { |value| Marc21.trim_punctuation(value) } }
# Post-processing step: drop repeated values, keeping the first occurrence
# of each. Returns a new array.
DEDUP = lambda { |values| values.uniq }
# Build the ordered list of post-processing lambdas implied by +options+.
# Nothing is applied here; the returned chain is run later, once per record.
#
# options         - Hash; the keys read are :first, :trim_punctuation,
#                   :allow_duplicates and :default.
# translation_map - optional object responding to #translate_array(array).
#
# Every option is read once, at build time, so changing the options hash
# afterwards does not alter a chain that has already been built.
#
# Steps are added in this order: first-only, translation map, punctuation
# trimming, de-duplication (unless :allow_duplicates), default value.
#
# Returns an Array of lambdas, each taking an array and returning an array.
def self.create_post_processing_chain(options, translation_map = nil)
  chain = []
  chain << ONLY_FIRST if options[:first]
  chain << ->(acc) { translation_map.translate_array(acc) } if translation_map
  chain << TRIM_PUNCT if options[:trim_punctuation]
  chain << DEDUP unless options[:allow_duplicates]
  # key? rather than a truthiness test, so an explicit nil/false default
  # is still honoured.
  if options.key?(:default)
    default_value = options[:default]
    chain << ->(acc) { acc.empty? ? [default_value] : acc }
  end
  chain
end
# Thread +accumulator+ through each lambda in +ppchain+, in order, feeding
# every step the previous step's result. Returns the final result, which is
# +accumulator+ itself when the chain is empty.
def self.apply_extraction_options(accumulator, ppchain)
  ppchain.reduce(accumulator) do |values, step|
    step.call(values)
  end
end
# Prepend the sibling lib directory (relative to this file) and then '.' to
# the load path. '.' is added last, so it ends up first in the search order.
# Presumably this lets the requires below find local code -- confirm.
$:.unshift "#{File.dirname(__FILE__)}/../lib"
$:.unshift '.'
require 'set'
require 'library_stdnums'
require 'marc_record_speed_monkeypatch'
# Settings for this benchmark run.
# NOTE(review): the meaning of each key is defined by the indexing framework,
# not by this file -- confirm before changing the values.
settings do
store "log.batch_progress", 10_000
store 'output_file', 'normal.txt'
store "mock_reader.limit", 100_000
end
# Log the Ruby implementation/version string for this run.
logger.info RUBY_DESCRIPTION
################################
###### CORE FIELDS #############
################################
# Record id taken from spec "001"; :first keeps only the first extracted value.
to_field "id", extract_marc("001", :first => true)
################################
######## IDENTIFIERS ###########
################################
# ISBNs: index every normalized form of each extracted value together with
# the values exactly as they were extracted.
to_field 'isbn', extract_marc('020az', separator: nil) do |_record, acc|
  normalized = acc.map { |raw| StdNum::ISBN.allNormalizedValues(raw) }
  # Normalized forms first, then the originals; flatten the per-value
  # arrays and drop repeats, writing the result back into the accumulator.
  acc.replace((normalized + acc).flatten.uniq)
end
# ISSNs from several 022 subfields plus 247x.
to_field 'issn', extract_marc('022a:022l:022m:022y:022z:247x')
# Numbers pulled from the x/z subfields of a long list of other tags.
# NOTE(review): presumably standard numbers of related items -- confirm.
to_field 'isn_related', extract_marc("400x:410x:411x:440x:490x:500x:510x:534xz:556z:581z:700x:710x:711x:730x:760x:762x:765xz:767xz:770xz:772x:773xz:774xz:775xz:776xz:777x:780xz:785xz:786xz:787xz")
# Values from 086 subfields a and z.
to_field 'sudoc', extract_marc('086az')
# UC started sending values with leading spaces, so strip the surrounding
# whitespace from every extracted LCCN in place.
to_field 'lccn', extract_marc('010a') do |_record, acc|
  acc.map!(&:strip)
end
# Values from 088a and 974a respectively.
to_field 'rptnum', extract_marc('088a')
to_field 'barcode', extract_marc('974a')
################################
######### AUTHOR FIELDS ########
################################
to_field 'mainauthor', extract_marc('100abcd:110abcd:111abc')
# 'mainauthor_role' is filled by two separate rules: the e subfields with
# punctuation trimmed, and the 4 subfields passed through the
# "ht/relators" translation map.
to_field 'mainauthor_role', extract_marc('100e:110e:111e', :trim_punctuation => true)
to_field 'mainauthor_role', extract_marc('1004:1104:1114', :translation_map => "ht/relators")
################################
########## TITLES ##############
################################
# For titles, we want with and without
# NOTE(review): the sentence above is unfinished in the original; what the
# titles are wanted "with and without" is not stated in this file.
to_field 'title_c', extract_marc('245c')
# Alternate-script version of the 245 title only, punctuation trimmed,
# first value only.
to_field 'vtitle', extract_marc('245abdefghknp', :alternate_script=>:only, :trim_punctuation => true, :first=>true)
to_field "title_top", extract_marc("240adfghklmnoprs0:245abfgknps:247abfgknps:111acdefgjklnpqtu04:130adfgklmnoprst0")
to_field "title_rest", extract_marc("210ab:222ab:242abnpy:243adfgklmnoprs:246abdenp:247abdenp:700fgjklmnoprstx03:710fgklmnoprstx03:711acdefgjklnpqstux034:730adfgklmnoprstx03:740anp:765st:767st:770st:772st:773st:775st:776st:777st:780st:785st:786st:787st:830adfgklmnoprstv:440anpvx:490avx:505t")
to_field "series", extract_marc("440ap:800abcdfpqt:830ap")
to_field "series2", extract_marc("490a")
# NOTE(review): '264|*1|' looks like an indicator constraint on tag 264 --
# confirm against the extract_marc spec syntax.
to_field "publisher", extract_marc('260b:264|*1|:533c')
to_field "edition", extract_marc('250a')
# Language code from 008 positions 35-37 (first value only).
# Values with no non-whitespace character are removed, then duplicates are
# dropped. Both steps mutate the accumulator in place: a non-destructive
# `reject` would build a filtered copy and throw it away, leaving the blank
# values in the accumulator.
to_field 'language008', extract_marc('008[35-37]', :first=>true) do |_record, acc|
  acc.reject! { |code| code !~ /\S/ }
  acc.uniq!
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment