Last active
October 31, 2017 17:20
-
-
Save billdueber/5804a5afd918d77da3ec6a80cde8d297 to your computer and use it in GitHub Desktop.
changed code and the simplistic config used for the benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Returns a lambda taking (record, accumulator, context) that appends the
# values extracted from the record to the accumulator and then rewrites the
# accumulator with the result of running the post-processing chain over it.
#
# NOTE(review): `extractor` and `translation_map` are not defined in the
# visible code; presumably they are set up in the elided section below.
def extract_marc(spec, options = {})
  # ... stuff deleted for clarity

  ## CREATE THE CHAIN
  # The chain is built once, when the macro is defined, not once per record.
  post_processors = Marc21.create_post_processing_chain(options, translation_map)

  lambda do |record, accumulator, _context|
    accumulator.concat(extractor.extract(record))

    # USE THE PPCHAIN: each step receives the previous step's output.
    processed = Marc21.apply_extraction_options(accumulator, post_processors)
    accumulator.replace(processed)
  end
end
module_function :extract_marc
# Post-processing step: returns a new array holding at most the first value
# of the accumulator (an empty accumulator yields an empty array).
# Prints a trace line on every call.
ONLY_FIRST = lambda do |values|
  puts "FIRSTING"
  values.first(1)
end
# Post-processing step: returns a new array in which every value has been
# passed through Marc21.trim_punctuation. Prints a trace line on every call.
TRIM_PUNCT = lambda do |values|
  puts "TRIMMING"
  values.map { |value| Marc21.trim_punctuation(value) }
end
# Post-processing step: returns a new array with duplicate values removed,
# keeping the first occurrence of each.
DEDUP = ->(values) { values.uniq }
# Build the ordered list of post-processing lambdas selected by +options+.
# Nothing is applied here; each returned lambda takes an accumulator array
# and returns the (possibly new) array to hand to the next step.
#
# Steps are added in this fixed order:
#   1. ONLY_FIRST          when options[:first] is truthy
#   2. translation mapping when +translation_map+ is given
#   3. TRIM_PUNCT          when options[:trim_punctuation] is truthy
#   4. DEDUP               unless options[:allow_duplicates] is truthy
#   5. default value       when options has a :default key (even if nil)
#
# @param options [Hash] extraction options, read via the keys listed above
# @param translation_map [#translate_array, nil] optional mapper
# @return [Array<Proc>] the chain, possibly empty
def self.create_post_processing_chain(options, translation_map = nil)
  chain = []

  chain << ONLY_FIRST if options[:first]
  chain << ->(acc) { translation_map.translate_array(acc) } if translation_map
  chain << TRIM_PUNCT if options[:trim_punctuation]
  chain << DEDUP unless options[:allow_duplicates]

  if options.key?(:default)
    # Capture the value now so that mutating +options+ after the chain is
    # built cannot change the configured default.
    default_value = options[:default]
    chain << ->(acc) { acc.empty? ? [default_value] : acc }
  end

  chain
end
# Run +accumulator+ through every lambda in +ppchain+, in order, feeding each
# step's return value to the next, and return the last step's result.
# With an empty chain the accumulator itself is returned.
def self.apply_extraction_options(accumulator, ppchain)
  result = accumulator
  ppchain.each { |step| result = step.call(result) }
  result
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Make the sibling lib/ directory and the current directory requirable.
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/../lib"
$LOAD_PATH.unshift '.'

require 'set'
require 'library_stdnums'
require 'marc_record_speed_monkeypatch'

# Settings stored for this benchmark run.
settings do
  store('log.batch_progress', 10_000)
  store('output_file', 'normal.txt')
  store('mock_reader.limit', 100_000)
end

# Record which Ruby produced the numbers.
logger.info(RUBY_DESCRIPTION)
################################
###### CORE FIELDS #############
################################

# Only the first extracted 001 value is kept.
to_field 'id', extract_marc('001', first: true)
################################
######## IDENTIFIERS ###########
################################

# Index every normalized form of each ISBN alongside the raw extracted values.
to_field 'isbn', extract_marc('020az', separator: nil) do |_record, accumulator|
  originals = accumulator.dup
  accumulator.map! { |value| StdNum::ISBN.allNormalizedValues(value) }
  accumulator.concat(originals)
  accumulator.flatten!
  accumulator.uniq!
end

to_field 'issn', extract_marc('022a:022l:022m:022y:022z:247x')
to_field 'isn_related',
         extract_marc("400x:410x:411x:440x:490x:500x:510x:534xz:556z:581z:700x:710x:711x:730x:760x:762x:765xz:767xz:770xz:772x:773xz:774xz:775xz:776xz:777x:780xz:785xz:786xz:787xz")
to_field 'sudoc', extract_marc('086az')

# UC started sending me leading spaces, so I need to do something
# about it.
to_field 'lccn', extract_marc('010a') do |_record, accumulator|
  accumulator.map!(&:strip)
end

to_field 'rptnum', extract_marc('088a')
to_field 'barcode', extract_marc('974a')
################################
######### AUTHOR FIELDS ########
################################

to_field 'mainauthor', extract_marc('100abcd:110abcd:111abc')

# mainauthor_role is fed by two extractions: subfield e with punctuation
# trimmed, and subfield 4 passed through the "ht/relators" translation map.
to_field 'mainauthor_role', extract_marc('100e:110e:111e', trim_punctuation: true)
to_field 'mainauthor_role', extract_marc('1004:1104:1114', translation_map: "ht/relators")
################################
########## TITLES ##############
################################

# For titles, we want with and without
to_field 'title_c', extract_marc('245c')
to_field 'vtitle',
         extract_marc('245abdefghknp', alternate_script: :only, trim_punctuation: true, first: true)

to_field 'title_top',
         extract_marc("240adfghklmnoprs0:245abfgknps:247abfgknps:111acdefgjklnpqtu04:130adfgklmnoprst0")
to_field 'title_rest',
         extract_marc("210ab:222ab:242abnpy:243adfgklmnoprs:246abdenp:247abdenp:700fgjklmnoprstx03:710fgklmnoprstx03:711acdefgjklnpqstux034:730adfgklmnoprstx03:740anp:765st:767st:770st:772st:773st:775st:776st:777st:780st:785st:786st:787st:830adfgklmnoprstv:440anpvx:490avx:505t")

to_field 'series', extract_marc("440ap:800abcdfpqt:830ap")
to_field 'series2', extract_marc("490a")

to_field 'publisher', extract_marc('260b:264|*1|:533c')
to_field 'edition', extract_marc('250a')
# Language code from 008 positions 35-37, with blank values dropped.
to_field 'language008', extract_marc('008[35-37]', first: true) do |_record, acc|
  # Filter in place: the non-bang `reject` would build a new array and throw
  # it away, leaving whitespace-only codes in the accumulator.
  acc.reject! { |code| code !~ /\S/ }
  acc.uniq!
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment