# billdueber/marcquery.rb

## marcquery.rb
require 'parslet'

# A complex field-selection syntax to get MARC fields. Something I'm messing around
# with for a marc indexing process I'm thinking of building to replace marcspec
#
#
# spec := <tag>
#         <tag>!<ind><ind>
# tag  := '245' # literal string
#      := '6##' # use hashes to mean "any character"
#      := '[600,601,602]' # list in square brackets
#      := '[600-650]' # hyphen-delimited range
#      := '[010, 020, 022, 090-099]' # Combination
#
# ind  := '1' # literal
#      := '_' # underscore to mean a space
#      := '#' # hash to mean "any character"
#      := [1,3] # A list
#      := [0-9] #  range
#      := [1, 5-9] # combination


# Some examples
#
# 001 -- the 001
# 245 -- the 245
# [020,022,400,410-411,440,490] -- a bunch of standard identifiers
# 264!#1 -- RDA-style publisher
#

# Obviously, this only deals with the field level. Getting actual data out of the
# subfields is another kettle of fish. And turning it into an efficient mechanism
# for actually pulling fields out of a MARC record is yet another. Blargh.
#

# Parslet grammar for a MARC field-selection spec: a tag expression,
# optionally followed by '!' and exactly two indicator expressions.
#
# Captures produced (via +.as+):
#   :tag            the whole tag expression (single tag or bracketed list)
#   :simple_tag     three literal characters, e.g. 245
#   :hashtag        three characters where '#' is a wildcard, e.g. 6##
#   :rangetag       { :rtstart, :rtend } for 600-650
#   :ind1 / :ind2   the two indicator expressions
#   :iscalar        a single indicator character (digit, '#' or '_')
#   :irange         { :start, :end } for a digit range such as 1-9
class MARCQuery < Parslet::Parser
  # --- Lexical atoms -------------------------------------------------------
  # match['...'] builds a character class that consumes exactly ONE
  # character, so repetition has to be spelled with repeat/maybe rather
  # than with regexp quantifiers inside the brackets.
  rule(:space)      { match['\s'] }
  rule(:space?)     { space.maybe }
  rule(:spaces?)    { space.repeat }
  # List separator: a literal comma with any amount of whitespace around it.
  rule(:comma)      { spaces? >> str(',') >> spaces? }
  rule(:num)        { match['\d'] }
  rule(:alnum)      { match['a-z0-9'] }
  rule(:hsh)        { str('#') }
  rule(:underscore) { str('_') }
  rule(:lb)         { str('[') >> space? }
  rule(:rb)         { space? >> str(']') }

  # Single-digit range such as 1-9 (used for indicators).
  rule(:nrange) { num.as(:start) >> str('-') >> num.as(:end) }

  # --- Tags ----------------------------------------------------------------
  # A tag is exactly three characters; repeat(3, 3) pins both min and max.
  rule(:simple_tag) { alnum.repeat(3, 3) }
  rule(:hashtag)    { (alnum | hsh).repeat(3, 3) }
  rule(:rangetag)   { simple_tag.as(:rtstart) >> str('-') >> simple_tag.as(:rtend) }
  # Alternatives are tried in order: a range begins with a simple tag, and a
  # simple tag is also a valid hashtag, so the most specific form goes first.
  rule(:generic_tag) do
    rangetag.as(:rangetag) | simple_tag.as(:simple_tag) | hashtag.as(:hashtag)
  end

  rule(:tag_list_tail) { (comma >> generic_tag).repeat }
  rule(:tag_list)      { lb >> generic_tag >> tag_list_tail >> rb }

  rule(:tag) { (tag_list | generic_tag).as(:tag) }

  # --- Indicators ----------------------------------------------------------
  rule(:simple_indicator)    { num | hsh | underscore }
  rule(:range_indicator)     { nrange.as(:irange) }
  # The range is tried first because it starts with a digit that would
  # otherwise be consumed as a scalar indicator.
  rule(:generic_indicator)   { range_indicator | simple_indicator.as(:iscalar) }
  rule(:indicator_list_tail) { (comma >> generic_indicator).repeat }
  rule(:indicator_list)      { lb >> generic_indicator >> indicator_list_tail >> rb }

  rule(:indicator) { indicator_list | generic_indicator }

  # Tag plus two indicators (optionally separated from the tag by a '!').
  # The longer form is tried first; a bare tag is the fallback.
  rule(:tagi) { (tag >> str('!') >> indicator.as(:ind1) >> indicator.as(:ind2)) | tag }

  root(:tagi)
end


# Turns the parse tree produced by MARCQuery into plain Ruby values:
# literal tags become Strings, wildcard tags become Regexps, and tag or
# indicator ranges become Ranges of Strings.
class MARCQueryTransform < Parslet::Transform
  # Value substituted for a '#' ("any character") indicator. It is nil here.
  @valid_indicator = nil

  # Builds the matcher for a wildcard tag such as 6##.
  #
  # Each '#' is replaced by a one-character alphanumeric class. The result is
  # anchored at both ends so it only matches a whole tag, never a substring
  # of a longer string.
  #
  # Original author's note: use re =~ str for the best speed.
  #
  # @param ht [#to_s] the wildcard tag text
  # @return [Regexp] case-insensitive, fully anchored pattern
  def self.hashtag_to_regexp(ht)
    body = ht.to_s.gsub('#', '[[:alnum:]]')
    Regexp.new("\\A#{body}\\z", Regexp::IGNORECASE)
  end

  # Maps a single indicator token to its matching value.
  #
  # @param ind [#to_s] the indicator text from the parse tree
  # @return [String, nil] ' ' for '_', @valid_indicator for '#',
  #   otherwise the token itself as a String
  def self.i2r(ind)
    token = ind.to_s
    case token
    when '_' then ' '
    when '#' then @valid_indicator
    else token
    end
  end

  # --- Tags ----------------------------------------------------------------

  rule(simple_tag: simple(:x)) { x.to_s }

  # Wildcard tag (e.g., 6##). A block that takes an argument receives the
  # bindings hash and keeps this class as self, which is how it reaches the
  # class-level helper.
  rule(hashtag: simple(:x)) { |d| hashtag_to_regexp(d[:x]) }

  # Range, e.g. {:tag=>{:rangetag=>{:rtstart=>"600"@1, :rtend=>"650"@5}}}.
  # The inner hash is rewritten to a Range first; the wrapper then unwraps it.
  rule(rtstart: simple(:s), rtend: simple(:e)) { (s.to_s..e.to_s) }
  rule(rangetag: simple(:r)) { r }

  # --- Indicators ----------------------------------------------------------

  rule(iscalar: simple(:x)) { |d| i2r(d[:x]) }
  rule(irange: { start: simple(:s), end: simple(:e) }) { (s.to_s..e.to_s) }
  # NOTE(review): no rule in the grammar visible in this file tags a subtree
  # as :li, so this rule only fires if such a capture is added -- confirm.
  rule(li: subtree(:x)) { x.flatten }
end

# Demo driver: parse a handful of sample specs and print, for each one, the
# spec itself, the raw parse tree, and the transformed result.
parser      = MARCQuery.new
transformer = MARCQueryTransform.new

sample_specs = [
  '245', '6##', '[600-650]', '[600, 601]', '[600, 601, 602]', '6##', '[100,110-130,140]',
  '[100,110,120]!1#',
  '100!_3',
  '245!#[1-9]',
  '245!#[1-3,9]'
]

sample_specs.each do |spec|
  puts spec
  # Parse once and reuse the tree for both the raw and the transformed output.
  tree = parser.parse(spec)
  puts tree
  puts transformer.apply(tree).inspect
  puts "\n"
end
	require 'parslet'

	# A complex field-selection syntax to get MARC fields. Something I'm messing around
	# with for a marc indexing process I'm thinking of building to replace marcspec
	#
	#
	# spec := <tag>
	#         <tag>!<ind><ind>
	# tag  := '245' # literal string
	#      := '6##' # use hashes to mean "any character"
	#      := '[600,601,602]' # list in square brackets
	#      := '[600-650]' # hyphen-delimited range
	#      := '[010, 020, 022, 090-099]' # Combination
	#
	# ind  := '1' # literal
	#      := '_' # underscore to mean a space
	#      := '#' # hash to mean "any character"
	#      := [1,3] # A list
	#      := [0-9] # range
	#      := [1, 5-9] # combination


	# Some examples
	#
	# 001 -- the 001
	# 245 -- the 245
	# [020,022,400,410-411,440,490] -- a bunch of standard identifiers
	# 264!#1 -- RDA-style publisher
	#

	# Obviously, this only deals with the field level. Getting actual data out of the
	# subfields is another kettle of fish. And turning it into an efficient mechanism
	# for actually pulling fields out of a MARC record is yet another. Blargh.
	#

	# Parslet grammar for a MARC field-selection spec: a tag expression,
	# optionally followed by '!' and exactly two indicator expressions.
	#
	# Captures produced (via +.as+):
	#   :tag            the whole tag expression (single tag or bracketed list)
	#   :simple_tag     three literal characters, e.g. 245
	#   :hashtag        three characters where '#' is a wildcard, e.g. 6##
	#   :rangetag       { :rtstart, :rtend } for 600-650
	#   :ind1 / :ind2   the two indicator expressions
	#   :iscalar        a single indicator character (digit, '#' or '_')
	#   :irange         { :start, :end } for a digit range such as 1-9
	class MARCQuery < Parslet::Parser
	  # --- Lexical atoms -----------------------------------------------------
	  # match['...'] builds a character class that consumes exactly ONE
	  # character, so repetition has to be spelled with repeat/maybe rather
	  # than with regexp quantifiers inside the brackets.
	  rule(:space)      { match['\s'] }
	  rule(:space?)     { space.maybe }
	  rule(:spaces?)    { space.repeat }
	  # List separator: a literal comma with any amount of whitespace around it.
	  rule(:comma)      { spaces? >> str(',') >> spaces? }
	  rule(:num)        { match['\d'] }
	  rule(:alnum)      { match['a-z0-9'] }
	  rule(:hsh)        { str('#') }
	  rule(:underscore) { str('_') }
	  rule(:lb)         { str('[') >> space? }
	  rule(:rb)         { space? >> str(']') }

	  # Single-digit range such as 1-9 (used for indicators).
	  rule(:nrange) { num.as(:start) >> str('-') >> num.as(:end) }

	  # --- Tags --------------------------------------------------------------
	  # A tag is exactly three characters; repeat(3, 3) pins both min and max.
	  rule(:simple_tag) { alnum.repeat(3, 3) }
	  rule(:hashtag)    { (alnum | hsh).repeat(3, 3) }
	  rule(:rangetag)   { simple_tag.as(:rtstart) >> str('-') >> simple_tag.as(:rtend) }
	  # Alternatives are tried in order: a range begins with a simple tag, and a
	  # simple tag is also a valid hashtag, so the most specific form goes first.
	  rule(:generic_tag) do
	    rangetag.as(:rangetag) | simple_tag.as(:simple_tag) | hashtag.as(:hashtag)
	  end

	  rule(:tag_list_tail) { (comma >> generic_tag).repeat }
	  rule(:tag_list)      { lb >> generic_tag >> tag_list_tail >> rb }

	  rule(:tag) { (tag_list | generic_tag).as(:tag) }

	  # --- Indicators --------------------------------------------------------
	  rule(:simple_indicator)    { num | hsh | underscore }
	  rule(:range_indicator)     { nrange.as(:irange) }
	  # The range is tried first because it starts with a digit that would
	  # otherwise be consumed as a scalar indicator.
	  rule(:generic_indicator)   { range_indicator | simple_indicator.as(:iscalar) }
	  rule(:indicator_list_tail) { (comma >> generic_indicator).repeat }
	  rule(:indicator_list)      { lb >> generic_indicator >> indicator_list_tail >> rb }

	  rule(:indicator) { indicator_list | generic_indicator }

	  # Tag plus two indicators (optionally separated from the tag by a '!').
	  # The longer form is tried first; a bare tag is the fallback.
	  rule(:tagi) { (tag >> str('!') >> indicator.as(:ind1) >> indicator.as(:ind2)) | tag }

	  root(:tagi)
	end


	# Turns the parse tree produced by MARCQuery into plain Ruby values:
	# literal tags become Strings, wildcard tags become Regexps, and tag or
	# indicator ranges become Ranges of Strings.
	class MARCQueryTransform < Parslet::Transform
	  # Value substituted for a '#' ("any character") indicator. It is nil here.
	  @valid_indicator = nil

	  # Builds the matcher for a wildcard tag such as 6##.
	  #
	  # Each '#' is replaced by a one-character alphanumeric class. The result
	  # is anchored at both ends so it only matches a whole tag, never a
	  # substring of a longer string.
	  #
	  # Original author's note: use re =~ str for the best speed.
	  #
	  # @param ht [#to_s] the wildcard tag text
	  # @return [Regexp] case-insensitive, fully anchored pattern
	  def self.hashtag_to_regexp(ht)
	    body = ht.to_s.gsub('#', '[[:alnum:]]')
	    Regexp.new("\\A#{body}\\z", Regexp::IGNORECASE)
	  end

	  # Maps a single indicator token to its matching value.
	  #
	  # @param ind [#to_s] the indicator text from the parse tree
	  # @return [String, nil] ' ' for '_', @valid_indicator for '#',
	  #   otherwise the token itself as a String
	  def self.i2r(ind)
	    token = ind.to_s
	    case token
	    when '_' then ' '
	    when '#' then @valid_indicator
	    else token
	    end
	  end

	  # --- Tags --------------------------------------------------------------

	  rule(simple_tag: simple(:x)) { x.to_s }

	  # Wildcard tag (e.g., 6##). A block that takes an argument receives the
	  # bindings hash and keeps this class as self, which is how it reaches the
	  # class-level helper.
	  rule(hashtag: simple(:x)) { |d| hashtag_to_regexp(d[:x]) }

	  # Range, e.g. {:tag=>{:rangetag=>{:rtstart=>"600"@1, :rtend=>"650"@5}}}.
	  # The inner hash is rewritten to a Range first; the wrapper then unwraps it.
	  rule(rtstart: simple(:s), rtend: simple(:e)) { (s.to_s..e.to_s) }
	  rule(rangetag: simple(:r)) { r }

	  # --- Indicators --------------------------------------------------------

	  rule(iscalar: simple(:x)) { |d| i2r(d[:x]) }
	  rule(irange: { start: simple(:s), end: simple(:e) }) { (s.to_s..e.to_s) }
	  # NOTE(review): no rule in the grammar visible in this file tags a subtree
	  # as :li, so this rule only fires if such a capture is added -- confirm.
	  rule(li: subtree(:x)) { x.flatten }
	end

	# Demo driver: parse a handful of sample specs and print, for each one, the
	# spec itself, the raw parse tree, and the transformed result.
	parser      = MARCQuery.new
	transformer = MARCQueryTransform.new

	sample_specs = [
	  '245', '6##', '[600-650]', '[600, 601]', '[600, 601, 602]', '6##', '[100,110-130,140]',
	  '[100,110,120]!1#',
	  '100!_3',
	  '245!#[1-9]',
	  '245!#[1-3,9]'
	]

	sample_specs.each do |spec|
	  puts spec
	  # Parse once and reuse the tree for both the raw and the transformed output.
	  tree = parser.parse(spec)
	  puts tree
	  puts transformer.apply(tree).inspect
	  puts "\n"
	end