Created
July 15, 2013 19:06
-
-
Save billdueber/6002510 to your computer and use it in GitHub Desktop.
Simple parslet parser for same MARC field (not subfield) query string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'parslet' | |
# A complex field-selection syntax to get MARC fields. Something I'm messing around
# with for a MARC indexing process I'm thinking of building to replace marcspec.
#
#
# spec := <tag>
#      := <tag>!<ind><ind>
# tag  := '245'                      # literal string
#      := '6##'                      # use hashes to mean "any character"
#      := '[600,601,602]'            # list in square brackets
#      := '[600-650]'                # hyphen-delimited range
#      := '[010, 020, 022, 090-099]' # combination
#
# ind  := '1'                        # literal
#      := '_'                        # underscore to mean a space
#      := '#'                        # hash to mean "any character"
#      := '[1,3]'                    # a list
#      := '[0-9]'                    # range
#      := '[1, 5-9]'                 # combination
# Some examples
#
#   001                            -- the 001
#   245                            -- the 245
#   [020,022,400,410-411,440,490]  -- a bunch of standard identifiers
#   264!#1                         -- RDA-style publisher
#
# Obviously, this only deals with the field level. Getting actual data out of the
# subfields is another kettle of fish. And turning it into an efficient mechanism
# for actually pulling fields out of a MARC record is yet another. Blargh.
#
# Parslet grammar for a MARC field-level query string: a tag expression,
# optionally followed by '!' and two indicator expressions.
#
# Keys that can appear in the resulting parse tree (from the .as calls below):
#   :tag, :simple_tag, :hashtag, :rangetag (holding :rtstart / :rtend),
#   :ind1, :ind2, :iscalar, :irange (holding :start / :end)
class MARCQuery < Parslet::Parser
  # Lexical helpers

  # One or more whitespace characters, so padding of any width is accepted
  # wherever space? appears.
  rule(:space)      { match['\s'].repeat(1) }
  rule(:space?)     { space.maybe }

  # A literal comma with optional whitespace on either side. This has to be
  # str(','): match['...'] wraps its argument in a one-character class, so a
  # pattern such as '\s*,\s*' would accept a single '*' or blank as the
  # separator instead of requiring a comma.
  rule(:comma)      { space? >> str(',') >> space? }
  rule(:num)        { match['\d'] }
  # Lowercase letters and digits only.
  rule(:alnum)      { match['a-z0-9'] }
  rule(:hsh)        { str('#') }
  rule(:underscore) { str('_') }
  rule(:lb)         { str('[') >> space? }
  rule(:rb)         { space? >> str(']') }
  # Single-digit range such as 1-9, used for indicators.
  rule(:nrange)     { num.as(:start) >> str('-') >> num.as(:end) }

  # Tags

  # Exactly three characters; repeat(3) on its own would only set a minimum
  # and let over-long tags such as '2456' through.
  rule(:simple_tag)    { alnum.repeat(3, 3) }
  # Three characters where any position may be the '#' wildcard, e.g. 6##.
  rule(:hashtag)       { (alnum | hsh).repeat(3, 3) }
  rule(:rangetag)      { simple_tag.as(:rtstart) >> str('-') >> simple_tag.as(:rtend) }
  # Order matters: a range starts with a simple tag, and a simple tag is also
  # a valid hashtag, so the more specific alternatives are tried first.
  rule(:generic_tag)   { rangetag.as(:rangetag) | simple_tag.as(:simple_tag) | hashtag.as(:hashtag) }
  rule(:tag_list_tail) { (comma >> generic_tag).repeat }
  rule(:tag_list)      { lb >> generic_tag >> tag_list_tail >> rb }
  rule(:tag)           { (tag_list | generic_tag).as(:tag) }

  # Indicators

  rule(:simple_indicator)    { num | hsh | underscore }
  rule(:range_indicator)     { nrange.as(:irange) }
  rule(:generic_indicator)   { range_indicator | simple_indicator.as(:iscalar) }
  rule(:indicator_list_tail) { (comma >> generic_indicator).repeat }
  rule(:indicator_list)      { lb >> generic_indicator >> indicator_list_tail >> rb }
  rule(:indicator)           { indicator_list | generic_indicator }

  # Tag plus two indicators (optionally separated from the tag by a '!').
  # The indicator form is tried first; a bare tag is the fallback.
  rule(:tagi) { (tag >> str('!') >> indicator.as(:ind1) >> indicator.as(:ind2)) | tag }
  root(:tagi)
end
# Parslet transform that rewrites the keyed nodes of a MARCQuery parse tree
# into plain Ruby values: strings, ranges of strings, and regexps.
class MARCQueryTransform < Parslet::Transform
  # Turn a tag pattern into a case-insensitive Regexp, replacing every '#'
  # with the POSIX class [[:alnum:]].
  # Note: use re =~ str for the best speed
  def self.hashtag_to_regexp(ht)
    source = ht.to_s.gsub('#', '[[:alnum:]]')
    Regexp.new(source, 'i')
  end

  # Value handed back for the '#' indicator; it is nil here.
  @valid_indicator = nil

  # Map a scalar indicator to its value: '_' becomes a space, '#' becomes
  # @valid_indicator, and anything else is returned as a string.
  def self.i2r(ind)
    text = ind.to_s
    return ' ' if text == '_'
    return @valid_indicator if text == '#'

    text
  end

  # Tag
  rule(:simple_tag => simple(:x)) do
    x.to_s
  end

  # hashtag (e.g., 6##)
  # The block takes the bindings hash as its argument and calls the
  # class-level helper defined above.
  rule(:hashtag => simple(:x)) do |bindings|
    hashtag_to_regexp(bindings[:x])
  end

  # range {:tag=>{:rangetag=>{:rtstart=>"600"@1, :rtend=>"650"@5}}}
  # The inner start/end pair becomes a Range of strings first; the
  # :rangetag wrapper is then replaced by that value.
  rule(:rtstart => simple(:lo), :rtend => simple(:hi)) do
    (lo.to_s..hi.to_s)
  end
  rule(:rangetag => simple(:r)) do
    r
  end

  # Indicators
  rule(:iscalar => simple(:x)) do |bindings|
    i2r(bindings[:x])
  end
  rule(:irange => { :start => simple(:lo), :end => simple(:hi) }) do
    (lo.to_s..hi.to_s)
  end

  # NOTE(review): the MARCQuery grammar in this file does not appear to tag
  # any node as :li, so this rule looks unused for now -- confirm.
  rule(:li => subtree(:x)) do
    x.flatten
  end
end
# Demo driver: for each sample query, print the query, its raw parse tree and
# the transformed tree, followed by a blank line.
parser      = MARCQuery.new
transformer = MARCQueryTransform.new

[
  '245', '6##', '[600-650]', '[600, 601]', '[600, 601, 602]', '6##', '[100,110-130,140]',
  '[100,110,120]!1#',
  '100!_3',
  '245!#[1-9]',
  '245!#[1-3,9]',
].each do |query|
  puts query
  # Parse once and reuse the tree for both the raw and the transformed output.
  tree = parser.parse(query)
  puts tree
  puts transformer.apply(tree).inspect
  puts "\n"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment