Created
December 20, 2013 11:25
-
-
Save etoyoda/8053529 to your computer and use it in GitHub Desktop.
Ruby script to detect default namespace declaration in an OAI-PMH response containing metadata records in ISO/TS 19139.
Output is tab-separated text containing lines with two columns of (1) last namespace URI and (2) gmd:fileIdentifier/*/text().
Tested with ruby-1.8.7, libxml-ruby 1.1.3, and libxml2 20706.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'xml' | |
include XML | |
# Walks an XML file with a streaming reader (the +Reader+ constant, brought
# into scope by the file's top-level +include XML+) and reports what it sees
# to a parent object.
#
# The parent must respond to:
#   md_metadata?(xpath), fileIdentifier?(xpath)  - predicates on the element stack
#   open_rec, close_rec                          - record boundaries
#   nsdecl(name, value)                          - a namespace declaration attribute
#   fid=(text)                                   - text found below a fileIdentifier
#
# The element stack (+xpath+) is an array of [local_name, namespace_uri]
# pairs, outermost element first.
class Scanner
  # parent:: the object that receives the callbacks listed above.
  def initialize(parent)
    @parent = parent
    @reader = @xpath = nil
  end

  # Visits every attribute of the current element and forwards those that
  # are namespace declarations to the parent as (name, value).
  # Leaves the reader cursor on an attribute.
  def check_atts
    count = @reader.attribute_count
    @reader.move_to_first_attribute
    count.times do
      @parent.nsdecl(@reader.name, @reader.value) if @reader.namespace_declaration?
      @reader.move_to_next_attribute
    end
  end

  # Pushes the current element onto the stack and opens a record when the
  # parent says the new stack addresses a metadata record.
  def open_tag
    @xpath.push [@reader.local_name, @reader.namespace_uri]
    @parent.open_rec if @parent.md_metadata?(@xpath)
  end

  # Pops the innermost element, closing the record first when the stack
  # still addresses a metadata record.
  #
  # nametest:: local name of the end tag, or nil to skip the check (used for
  #            empty elements, which have no separate end tag).
  # Raises RuntimeError when the popped name differs from +nametest+.
  def close_tag(nametest = nil)
    @parent.close_rec if @parent.md_metadata?(@xpath)
    name = @xpath.pop.first
    return unless nametest
    raise "BadXML <#{name}>...</#{nametest}>" unless name == nametest
  end

  # Scans one file, driving the parent callbacks for each text, start-element
  # and end-element node. Other node types are ignored.
  #
  # fpath:: path handed to Reader.file.
  #
  # The reader is always closed on the way out, including when close_tag
  # raises, so scanning many files does not accumulate open inputs.
  def scanfile(fpath)
    @reader = Reader.file(fpath)
    @xpath = []
    begin
      while @reader.read
        case @reader.node_type
        when Reader::TYPE_TEXT
          @parent.fid = @reader.value if @parent.fileIdentifier?(@xpath)
        when Reader::TYPE_ELEMENT
          open_tag
          # Sample emptiness before check_atts moves the cursor onto an
          # attribute (presumably empty_element? is only meaningful while
          # positioned on the element itself - confirm against libxml2).
          emptyp = @reader.empty_element?
          check_atts if @reader.has_attributes?
          close_tag if emptyp
        when Reader::TYPE_END_ELEMENT
          close_tag @reader.local_name
        end
      end
    ensure
      @reader.close
      @reader = nil
    end
  end
end
# Receives callbacks from a Scanner, accumulates the state of the record
# currently being read, and prints one tab-separated line per finished record.
#
# A record is a Hash holding :nr (running record number), :fid (text stored
# via fid=) and one String key per namespace-declaration attribute name
# reported through nsdecl (e.g. 'xmlns' for a default namespace declaration).
class App
  # [local_name, namespace_uri] of the element wrapping a file identifier.
  FILE_IDENTIFIER = ['fileIdentifier', 'http://www.isotc211.org/2005/gmd'].freeze
  # [local_name, namespace_uri] of the element whose children are records.
  RECORD_CONTAINER = ['metadata', 'http://www.openarchives.org/OAI/2.0/'].freeze

  def initialize
    @scanner = Scanner.new(self)
    @rec = nil
    @count = 0
  end

  # Starts a new record, flushing any record that is still open.
  def open_rec
    close_rec
    @count += 1
    @rec = {:nr => @count}
  end

  # Finishes the open record, if any. A line "<xmlns value>\t<fid>" is
  # printed only when an 'xmlns' declaration was stored for the record;
  # later declarations overwrite earlier ones, so the last one wins.
  def close_rec
    return unless @rec
    puts [@rec['xmlns'], @rec[:fid]].join("\t") if @rec['xmlns']
    @rec = nil
  end

  # Scans one file and flushes a record left open at the end of it.
  #
  # args:: a single file path (one element of ARGV).
  def arg(args)
    @scanner.scanfile args
    close_rec
  end

  # True when the parent of the innermost element on the stack is
  # gmd:fileIdentifier.
  def fileIdentifier?(xpath)
    xpath[-2] == FILE_IDENTIFIER
  end

  # Stores the file identifier on the open record. Ignored when no record is
  # open, matching nsdecl, instead of failing on a nil record.
  def fid=(fid)
    return unless @rec
    @rec[:fid] = fid
  end

  # True when the parent of the innermost element on the stack is the
  # OAI-PMH metadata element, i.e. the innermost element is a record root.
  def md_metadata?(xpath)
    xpath[-2] == RECORD_CONTAINER
  end

  # Remembers a namespace declaration on the open record, keyed by the
  # attribute name. Ignored when no record is open.
  def nsdecl(prefix, uri)
    return unless @rec
    @rec[prefix] = uri
  end
end
# Entry point: scan each file named on the command line with a single App.
runner = App.new
ARGV.each do |path|
  runner.arg(path)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment