Skip to content

Instantly share code, notes, and snippets.

@afiore
Last active August 29, 2015 13:57
Show Gist options
  • Save afiore/9565270 to your computer and use it in GitHub Desktop.
Save afiore/9565270 to your computer and use it in GitHub Desktop.
A generic (flat) collection parser using Nokogiri's SAX capabilities
require 'nokogiri'
# SAX handler that extracts a flat collection of records from an XML stream.
#
# Every element named +item_tag+ opens a new record (a Hash). Inside a record,
# the text of each tracked element is stored under the element's name, and each
# whitelisted attribute is stored under the key "<element>_<attribute>".
# Finished records are appended to {#results} when the item element closes.
#
# Tracked elements are expected to be flat (not nested inside one another);
# text is only captured between the opening and closing tag of the tracked
# element that is currently open.
class CollectionParser < Nokogiri::XML::SAX::Document
  # @return [Array<Hash{String => String}>] the records collected so far
  attr_reader :results

  # Initializes a generic SAX parser for extracting collections from XML
  #
  # @param item_tag [String] name of the element that delimits one record
  # @param el_attrs [Hash{String => Array<String>}] tracked element names
  #   mapped to the attribute names to capture for each of them
  def initialize(item_tag, el_attrs)
    @item_tag = item_tag
    @el_attrs = el_attrs
    @elems = el_attrs.keys
    @results = []
    @current_item = nil
    @current_elem = nil
    @current_elem_value = +''
  end

  # SAX callback: an element was opened.
  #
  # @param el [String] element name
  # @param attr_pairs [Array<Array(String, String)>] attribute name/value pairs
  def start_element(el, attr_pairs = [])
    case el
    when @item_tag
      @current_item = {}
    when *@elems
      set_attributes(el, attr_pairs)
      @current_elem = el
      # Start from a clean buffer so text seen before this tag (whitespace
      # between siblings, content of untracked elements) cannot leak in.
      @current_elem_value = +''
    end
  end

  # SAX callback: an element was closed.
  #
  # @param el [String] element name
  def end_element(el)
    case el
    when @item_tag
      append_current_item
    when *@elems
      set_tag_value
    end
  end

  # SAX callback: character data was read.
  #
  # @param cs [String] text chunk
  def characters(cs)
    append_text(cs)
  end

  # SAX callback: a CDATA section was read; treated like ordinary text.
  #
  # @param cs [String] CDATA content
  def cdata_block(cs)
    append_text(cs)
  end

  private

  # Pushes the finished record onto the results and resets the per-record state.
  def append_current_item
    @results << @current_item if @current_item
    # Drop the reference so tracked elements that appear outside of an item
    # are not merged into an already-published record.
    @current_item = nil
    @current_elem = nil
    @current_elem_value = +''
  end

  # Stores the buffered text under the current element's name, then stops
  # capturing text until the next tracked element opens.
  def set_tag_value
    @current_item[@current_elem] = @current_elem_value if relevant_tag?
    @current_elem = nil
    @current_elem_value = +''
  end

  # Copies the whitelisted attributes of +el+ into the current record.
  def set_attributes(el, attr_pairs)
    return unless @current_item

    attr_pairs.each do |attr, value|
      @current_item["#{el}_#{attr}"] = value if relevant_attribute?(el, attr)
    end
  end

  # Whether +attr+ is whitelisted for element +el+.
  def relevant_attribute?(el, attr)
    Array(@el_attrs[el]).include?(attr)
  end

  # Whether a record is open and a tracked element is being read.
  def relevant_tag?
    @current_item && @current_elem
  end

  # Appends text to the buffer, but only while a tracked element is open.
  def append_text(cs)
    @current_elem_value << cs if @current_elem
  end
end
require_relative './collection_parser'
require 'open-uri'
# Usage example: collect every <item> of an RSS feed into a Hash.
# Keys are the tracked element names; the listed attributes are captured as
# "<element>_<attribute>" (here: "guid_isPermaLink").
el_attrs = {
  'title' => [],
  'link' => [],
  'guid' => ['isPermaLink'],
  'pubDate' => [],
  'description' => []
}

# URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
# on Ruby 3.0+. The block form closes the stream once it has been read.
xml = URI.open('https://www.reddit.com/r/bigdata.rss?limit=100', &:read)

feed = CollectionParser.new('item', el_attrs)
parser = Nokogiri::XML::SAX::Parser.new(feed)
parser.parse(xml)
feed.results.last
#=> {"title"=>"The Big Data Showstoppers: Frameworks", "link"=>"http://www.reddit.com/r/bigdata/comments/1x658q/the_big_data_showstoppers_frameworks/", "guid_isPermaLink"=>"true", "guid"=>"...", "pubDate"=>"Thu, 06 Feb 2014 02:31:07 -0800", "description"=>"..."}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment