Skip to content

Instantly share code, notes, and snippets.

@uasi
Created February 17, 2014 08:21
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uasi/9046773 to your computer and use it in GitHub Desktop.
Save uasi/9046773 to your computer and use it in GitHub Desktop.
Dump HTML outline as YAML
require 'nokogiri'
require 'open-uri'
require 'optparse'
require 'yaml'
# Builds a plain-old-data outline of +node+ and its element descendants.
#
# The result is an ordered map holding:
# * +:name+       - the node's name;
# * +:attributes+ - attribute name => value, sorted by attribute name
#                   (omitted when the node has no attributes);
# * +:children+   - outlines of the child nodes for which +element?+ is
#                   true, in document order (omitted when there are none).
#
# @param node [#node_name, #attribute_nodes, #children] node to describe
# @return [YAML::Omap] ordered outline of the node
def pod_from_node(node)
  pod = YAML::Omap.new
  pod[:name] = node.node_name

  # Sorting by name keeps the dumped output stable regardless of the order
  # in which the attributes were written in the markup.
  sorted_attributes = node.attribute_nodes.sort_by(&:node_name)
  attribute_map = sorted_attributes.each_with_object(YAML::Omap.new) do |attribute, acc|
    acc[attribute.node_name] = attribute.value
  end
  pod[:attributes] = attribute_map unless attribute_map.empty?

  # Only element children are kept; other node kinds are not part of the outline.
  element_pods = node.children.select(&:element?).map { |element| pod_from_node(element) }
  pod[:children] = element_pods unless element_pods.empty?

  pod
end
# Removes the nodes selected by exclude rules from +html+, in place.
#
# Rules are applied in the order given. Each rule is a +[type, selector]+
# pair. An +:exclude+ rule tags every matching node for removal; any other
# type tags every matching node and all of its ancestors as kept, overriding
# tags written by earlier rules. Once all rules have run, nodes still tagged
# for removal are removed and the temporary tag attribute is stripped from
# every remaining node.
#
# @param html [#root] parsed document; it is modified in place
# @param rules [Array<Array>] +[type, selector]+ pairs
# @return [Object] the same +html+ object that was passed in
def prune_html(html, rules)
  marker = 'html2yaml-exclude'
  root = html.root

  rules.each do |rule|
    kind, query = *rule
    excluding = kind == :exclude

    root.search(query).each do |hit|
      hit[marker] = excluding ? 'yes' : 'no'
      next if excluding

      # A kept node needs its whole ancestor chain kept too, otherwise it
      # would disappear together with an excluded ancestor.
      hit.ancestors.each { |parent| parent[marker] = 'no' }
    end
  end

  root.search('[html2yaml-exclude = yes]').remove
  root.traverse { |visited| visited.remove_attribute(marker) }
  html
end
# Opens +source+ for reading and yields the IO to the block, closing it
# afterwards.
#
# A source that parses as a URI with an openable scheme (open-uri adds
# +#open+ to such URI objects) is fetched through open-uri; anything else is
# treated as a local file path. Kernel#open is deliberately avoided: it no
# longer opens URLs on Ruby 3.0+, and it would run a source starting with "|"
# as a shell command.
#
# @param source [String] URL or local file path
# @yieldparam io [IO] readable stream for the source
# @return [Object] the value returned by the block
def open_source(source, &block)
  uri = begin
    URI.parse(source)
  rescue URI::InvalidURIError
    nil # not a URI at all, so it can only be a file path
  end

  if uri.respond_to?(:open)
    uri.open(&block)
  else
    File.open(source, &block)
  end
end

# Command-line entry point: parses -i/--include and -x/--exclude rules from
# ARGV, reads the HTML document named by the first remaining argument (URL or
# file path), prunes it according to the rules and prints its outline as YAML.
#
# Aborts with a usage message when no source argument is given.
#
# @return [nil]
def main
  rules = []
  opt = OptionParser.new
  # Include and exclude rules share one list so that their relative order on
  # the command line is preserved.
  opt.on('-i', '--include=XPATH_OR_CSS') { |v| rules << [:include, v] }
  opt.on('-x', '--exclude=XPATH_OR_CSS') { |v| rules << [:exclude, v] }
  opt.parse!(ARGV)

  source = ARGV.shift
  abort "Usage: html2yaml.rb URL_OR_PATH" unless source

  # Parse inside the block so the stream is closed as soon as it is consumed.
  html = open_source(source) { |io| Nokogiri::HTML(io) }
  html = prune_html(html, rules)
  data = pod_from_node(html.root)
  puts YAML.dump(data)
end
# Run the command-line tool only when this file is executed directly, so that
# loading or requiring it (e.g. to reuse its methods) has no side effects.
main if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment