Created
July 27, 2012 19:37
-
-
Save chriseppstein/3190080 to your computer and use it in GitHub Desktop.
This Ruby script summarizes the HTML5 semantic structure of a web page so that you can more easily verify that the page's outline is correct.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
# Location of the page to summarize, taken from the first command-line
# argument; nil when the script is run without arguments.
url = ARGV.first
# SAX handler that prints an indented outline of a page's semantic structure.
#
# For every element named in SEMANTIC_CONTAINERS a line with the tag name
# (plus "#id" when the element has an id attribute) is printed as soon as the
# element opens, indented by one space per enclosing container. Elements named
# in TEXTUAL_ELEMENTS are printed when they close, together with their text.
# Elements named in COUNT_ELEMENTS are tallied per innermost container and the
# tallies are printed when that container closes.
class Document < Nokogiri::XML::SAX::Document
  SEMANTIC_CONTAINERS = %w(body article section nav aside hgroup header footer).freeze
  COUNT_ELEMENTS = %w(p a).freeze
  TEXTUAL_ELEMENTS = %w(h1 h2 h3 h4 h5 h6 title).freeze

  def initialize(*args)
    super
    @depth = 0          # number of currently open semantic containers
    @text = nil         # text buffer of the open textual element, nil when none is open
    @text_id = nil      # id attribute of the open textual element
    @counts = []        # stack of { element name => occurrences }, one hash per open container
    @last_headings = [] # pushed/popped alongside @counts; not read anywhere in this class
  end

  # SAX callback for an opening tag.
  #
  # name       - element name as reported by the parser.
  # attributes - list of [attribute name, value] pairs.
  def start_element(name, attributes = [])
    @id = nil
    attributes.each do |(attr_name, value)|
      @id = value if attr_name == "id"
    end

    if SEMANTIC_CONTAINERS.include?(name)
      start_semantic_container(name)
    elsif COUNT_ELEMENTS.include?(name)
      # A counted element outside every container has no tally to go into;
      # skip it instead of calling []= on nil.
      tally = @counts.last
      tally[name] = tally.fetch(name, 0) + 1 if tally
    elsif TEXTUAL_ELEMENTS.include?(name)
      @text = ""
      # Remember the id now: @id is overwritten by every nested start tag
      # before this element's end tag is seen.
      @text_id = @id
    end
  end

  # Prints the container's tag line and opens a fresh tally for it.
  def start_semantic_container(name)
    emit_tag(name)
    @depth += 1
    @counts << {}
    @text = nil
    @last_headings << nil
  end

  # Prints the tallies collected for the innermost container and closes it.
  def end_semantic_container(name)
    # Ignore a close event that has no matching open container.
    return if @counts.empty?

    @depth -= 1
    @counts.pop.each do |counted, count|
      puts "#{" " * (@depth + 1)}#{counted} - #{count} times"
    end
    @last_headings.pop
  end

  # SAX callback for character data; buffered only inside a textual element.
  def characters(text)
    # Append the chunk untouched. Text arrives in several chunks when the
    # element contains inline markup, and stripping each chunk would glue
    # neighbouring words together. Whitespace is normalized in emit_tag.
    @text << text if @text
  end

  # SAX callback for a closing tag.
  def end_element(name)
    if SEMANTIC_CONTAINERS.include?(name)
      end_semantic_container(name)
    elsif TEXTUAL_ELEMENTS.include?(name)
      emit_tag(name, @text_id)
      @text = nil
      @text_id = nil
    end
  end

  private

  # Prints one outline line: indentation, tag name, optional "#id" and, when
  # text has been buffered, the text in double quotes.
  #
  # name - element name to print.
  # id   - id to show; defaults to the id of the most recently opened element.
  def emit_tag(name, id = @id)
    tag = "#{" " * @depth}#{name}"
    tag << "#" << id if id
    # Collapse whitespace runs (including newlines) to single spaces so each
    # entry stays on one output line.
    text = @text ? @text.split.join(" ") : ""
    tag << %Q{: "#{text}"} unless text.empty?
    puts tag
  end
end
# Fetch the page named on the command line and stream it through the outline
# handler, which prints the summary to standard output.
abort "usage: #{File.basename($PROGRAM_NAME)} URL" if url.nil? || url.empty?

parser = Nokogiri::HTML::SAX::Parser.new(Document.new)
# URI.open is used because open-uri stopped overriding Kernel#open in
# Ruby 3.0; the block form closes the handle once the body has been read.
parser.parse(URI.open(url, &:read))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
title: "Addressing Daily Home Care | Caring.com" | |
body#articles_show | |
header | |
a - 5 times | |
nav#top-nav-element | |
a - 36 times | |
aside#promo-phone-number-banner | |
h5: "Talk to a Caring Advisor for free:" | |
aside | |
h6: "Advertisement" | |
a - 1 times | |
aside | |
h6: "Page Sponsored By" | |
a - 1 times | |
aside | |
h6: "Advertisement" | |
a - 1 times | |
article | |
h1: "Providing Home Care for an Older Adult: A Good Fit?" | |
h2: "What to consider when an older adult needs daily care at home" | |
h3: "Will it work on a practical level for her to live at home?" | |
h3: "Can you or she afford it?" | |
h2: "Do you have enough caregiving and emotional support to provide daily care to an older adult?" | |
footer | |
h6#rating-control: "Was this article helpful?" | |
nav | |
h5: "Recommended for You" | |
aside | |
h6: "Sponsored Content" | |
a - 1 times | |
a - 5 times | |
a - 5 times | |
aside | |
h6: "Comments" | |
section | |
article | |
header | |
a - 1 times | |
h6: "Lorem ipsum dolor sit amet, consectetur" | |
footer | |
a - 4 times | |
article | |
header | |
a - 1 times | |
h6: "For almost everyone, from children" | |
footer | |
a - 4 times | |
article | |
header | |
a - 1 times | |
h6: "When my mother ran out of money," | |
footer | |
a - 4 times | |
p - 6 times | |
a - 9 times | |
aside | |
h6: "Advertisement" | |
a - 1 times | |
aside#right-rail-review-widget | |
h5#widget-title: "Latest Senior Care Reviews" | |
h6: "Find More Providers Near You" | |
a - 3 times | |
aside | |
h6: "Advertisement" | |
a - 1 times | |
aside#klippy | |
h6: "Related Topics" | |
a - 3 times | |
aside#stay-connected-unit | |
h6: "Stay Connected With Caring.com" | |
a - 2 times | |
aside#footer-gallery | |
h2: "Also from Caring.com" | |
p - 20 times | |
a - 40 times | |
aside#action-bar | |
h6: "Share this page" | |
a - 7 times | |
footer#footer | |
nav | |
h2: "General Info" | |
h2: "Quick Links" | |
h2: "Site Help" | |
a - 20 times |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment