Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JoshCheek/b685aaafcb8a0ab6619b9893f576a73a to your computer and use it in GitHub Desktop.
Save JoshCheek/b685aaafcb8a0ab6619b9893f576a73a to your computer and use it in GitHub Desktop.
# The goal of this problem is to extract the headings (h1 through h6) from a
# block of HTML and arrange them hierarchically, indenting each heading
# according to its level.
#
# See the specs below for more detail on the expected output.
require 'net/http'
require 'nokogiri'
# Builds an indented outline of the headings found in an HTML string.
#
# The markup is parsed with Nokogiri and every h1..h6 element matched by the
# CSS query is turned into one line of the form "[tag] text". Each line is
# prefixed with one space per heading level below h1, so an h1 has no
# indentation, an h2 has one space, an h3 has two, and so on.
#
# html - String of HTML markup to scan.
#
# Returns an Array of Strings, one per heading, in the order the CSS query
# yields them.
def header_hierarchy(html)
  # Heading levels: https://www.w3.org/MarkUp/html3/headings.html
  headings = Nokogiri::HTML(html).css("h1, h2, h3, h4, h5, h6")

  headings.map do |heading|
    tag = heading.name
    # The digit in the tag name is the heading level; h1 sits at depth 0.
    depth = tag[/\d/].to_i - 1
    "#{" " * depth}[#{tag}] #{heading.text}"
  end
end
# Specs for header_hierarchy.
#
# Every expected line has the form "[tag] text", prefixed with one space per
# heading level below h1 (h1 => none, h2 => one, h3 => two, h4 => three),
# which is what header_hierarchy's `" " * (level - 1)` indentation produces.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq(['[h1] Foo'])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar'
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      # Indentation grows by one space with each deeper heading level.
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar',
        '  [h3] Baz',
        '   [h4] Bam'
      ])
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      # A later h2 returns to one space; its h3 child is indented by two again.
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar',
        '  [h3] Baz',
        ' [h2] Bam',
        '  [h3] Ba'
      ])
    end
  end

  describe 'True parsing' do
    # NOTE(review): this example downloads the live jquery.com home page, so it
    # needs network access and its expected headings track whatever the site
    # currently serves -- confirm the list below is still accurate, or swap in
    # a saved HTML fixture.
    it 'can parse an entire document' do
      html = Net::HTTP.get(URI("https://jquery.com/"))
      expect(header_hierarchy(html)).to eq([
        " [h2] jQuery",
        "  [h3] Lightweight Footprint",
        "  [h3] CSS3 Compliant",
        "  [h3] Cross-Browser",
        " [h2] What is jQuery?",
        " [h2] Other Related Projects",
        "  [h3] Resources",
        " [h2] A Brief Look",
        "  [h3] DOM Traversal and Manipulation",
        "  [h3] Event Handling",
        "  [h3] Ajax",
        "  [h3] Books"
      ])
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment