-
-
Save JoshCheek/b685aaafcb8a0ab6619b9893f576a73a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The goal of this problem is to extract headers from a block of text,
# and arrange them hierarchically.
#
# See the specs for more detail on the output.
require 'net/http'
require 'nokogiri'
# Lists every heading element (h1 through h6) found in +html+ as one string
# per heading, indented according to the heading's level, in the order
# Nokogiri's CSS search returns them.
#
# Heading levels: https://www.w3.org/MarkUp/html3/headings.html
#
# NOTE(review): the default indent is the single space that appears in this
# copy of the file; the copy's whitespace looks collapsed, so confirm the
# intended width against the specs.
#
# @param html markup handed straight to Nokogiri::HTML
# @param indent [String] text repeated once for each level below h1
# @return [Array<String>] entries shaped like "<indentation>[hN] <heading text>"
def header_hierarchy(html, indent: " ")
  Nokogiri::HTML(html).css("h1, h2, h3, h4, h5, h6").map do |header|
    # The digit in the tag name is the level ("h3" -> 3); h1 is the root,
    # so its depth is 0 and it gets no indentation.
    depth = header.name[/\d/].to_i.pred
    "#{indent * depth}[#{header.name}] #{header.text}"
  end
end
# Specs for header_hierarchy, grouped by difficulty.
#
# Expected strings follow the rule the method applies: one space of
# indentation for each heading level below h1 (h2 -> 1, h3 -> 2, h4 -> 3).
# NOTE(review): this copy of the file appears to have had its whitespace
# collapsed; if the method's indent unit is really wider, widen these
# expectations to match.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq(['[h1] Foo'])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar'
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar',
        '  [h3] Baz',
        '   [h4] Bam'
      ])
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        '[h1] Foo',
        ' [h2] Bar',
        '  [h3] Baz',
        ' [h2] Bam',
        '  [h3] Ba'
      ])
    end
  end

  describe 'True parsing' do
    # NOTE(review): this example downloads the live jquery.com page, so it
    # needs network access and will fail whenever the site's headings change.
    it 'can parse an entire document' do
      html = Net::HTTP.get(URI("https://jquery.com/"))
      expect(header_hierarchy(html)).to eq([
        " [h2] jQuery",
        "  [h3] Lightweight Footprint",
        "  [h3] CSS3 Compliant",
        "  [h3] Cross-Browser",
        " [h2] What is jQuery?",
        " [h2] Other Related Projects",
        "  [h3] Resources",
        " [h2] A Brief Look",
        "  [h3] DOM Traversal and Manipulation",
        "  [h3] Event Handling",
        "  [h3] Ajax",
        "  [h3] Books"
      ])
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment