Skip to content

Instantly share code, notes, and snippets.

@baweaver
Created June 4, 2018 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save baweaver/3cbfd144b525ff32c1f013774a5bfac7 to your computer and use it in GitHub Desktop.
Save baweaver/3cbfd144b525ff32c1f013774a5bfac7 to your computer and use it in GitHub Desktop.
# Exercise stub: pull the headers out of a block of text and arrange them
# hierarchically.
#
# The accompanying specs describe the expected output in more detail.
#
# @param html [String] the text to scan for headers
# @raise [RuntimeError] always, with the message "TODO", until implemented
def header_hierarchy(html)
  raise RuntimeError, 'TODO'
end
# Specification for #header_hierarchy.
#
# Each example passes an HTML string and expects one output line per header,
# formatted as "[tag] text". A header nested beneath another is indented two
# spaces further than its parent, so the nesting is visible in the output.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      expect(header_hierarchy("<h1>Foo</h1>")).to eq(['[h1] Foo'])
    end

    it 'can extract one nested level of header' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar'
      ])
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '      [h4] Bam'
      ])
    end
  end

  context 'HARD' do
    # A later, shallower header (the second h2) closes the previous branch and
    # starts a new one at its own depth.
    it 'can extract multiple nested headers in multiple branches' do
      expect(
        header_hierarchy("<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>")
      ).to eq([
        '[h1] Foo',
        '  [h2] Bar',
        '    [h3] Baz',
        '  [h2] Bam',
        '    [h3] Ba'
      ])
    end
  end
end
@baweaver
Copy link
Author

baweaver commented Jun 6, 2018

My solution:

EDIT: I modified the tests to reflect the 'ROOT' element. I'm going to leave the original specification alone for a bit.

require 'nokogiri'
require 'rspec/autorun'

# One node in a tree of headers.
#
# A node stores its display name, its tag name, a link to its parent node
# (nil when none was given) and the list of child nodes added to it.
class HeaderNode
  attr_reader :parent, :children

  # @param name [Object] text shown for this node when rendered
  # @param tag_name [Object] tag label for this node (e.g. 'h1')
  # @param parent [HeaderNode, nil] node this one hangs under, if any
  def initialize(name, tag_name, parent = nil)
    @name, @tag_name, @parent = name, tag_name, parent
    @children = []
  end

  # Whether a header with the given tag name belongs beneath this node.
  # Decided by comparing this node's tag name to the argument with `<`.
  #
  # @param tag_name [Object] tag name of the candidate child
  # @return [Boolean]
  def descendant?(tag_name)
    @tag_name < tag_name
  end

  # Creates a new node parented to this one, appends it to the children
  # list, and returns the new node.
  #
  # @param node_name [Object] display name for the child
  # @param tag_name [Object] tag label for the child
  # @return [HeaderNode] the freshly created child
  def add_child(node_name, tag_name)
    child = HeaderNode.new(node_name, tag_name, self)
    @children << child
    child
  end

  # Recursively converts this node and everything below it into nested
  # hashes with 'name' and 'children' keys.
  #
  # @return [Hash]
  def to_h
    nested = @children.map { |child| child.to_h }
    { 'name' => @name, 'children' => nested }
  end

  # Renders this node and its subtree as text, one newline-terminated line
  # per node in the form "[tag] name". Children are rendered two spaces
  # deeper than their parent.
  #
  # @param indent_level [Integer] number of leading spaces for this node
  # @return [String]
  def to_s(indent_level = 0)
    pieces = ["#{' ' * indent_level}[#{@tag_name}] #{@name}\n"]
    @children.each { |child| pieces << child.to_s(indent_level + 2) }
    pieces.join
  end
end

# Builds a HeaderNode tree from the header tags found in an HTML fragment.
#
# The fragment is wrapped in an <html> element, parsed with Nokogiri, and
# queried with a CSS selector made by joining +header_levels+ with commas.
# Matching tags are attached, in the order the query returns them, beneath a
# synthetic root node named 'ROOT' with tag 'h0'.
#
# @param html_partial [String] markup to scan
# @param header_levels [Array<String>] tag names to treat as headers
# @return [HeaderNode] the synthetic root of the resulting tree
def header_extractor(html_partial, header_levels: %w(h1 h2 h3 h4 h5 h6))
  root_node = HeaderNode.new('ROOT', 'h0')

  document = Nokogiri("<html>#{html_partial}</html>")
  headers  = document.css(header_levels.join(', '))

  # The cursor tracks the most recently inserted node so the next header can
  # be placed relative to it.
  cursor = root_node
  headers.each do |header|
    # Walk back up until reaching a node the new header may sit beneath.
    cursor = cursor.parent until cursor.descendant?(header.name)
    cursor = cursor.add_child(header.text, header.name)
  end

  root_node
end

# Extracts the headers from an HTML fragment and returns the rendered tree as
# an array of lines.
#
# The tree built by header_extractor is rendered with #to_s and split on
# newline characters, so the first line is the synthetic root node.
#
# @param html_partial [String] markup to scan
# @return [Array<String>] one entry per rendered line
def header_hierarchy(html_partial)
  tree     = header_extractor(html_partial)
  rendered = tree.to_s
  rendered.split("\n")
end

# Specs for the solution's #header_hierarchy. Every expected result begins
# with the synthetic "[h0] ROOT" line, and each nested level is indented two
# spaces further than its parent.
describe '#header_hierarchy' do
  context 'EASY' do
    it 'can extract a single header' do
      html     = '<h1>Foo</h1>'
      expected = [
        '[h0] ROOT',
        '  [h1] Foo'
      ]

      expect(header_hierarchy(html)).to eq(expected)
    end

    it 'can extract one nested level of header' do
      html     = '<h1>Foo</h1><h2>Bar</h2>'
      expected = [
        '[h0] ROOT',
        '  [h1] Foo',
        '    [h2] Bar'
      ]

      expect(header_hierarchy(html)).to eq(expected)
    end
  end

  context 'MEDIUM' do
    it 'can extract multiple levels of nested headers' do
      html     = '<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h4>Bam</h4>'
      expected = [
        '[h0] ROOT',
        '  [h1] Foo',
        '    [h2] Bar',
        '      [h3] Baz',
        '        [h4] Bam'
      ]

      expect(header_hierarchy(html)).to eq(expected)
    end
  end

  context 'HARD' do
    it 'can extract multiple nested headers in multiple branches' do
      html     = '<h1>Foo</h1><h2>Bar</h2><h3>Baz</h3><h2>Bam</h2><h3>Ba</h3>'
      expected = [
        '[h0] ROOT',
        '  [h1] Foo',
        '    [h2] Bar',
        '      [h3] Baz',
        '    [h2] Bam',
        '      [h3] Ba'
      ]

      expect(header_hierarchy(html)).to eq(expected)
    end
  end

  context 'LIVE' do
    # NOTE: this example fetches https://jquery.com/ over the network, so its
    # outcome depends on connectivity and on the page's current headers.
    it 'can parse an entire document' do
      require 'net/http'
      uri  = URI('https://jquery.com/')
      html = Net::HTTP.get(uri)

      expected = [
        '[h0] ROOT',
        '  [h2] jQuery',
        '    [h3] Lightweight Footprint',
        '    [h3] CSS3 Compliant',
        '    [h3] Cross-Browser',
        '  [h2] What is jQuery?',
        '  [h2] Other Related Projects',
        '    [h3] Resources',
        '  [h2] A Brief Look',
        '    [h3] DOM Traversal and Manipulation',
        '    [h3] Event Handling',
        '    [h3] Ajax',
        '    [h3] Books'
      ]

      expect(header_hierarchy(html)).to eq(expected)
    end
  end
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment