egardner/README.md

## README.md

      
    Raw
  

              README.md
            
          
    HTML Unroller

This script is a basic "proof-of-concept" implementation of a parsing tool that
can "unroll" or flatten an HTML document into a simple JSON representation. It is
inspired by the ProseMirror document model.
There are two things which allow for the simplified output:

- We don't care about every HTML element, just a limited "whitelist".
- Text-level elements are represented as a linear sequence (in a real-world version,
  each text item would need some kind of attributes or properties field to indicate
  links, images, bold + italic, superscript/subscript, etc.).

Dependencies


Ruby (any recent version)
Nokogiri gem

Usage

To see some output, run this script with a path to an HTML document as an argument:
ruby parser.rb "book/OEBPS/Badi_9781781680308_epub_c04_r1.htm"

This will give you a giant string of JSON which can be pasted into a tool like this one
for inspection. Or load the script into a pry session to peek under the hood.

  
## parser.rb
require "nokogiri"
require "json"
require "securerandom"

module Parser
  # Names of the HTML elements that are treated as blocks. An element whose
  # name appears here is wrapped in a Block; other elements are either
  # searched for nested blocks (by Chapter) or flattened to Text (by Block).
  # Frozen so the shared whitelist cannot be modified at runtime.
  BLOCK_TYPES = %w[h1 h2 h3 h4 h5 h6 p blockquote ul ol].freeze

  # Top-level container for one parsed document. Searches the document body
  # for whitelisted block elements and collects them, in traversal order,
  # into a flat sequence.
  class Chapter
    attr_reader :sequence

    # doc - an object responding to `html.body`; the CLI passes the result
    #       of Nokogiri::Slop.
    def initialize(doc)
      @sequence = []
      parse_block(doc.html.body)
    end

    # Returns a Hash with a single :sequence key holding every collected
    # block converted through its own #to_h.
    def to_h
      { sequence: sequence.map(&:to_h) }
    end

    private

    attr_writer :sequence

    # Depth-first search for block elements. A node named in BLOCK_TYPES is
    # wrapped in a Block and not searched any further from here; for any
    # other node the search continues into its element children.
    def parse_block(node)
      return sequence << Block.new(node) if BLOCK_TYPES.include?(node.name)

      node.element_children.each { |child| parse_block(child) }
    end
  end

  # A block-level element (one named in BLOCK_TYPES) together with its
  # flattened contents: nested blocks and runs of text, in child order.
  class Block
    attr_reader :tag, :sequence, :id

    # node - the element to wrap. Its name becomes the tag and its children
    #        are parsed immediately. Every Block gets a fresh random UUID.
    def initialize(node)
      @id = SecureRandom.uuid
      @tag = node.name
      @sequence = []
      parse_children(node)
    end

    # Returns a Hash with :id, :tag and :sequence, where :sequence holds each
    # entry converted through its own #to_h.
    def to_h
      { id: id, tag: tag, sequence: sequence.map(&:to_h) }
    end

    private

    attr_writer :sequence

    # Appends one entry per child that produces one; children that yield
    # nothing (see #entry_for) are skipped.
    def parse_children(node)
      node.children.each do |child|
        entry = entry_for(child)
        sequence << entry if entry
      end
    end

    # Maps a child node to the entry it contributes, or nil for none.
    # Node types are compared as raw integers: 1 is an element, 3 is text.
    # Any other type (including 2, attribute) contributes nothing.
    def entry_for(child)
      case child.type
      when 1
        # Whitelisted elements nest as Blocks; every other element is
        # collapsed to its text, tagged with the element's name.
        return Block.new(child) if BLOCK_TYPES.include?(child.name)

        Text.new(child.text, child.name) unless child.text.empty?
      when 3
        Text.new(child.text) unless child.text.empty?
      end
    end
  end

  # A run of text with a tag label and a random UUID. The tag defaults to
  # "text"; callers may pass another label such as an element name.
  class Text
    attr_reader :text, :tag, :id

    # text     - the text content to store.
    # tag_name - label stored as the tag (default: "text").
    def initialize(text, tag_name = "text")
      @id = SecureRandom.uuid
      @tag = tag_name
      @text = text
    end

    # Returns a Hash with :id, :tag and :text, in that order.
    def to_h
      { id: id, tag: tag, text: text }
    end
  end

  # Command-line front end: reads an HTML file and prints its unrolled
  # representation.
  class CLI
    # Never assigned anywhere in this class; kept so the public reader
    # remains available to existing callers.
    attr_reader :doc

    class << self
      # Parses the HTML file at file_path with Nokogiri::Slop, builds a
      # Chapter from it and writes the chapter's hash to standard output as
      # a JSON string.
      #
      # Earlier this printed the Hash directly, which emits Ruby's inspect
      # format rather than JSON and cannot be pasted into a JSON tool.
      #
      # file_path - path of the HTML document to read.
      #
      # Returns nil. File.open raises if the path cannot be opened.
      def parse(file_path)
        doc = File.open(file_path) { |f| Nokogiri::Slop(f) }
        puts JSON.generate(Chapter.new(doc).to_h)
      end
    end
  end
end

# Script entry point. Runs only when this file is executed directly, so that
# loading it from pry or another script does not trigger a parse. When no
# path is supplied, exits with a usage message instead of letting File.open
# fail on nil.
if $PROGRAM_NAME == __FILE__
  abort "Usage: ruby #{File.basename(__FILE__)} path/to/document.html" if ARGV.empty?

  Parser::CLI.parse(ARGV[0])
end
	require "nokogiri"
	require "json"
	require "securerandom"

	# Flattens ("unrolls") an HTML document into nested hashes made of blocks
	# and text runs. This copy had its block-parameter pipes escaped as `\|`,
	# which is not valid Ruby; they are restored here.
	module Parser
	  # Names of the HTML elements treated as blocks. Frozen so the shared
	  # whitelist cannot be modified at runtime.
	  BLOCK_TYPES = %w[h1 h2 h3 h4 h5 h6 p blockquote ul ol].freeze

	  # Top-level container: collects every whitelisted block element found
	  # under the document body into a flat sequence.
	  class Chapter
	    attr_reader :sequence

	    # doc - an object responding to `html.body`; the CLI passes the
	    #       result of Nokogiri::Slop.
	    def initialize(doc)
	      @sequence = []
	      parse_block doc.html.body
	    end

	    # Returns a Hash with a single :sequence key.
	    def to_h
	      { sequence: sequence.map { |i| i.to_h } }
	    end

	    private

	    attr_writer :sequence

	    # Wraps a whitelisted node in a Block; otherwise keeps searching
	    # through the node's element children.
	    def parse_block(node)
	      if BLOCK_TYPES.include?(node.name)
	        sequence << Block.new(node)
	      else
	        node.element_children.each { |e| parse_block(e) }
	      end
	    end
	  end

	  # A block-level element together with its flattened contents (nested
	  # blocks and text runs). Each Block gets a random UUID.
	  class Block
	    attr_reader :tag, :sequence, :id

	    def initialize(node)
	      @tag = node.name
	      @sequence = []
	      @id = SecureRandom.uuid
	      parse_children node
	    end

	    # Returns a Hash with :id, :tag and :sequence.
	    def to_h
	      {
	        id: id,
	        tag: tag,
	        sequence: sequence.map { |i| i.to_h }
	      }
	    end

	    private

	    attr_writer :sequence

	    # Node types are compared as raw integers: 1 is an element, 2 an
	    # attribute (ignored), 3 a text node. Other types are ignored.
	    def parse_children(node)
	      node.children.each do |child|
	        case child.type
	        when 1
	          # Whitelisted elements nest as Blocks; any other element is
	          # collapsed to its text, tagged with the element's name.
	          if BLOCK_TYPES.include?(child.name)
	            sequence << Block.new(child)
	          else
	            sequence << Text.new(child.text, child.name) unless child.text.empty?
	          end
	        when 2
	          # attr node
	        when 3
	          sequence << Text.new(child.text) unless child.text.empty?
	        end
	      end
	    end
	  end

	  # A run of text with a tag label (default "text") and a random UUID.
	  class Text
	    attr_reader :text, :tag, :id

	    def initialize(text, tag_name="text")
	      @text = text
	      @tag = tag_name
	      @id = SecureRandom.uuid
	    end

	    # Returns a Hash with :id, :tag and :text.
	    def to_h
	      {
	        id: id,
	        tag: tag,
	        text: text
	      }
	    end
	  end

	  # Command-line front end.
	  class CLI
	    # Never assigned in this class; kept for compatibility.
	    attr_reader :doc

	    class << self
	      # Parses the HTML file at file_path and prints the resulting
	      # chapter hash as a JSON string (previously it printed the Hash's
	      # inspect form, which is not JSON). Returns nil.
	      def parse(file_path)
	        doc = File.open(file_path) { |f| Nokogiri::Slop(f) }
	        chapter = Chapter.new(doc)
	        puts JSON.generate(chapter.to_h)
	      end
	    end
	  end
	end

	# Script entry point. Runs only when this file is executed directly, so
	# that loading it from pry or another script does not trigger a parse.
	# When no path is supplied, exits with a usage message instead of letting
	# File.open fail on nil.
	if $PROGRAM_NAME == __FILE__
	  abort "Usage: ruby #{File.basename(__FILE__)} path/to/document.html" if ARGV.empty?

	  Parser::CLI.parse(ARGV[0])
	end