Skip to content

Instantly share code, notes, and snippets.

@swombat
Created June 1, 2016 21:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swombat/932dec51c8b7704365abbebc405f8fd8 to your computer and use it in GitHub Desktop.
Save swombat/932dec51c8b7704365abbebc405f8fd8 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'nokogiri'
require 'open-uri'
# Recursively harvests HTML fragments from a set of linked pages.
#
# Starting from the page passed to #fetch, the harvester extracts the markup
# matched by +:content_selector+, appends it to #output, and then follows the
# links matched by +:link_selector+. Content found on deeper pages has its
# heading tags demoted so nested pages read as sub-sections.
#
# Pages are parsed with Nokogiri and downloaded with open-uri; both must be
# required by the file that loads this class.
class Harvester
  # Matches the name of an opening or closing h1..h7 tag, and nothing else.
  # The lookahead keeps look-alikes such as "<h1x" from matching, and anchoring
  # on "<" / "</" leaves attribute values and text ("h1.html", "sha1") alone.
  HEADING_TAG = %r{(</?)h([1-7])(?=[\s>/])}.freeze

  # output        - Array of harvested HTML strings, in fetch order.
  # links_fetched - Array of filenames that have been fetched so far.
  attr_accessor :output, :links_fetched

  # @param args [Hash] overrides for the default options:
  #   :content_selector - CSS selector of the content to collect.
  #   :content_filter   - CSS selector, or Array of selectors, whose matching
  #                       nodes are removed before content is collected.
  #   :link_selector    - CSS selector of the links to follow; when blank no
  #                       links are followed.
  #   :max_depth        - how many levels of pages to fetch (1 = only the
  #                       page passed to #fetch); nil means unlimited.
  #   :url_root         - prefix prepended to every filename before download.
  #   :filter           - pattern a filename must match to be fetched.
  def initialize(args)
    super()
    @options = {
      :content_selector => "",
      :content_filter => "",
      :link_selector => {},
      :max_depth => 1,
      :url_root => "",
      :filter => /.*/
    }.merge(args)
    @output = []
    @links_fetched = []
    @depth = 0
  end

  # Downloads and harvests +filename+, then recurses into the pages it links
  # to. A filename is skipped (with a "Skipping" message) when it is nil, does
  # not match +:filter+, has already been fetched, or lies beyond +:max_depth+.
  #
  # Results are accumulated in #output; the return value is not meaningful.
  #
  # @param filename [String] path appended to +:url_root+ to build the URL.
  def fetch(filename)
    unless fetchable?(filename)
      puts "Skipping #{filename}"
      return
    end

    # Record before downloading so that cyclic links (A -> B -> A) terminate.
    @links_fetched << filename
    puts "Fetching #{filename}"
    parse_page(get("#{@options[:url_root]}#{filename}"))
  end

  private

  # Decides whether +filename+ should be downloaded; see #fetch for the rules.
  def fetchable?(filename)
    return false if filename.nil?
    return false unless @options[:filter].match(filename)
    return false if @links_fetched.include?(filename)

    limit = @options[:max_depth]
    # @depth is the level of the page currently being parsed (0 at top level),
    # so the page about to be fetched would sit at level @depth + 1.
    limit.nil? || @depth < limit
  end

  # Collects the content of +doc+ into #output and follows its links.
  def parse_page(doc)
    @depth += 1

    # Strip unwanted nodes first so they never reach the collected content.
    content_filters.each do |selector|
      doc.css(selector).each { |node| node.remove }
    end
    content = doc.css(@options[:content_selector])
    @output << depthize(content.inner_html, @depth)

    link_selector = @options[:link_selector]
    unless blank?(link_selector)
      doc.css(link_selector).each do |link|
        fetch(link.attr("href"))
      end
    end
  ensure
    # Restore the level even when a download or parse raises, so a caller
    # that rescues the error can keep using this harvester.
    @depth -= 1
  end

  # The configured content filters as a list of non-blank selectors. Accepts
  # nil, a single selector String, or an Array of selectors.
  def content_filters
    Array(@options[:content_filter]).reject { |selector| blank?(selector) }
  end

  # True for nil and for anything that reports itself empty ("", [], {}).
  def blank?(value)
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end

  # Downloads +url+ and parses it as HTML.
  def get(url)
    # URI.open (rather than Kernel#open) works on Ruby 3+ and cannot be
    # tricked into spawning a subprocess by input starting with "|".
    Nokogiri::HTML(URI.open(url))
  end

  # Demotes the headings of +content+ once for every level below the second,
  # so pages at depth 1 and 2 are left untouched.
  def depthize(content, depth)
    (depth - 2).times.reduce(content) { |html, _| shift_headers(html) }
  end

  # Demotes every heading tag in +content+ by one level: h1 -> h2 ... h6 -> h7,
  # and h7 -> p. Only tag names are rewritten; attribute values and text that
  # happen to contain "h1".."h7" are preserved.
  def shift_headers(content)
    content.gsub(HEADING_TAG) do
      opener = Regexp.last_match(1)
      level = Regexp.last_match(2).to_i
      level == 7 ? "#{opener}p" : "#{opener}h#{level + 1}"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment