Skip to content

Instantly share code, notes, and snippets.

@yevhene
Last active August 29, 2015 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yevhene/18521c128a2ef77ff218 to your computer and use it in GitHub Desktop.
Save yevhene/18521c128a2ef77ff218 to your computer and use it in GitHub Desktop.
Example crawler with cobweb
#!/usr/bin/env ruby
require 'optparse'
require 'ostruct'
# Global command-line settings shared by the rest of the script.
# Defaults: external/errors/dup output off, verb column and colors on.
Options = OpenStruct.new(
  external: false,
  errors: false,
  dup: false,
  verb: true,
  color: true
)

# Each row: short switch, long switch, help text, Options key it toggles.
# Note that the "--[no-]error" switch is stored under the :errors key.
option_parser = OptionParser.new
option_parser.banner = "Usage: run.sh root_url [options]"
[
  ["-x", "--[no-]external", "Show external", :external],
  ["-e", "--[no-]error", "Show error info", :errors],
  ["-d", "--[no-]dup", "Show duplicates", :dup],
  ["-v", "--[no-]verb", "Show verb", :verb],
  ["-c", "--[no-]color", "Show color", :color]
].each do |short, long, description, key|
  option_parser.on(short, long, description) { |flag| Options[key] = flag }
end

# parse! removes the recognised switches from ARGV, leaving positional args.
option_parser.parse!
# Third-party dependencies are declared inline so the script is self-contained.
# This runs after option parsing above and before any gem is required below.
require 'bundler/inline'
# The `true` argument is bundler/inline's `install` flag (see Bundler docs):
# gems are installed when the script runs.
gemfile(true) do
source 'https://rubygems.org'
gem 'cobweb'
gem 'colorize'
gem 'nokogiri'
gem 'activesupport'
gem 'virtus'
end
# Only the ActiveSupport extensions actually used below: blank?/present? and try.
require 'active_support/core_ext/object/blank'
require 'active_support/core_ext/object/try'
# Turn colorize output off when the user passed --no-color.
String.disable_colorization = !Options.color
# Minimal console reporter used by the crawler.
class Logger
  # Prints one report line to standard output.
  #
  # @param verb [#to_s] label printed in a right-aligned, colored column
  #   (only when Options.verb is truthy)
  # @param message [#to_s] text printed after the label, followed by a newline
  # @param color [Symbol] color name handed to String#colorize
  def self.log(verb, message, color)
    if Options.verb
      label = "#{verb} ".rjust(12)
      print label.colorize(color)
    end
    puts message
  end
end
# A discovered endpoint: an HTTP method name plus a URL/path string.
class Endpoint
  include Virtus.model

  # NOTE: the `method` attribute reader shadows Object#method on instances,
  # which is why the rest of this class can call `method` without arguments.
  attribute :method, String, default: 'GET'
  attribute :url, String

  # Color used when this endpoint is logged, chosen by HTTP method.
  #
  # @return [Symbol] a color name passed on to Logger.log
  def color
    case method
    when 'GET' then :light_green
    when 'POST' then :light_blue
    when 'PUT', 'PATCH' then :light_yellow
    when 'DELETE' then :light_red
    else :light_white
    end
  end

  # Two endpoints are equal when both method and url match.
  #
  # The type guard matters: on a non-Endpoint object `other.method` resolves
  # to Object#method, which requires an argument and would raise
  # ArgumentError instead of answering false.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Endpoint) && other.method == method && other.url == url
  end
  alias eql? ==

  # Hash consistent with #==, so equal endpoints collapse in Hash/Set/uniq.
  #
  # @return [Integer]
  def hash
    [method, url].hash
  end

  # Prints this endpoint through Logger, using the method as the verb label.
  def log
    Logger.log(method, url, color)
  end
end
# Crawls a site starting at a root URL and collects the unique endpoints
# (page paths and form targets) it finds on the same host.
class Site
  include Virtus.model

  attribute :root, URI
  attribute :endpoints, Array, default: []

  # @param root [String] root URL of the site; an empty path becomes '/'
  def initialize(root)
    uri = URI(root)
    uri.path = '/' if uri.path.blank?
    super(root: uri)
    # The root path itself is always the first registered (and logged) endpoint.
    add_endpoint uri.path
  end

  # Runs CobwebCrawler from the root URL and processes every page whose
  # mime type is exactly 'text/html'.
  def crawl
    CobwebCrawler.new(cache: 600).crawl(root.to_s) do |page|
      process_page(page) if page[:mime_type] == 'text/html'
    end
  end

  private

  # Resolves +url+ to a path on this site.
  #
  # @param url [String, nil] absolute or relative URL
  # @param parent [String, nil] URL of the page the link was found on
  # @return [String, nil] the resolved path (query and fragment are dropped
  #   because only URI#path is kept), or nil when +url+ is nil, points at a
  #   different host, or cannot be parsed/joined
  def normalize(url, parent = nil)
    return unless url

    uri = URI(url)
    unless uri.host.nil? || uri.host == root.host
      Logger.log('EXTERNAL', url, :light_red) if Options.external
      return
    end
    normalize_path(uri.path, parent)
  rescue StandardError => e
    # Best effort: a malformed link is reported (if requested) and skipped.
    Logger.log('WARNING', "#{url} #{e.message}", :yellow) if Options.errors
    nil
  end

  # Joins root, the optional parent and +path+ (nil parts are skipped) and
  # returns the path component of the result.
  def normalize_path(path, parent_path = nil)
    URI.join(*[root, parent_path, path].compact).path
  end

  # Registers the page's path as a GET endpoint and, only when that path was
  # not seen before, scans the page for forms.
  #
  # NOTE(review): the root path is already registered in #initialize, so if
  # the crawler yields the root page it is treated as a duplicate here and
  # its forms are never scanned - confirm whether that is intended.
  def process_page(page)
    process_forms(page) if add_endpoint(normalize(page[:url]))
  rescue StandardError => e
    Logger.log('ERROR', "#{page[:url]} #{e.message}", :red) if Options.errors
  end

  # Registers an endpoint for every <form> in the page body. Forms without
  # an action attribute are skipped (normalize returns nil for them).
  #
  # NOTE(review): a form without a method attribute is recorded as POST,
  # whereas the HTML default is GET - verify this is deliberate.
  def process_forms(page)
    doc = Nokogiri::HTML(page[:body])
    doc.css('form').each do |form|
      action = form.attributes['action'].try(:value)
      method = form.attributes['method'].try(:value).try(:upcase) || 'POST'
      add_endpoint(normalize(action, page[:url]), method)
    end
  end

  # Adds and logs a new endpoint.
  #
  # @param url [String, nil] normalized path; blank values are ignored
  # @param method [String] HTTP method name
  # @return [true, nil] true when the endpoint was new, nil otherwise
  def add_endpoint(url, method = 'GET')
    return unless url.present?

    endpoint = Endpoint.new(url: url, method: method)
    if endpoints.include?(endpoint)
      # Duplicates are governed by the --dup switch (previously this checked
      # the unrelated errors flag, leaving --dup without effect).
      # Options[:dup] is used instead of Options.dup because on older
      # OpenStruct versions `.dup` resolves to Object#dup (always truthy).
      Logger.log("DUP #{method}", url, :light_white) if Options[:dup]
      return
    end

    endpoints << endpoint
    endpoint.log
    true
  end
end
# Entry point. OptionParser#parse! has already removed the switches from ARGV,
# so the first remaining argument is the root URL.
# Without it URI(nil) would raise with a backtrace; show the usage line instead.
abort 'Usage: run.sh root_url [options]' if ARGV.empty?

site = Site.new(ARGV.first)
site.crawl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment