Last active
August 29, 2015 14:27
-
-
Save yevhene/18521c128a2ef77ff218 to your computer and use it in GitHub Desktop.
Example crawler with cobweb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'optparse' | |
require 'ostruct' | |
# Global run-time flags. The defaults below are overwritten by the
# command-line switches parsed right after.
Options = OpenStruct.new(
  external: false, # -x  "Show external"
  errors: false,   # -e  "Show error info"
  dup: false,      # -d  "Show duplicates"
  verb: true,      # -v  "Show verb"
  color: true      # -c  "Show color"
)

# Every switch is negatable (`--no-...`). `parse!` removes the switches it
# recognises from ARGV, leaving the positional root_url in place.
#
# Flags are written through `Options[:name] =` rather than `Options.name =`
# because one of them (`dup`) shares its name with Object#dup; going through
# `[]=` never depends on how OpenStruct resolves that name clash.
OptionParser.new do |opts|
  opts.banner = "Usage: run.sh root_url [options]"

  opts.on("-x", "--[no-]external", "Show external") { |flag| Options[:external] = flag }
  opts.on("-e", "--[no-]error", "Show error info") { |flag| Options[:errors] = flag }
  opts.on("-d", "--[no-]dup", "Show duplicates") { |flag| Options[:dup] = flag }
  opts.on("-v", "--[no-]verb", "Show verb") { |flag| Options[:verb] = flag }
  opts.on("-c", "--[no-]color", "Show color") { |flag| Options[:color] = flag }
end.parse!
# Declare the script's gem dependencies inline so no separate Gemfile is
# needed. The `true` argument is bundler/inline's `install` flag.
require 'bundler/inline'

gemfile(true) do
  source 'https://rubygems.org'

  gem 'cobweb'
  gem 'colorize'
  gem 'nokogiri'
  gem 'activesupport'
  gem 'virtus'
end

# Load only the ActiveSupport core extensions used below
# (`blank?` / `present?` and `try`).
require 'active_support/core_ext/object/blank'
require 'active_support/core_ext/object/try'

# Switch colorize output off globally when --no-color was given.
String.disable_colorization = !Options.color
# Console output helper: one line per event, optionally prefixed with a
# right-aligned, colorized verb column.
#
# NOTE(review): this class shares its name with Ruby's stdlib Logger; if a
# dependency has already loaded `logger`, this definition reopens that class
# instead of creating a new one -- confirm that is acceptable.
class Logger
  # Prints `message` followed by a newline. When Options.verb is truthy the
  # line is prefixed with "#{verb} " right-justified to 12 characters and
  # colorized with `color`.
  #
  # @param verb [String] label for the verb column (e.g. "GET", "WARNING")
  # @param message [String] text printed after the verb column
  # @param color [Symbol] colorize color name applied to the verb column
  # @return [nil]
  def self.log(verb, message, color)
    if Options.verb
      column = "#{verb} ".rjust(12)
      print column.colorize(color)
    end
    puts message
  end
end
# A single crawlable endpoint: an HTTP method plus a URL/path.
class Endpoint
  include Virtus.model

  # Verb -> colorize color used when the endpoint is logged.
  COLORS = {
    'GET' => :light_green,
    'POST' => :light_blue,
    'PUT' => :light_yellow,
    'PATCH' => :light_yellow,
    'DELETE' => :light_red
  }.freeze

  # NOTE: the `method` attribute replaces Object#method on instances of this
  # class, so `endpoint.method` returns the HTTP verb string.
  attribute :method, String, default: 'GET'
  attribute :url, String

  # @return [Symbol] color for this endpoint's verb; :light_white for verbs
  #   not listed in COLORS
  def color
    COLORS.fetch(method, :light_white)
  end

  # Two endpoints are equal when both their verb and URL match.
  #
  # The type guard matters: on any other object `other.method` is
  # Object#method, which requires an argument and would raise ArgumentError
  # instead of answering false.
  def ==(other)
    other.is_a?(Endpoint) && other.method == method && other.url == url
  end
  alias eql? ==

  # Kept consistent with #== so endpoints behave in Hash keys, Set and
  # Array#uniq.
  def hash
    [Endpoint, method, url].hash
  end

  # Prints this endpoint through Logger, using the verb as the label.
  def log
    Logger.log(method, url, color)
  end
end
# Crawls one site and collects the distinct endpoints (HTTP method + path)
# found on its HTML pages and in their forms. Each newly found endpoint is
# logged as it is recorded.
class Site
  include Virtus.model

  attribute :root, URI
  attribute :endpoints, Array, default: []

  # @param root [String] URL the crawl starts from; an empty path is
  #   replaced with '/'. The root path is recorded as the first endpoint.
  def initialize(root)
    uri = URI(root)
    uri.path = '/' if uri.path.blank?
    super(root: uri)
    add_endpoint uri.path
  end

  # Runs the cobweb crawler from the root URL and processes every page whose
  # mime type is exactly 'text/html'.
  def crawl
    CobwebCrawler.new(cache: 600).crawl(root.to_s) do |page|
      process_page(page) if page[:mime_type] == 'text/html'
    end
  end

  private

  # Turns `url` into a path on this site.
  #
  # @param url [String, nil] absolute or relative URL
  # @param parent [String, nil] URL that relative `url`s are resolved against
  # @return [String, nil] the resolved path; nil when `url` is nil, points to
  #   another host (logged when Options.external), or cannot be parsed or
  #   joined (logged when Options.errors)
  def normalize(url, parent = nil)
    return unless url

    uri = URI(url)
    if uri.host && uri.host != root.host
      Logger.log('EXTERNAL', url, :light_red) if Options.external
      return
    end

    normalize_path(uri.path, parent)
  rescue StandardError => e
    Logger.log('WARNING', "#{url} #{e.message}", :yellow) if Options.errors
    nil
  end

  # Resolves `path` against the root and, when given, `parent_path`, and
  # returns only the path component of the result.
  def normalize_path(path, parent_path = nil)
    URI.join(*[root, parent_path, path].compact).path
  end

  # Records the page itself as a GET endpoint and, only when it was not
  # already known, scans its forms. Failures are logged when Options.errors.
  def process_page(page)
    process_forms(page) if add_endpoint(normalize(page[:url]))
  rescue StandardError => e
    Logger.log('ERROR', "#{page[:url]} #{e.message}", :red) if Options.errors
  end

  # Records one endpoint per <form> on the page, using the form's `action`
  # (resolved against the page URL) and its upper-cased `method`.
  #
  # NOTE(review): a form without a `method` attribute is recorded as POST
  # here, while HTML's default submission method is GET -- confirm this
  # default is intended. Forms without an `action` are skipped because
  # normalize returns nil for them.
  def process_forms(page)
    doc = Nokogiri::HTML(page[:body])
    doc.css('form').each do |form|
      action = form.attributes['action'].try(:value)
      method = form.attributes['method'].try(:value).try(:upcase) || 'POST'
      add_endpoint(normalize(action, page[:url]), method)
    end
  end

  # Adds the endpoint unless it is blank or already recorded.
  #
  # @return [true, nil] true when a new endpoint was recorded and logged,
  #   nil otherwise
  def add_endpoint(url, method = 'GET')
    return unless url.present?

    endpoint = Endpoint.new(url: url, method: method)
    if endpoints.include?(endpoint)
      # Duplicates are reported only when -d/--dup was requested. The flag is
      # read through `[]` because `Options.dup` can resolve to Object#dup
      # (a truthy copy of the struct) on OpenStruct versions that do not let
      # attributes override built-in methods.
      Logger.log("DUP #{method}", url, :light_white) if Options[:dup]
      return
    end

    endpoints << endpoint
    endpoint.log
    true
  end
end
# Entry point. OptionParser#parse! has already removed the switches from
# ARGV, so the first remaining argument is the root URL. Without one,
# URI(nil) would raise ArgumentError with a backtrace; print the usage line
# and exit non-zero instead.
root_url = ARGV.first
abort 'Usage: run.sh root_url [options]' if root_url.nil?

site = Site.new(root_url)
site.crawl
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment