Skip to content

Instantly share code, notes, and snippets.

@jcf
Created December 10, 2008 22:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcf/34525 to your computer and use it in GitHub Desktop.
Save jcf/34525 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby -wKU
require "rubygems"
require "hpricot"
# Mirrors a web site with wget and post-processes the downloaded HTML pages,
# rewriting the href of every stylesheet <link> in each page's <head>.
#
# Example:
#   Crawler.new('example.com', '/usr/bin/wget').wget
class Crawler
  # site_uri  - site to mirror; also used as the name of the directory (relative
  #             to the current working directory) that holds the mirrored pages.
  # wget_path - path to the wget executable. Pass '' (or nil) to have it looked
  #             up on $PATH.
  #
  # Raises RuntimeError when wget_path is blank and wget is not on $PATH.
  def initialize(site_uri='theonlyjames.com',
                 wget_path='/usr/local/bin/wget')
    @site_uri = site_uri
    @wget_path = wget_path
    sanity_check
  end

  # Makes sure a wget path is available. An explicitly given path is trusted
  # as-is; a blank one is replaced by the first wget found on $PATH, so that
  # #wget never ends up running an empty command.
  #
  # Raises RuntimeError when the path is blank and wget cannot be found.
  def sanity_check
    return unless @wget_path.to_s.empty?

    located = locate_wget
    if located.nil?
      raise "wget not in $PATH. Please specify path to wget manually.\n\nCrawler.new('site_uri', '/path/to/wget')"
    end
    @wget_path = located
  end

  # Mirrors and post-processes each of the given sites in turn.
  # Returns site_uris.
  def wget_sites(site_uris=[])
    site_uris.each { |s| wget(s) }
  end

  # Recursively downloads +s+ with link conversion (wget -rk) and then fixes
  # the stylesheet links of the pages that were fetched.
  #
  # Returns whatever #fix_php_css returns.
  def wget(s=@site_uri)
    # The log line shows the flags that are actually passed below.
    $stdout.puts("Running #{@wget_path} -rk '#{s}'")
    # Argument-list form: no shell is involved, so quotes or other shell
    # metacharacters in +s+ cannot break or hijack the command.
    launched = system(@wget_path, '-rk', s)
    warn "Could not execute #{@wget_path}" if launched.nil?
    # Post-process the site that was just fetched rather than always the
    # default one. Assumes wget mirrored into ./<s>/, which holds for a bare
    # host name; a full URL would need its host extracted -- TODO confirm.
    fix_php_css(s)
  end

  # Rewrites the stylesheet links of every HTML page found directly inside
  # ./<site_uri>/ (defaults to the site given to the constructor).
  #
  # NOTE(review): the new href is still the placeholder "new_href.css" and the
  # modified document is never written back to disk, so this currently has no
  # lasting effect on the mirrored files.
  #
  # Returns the Array of page paths that were parsed.
  def fix_php_css(site_uri=@site_uri)
    # Only regular files whose *name* contains a literal ".html". The previous
    # /.html/ pattern had an unescaped dot and was matched against the whole
    # path, so e.g. "xhtml.txt" (or anything below a directory whose name
    # matched) was picked up too.
    html_files = Dir.glob("./#{site_uri}/*").select do |path|
      File.file?(path) && File.basename(path).include?('.html')
    end

    html_files.each do |path|
      puts "Parsing #{File.basename(path)}"
      doc = Hpricot(File.read(path))
      # Disabled draft of the real href transformation:
      # (doc/'/html/head/link[@rel=stylesheet]').each do |link|
      # puts "\t#{link['href'].gsub('.php', '_php').gsub('?', '_').gsub('&', '-')}"
      # end
      # Set the attribute on the whole result collection in one call.
      (doc / '/html/head/link[@rel=stylesheet]').set('href', 'new_href.css')
    end
  end

  private

  # Returns the path of the first executable regular file named "wget" found
  # in the directories listed in $PATH, or nil when there is none.
  def locate_wget
    ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |dir|
      next if dir.empty?

      candidate = File.join(dir, 'wget')
      return candidate if File.file?(candidate) && File.executable?(candidate)
    end
    nil
  end
end
# Script entry point: build a crawler with the default site and wget path and
# run only the stylesheet fix-up on pages that are already on disk.
crawler = Crawler.new
crawler.fix_php_css
# Left disabled: this would mirror the site first and then run the same fix-up.
# crawler.wget
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment