Skip to content

Instantly share code, notes, and snippets.

@rob-mcgrail
Created January 11, 2012 22:51
Show Gist options
  • Save rob-mcgrail/1597271 to your computer and use it in GitHub Desktop.
Save rob-mcgrail/1597271 to your computer and use it in GitHub Desktop.
Site Upgrade Comparisonator
require 'rubygems'
require 'anemone'
require 'redis'
require 'trollop'
require 'highline'
# Shared HighLine instance; the rest of the script uses it to colour console output.
$term = HighLine.new

# Command-line options, parsed by Trollop.
opts = Trollop::options do
  opt :redis, "Select redis store", :default => 1
  opt :compare, "Enter compare mode", :default => nil
  opt :flush, "Flush out data for a site", :default => nil
  opt :output, "Path for output report file", :default => "report-#{Time.now.strftime("%d-%m-%y-%s")}.txt"
  opt :list, "List sites stored in the database", :default => nil
  opt :transform, "Specify a site whose keyset needs to be transformed..."
end
#Trollop::die "I need a full path from which to start my crawl - like http://site.com" if ARGV.empty?

# Connect to Redis and switch to the database number chosen with --redis.
$redis = Redis.new
$redis.select opts[:redis].to_i

# Report destination and the in-memory buffer that rputs appends to.
@filename = opts[:output]
@report = ""

# Positional arguments: the site to work on and, for --transform, its new name.
site = ARGV[0]
transform_site = ARGV[1]

# File extensions used to build the crawler's skip-links pattern.
ext = %w(flv swf zip rar tar 7z gz jar js raw mp3 mp4 wav wmv ape aac ac3 wma aiff mpg mpeg avi mov ogg mkv mka asx asf mp2 m1v m3u f4v pdf doc xls ppt pps bin exe)
class String # put any desired page source normalizations as string#norm.
  # Normalizes page source so two fetches of the same page can be compared.
  #
  # Drops the first +skip_lines+ lines (445 by default, to exclude a common
  # variable element at the top of each page) and removes all whitespace
  # from what remains.
  #
  # skip_lines - number of leading lines to discard.
  #
  # Returns a new String; it is empty when the receiver has no more than
  # +skip_lines+ lines.
  def norm(skip_lines = 445)
    # each_line.drop replaces String#to_a (removed in Ruby 1.9) and yields an
    # empty list, rather than nil, for short input.
    remainder = each_line.drop(skip_lines).join
    remainder.gsub(/\s+/, "") # kill whitespace
  end
end
# Start from a clean slate: remove the per-run working sets for this site.
[
  "pageset:current:#{site}",
  "pageset:new:#{site}",
  "pageset:changed:#{site}"
].each { |working_set| $redis.del working_set }

# --list: print every member of the "crawls" set, one per line.
if opts[:list]
  puts "Listing sites with stored data:"
  $redis.smembers("crawls").each do |stored_site|
    puts stored_site
  end
end
# --transform: move everything stored for `site` over to `transform_site`.
# Page keys, page bodies, the "pageset:old" set and the "crawls" registry are
# all rewritten to use the new site name.
if opts[:transform]
  # Both names are needed to build the new keys; without them String#gsub
  # below would raise on a nil argument.
  if site.nil? || transform_site.nil?
    abort "Transform needs two arguments: the current site and the new site"
  end

  puts "Change all data for #{site} to #{transform_site}? Y/N"
  # gets returns nil at end of input, so coerce before chomping.
  if $stdin.gets.to_s.chomp.upcase == 'Y'
    $redis.sadd "crawls", site
    keys = $redis.smembers "pageset:old:#{site}"

    # Pass 1: rename each page key and rewrite site references in its body.
    keys.each do |k|
      new_key = k.gsub(site, transform_site)
      $redis.rename k, new_key
      puts $term.color("#{k} => #{new_key}", :red)
      s = $redis.get new_key
      $redis.set new_key, s.gsub(site, transform_site).gsub('preprod.', '') # add any source transforms here
      puts $term.color("Changing url references to #{transform_site}", :red)
    end

    # Pass 2: rename the key list itself, then swap each member for its new name.
    $redis.rename "pageset:old:#{site}", "pageset:old:#{transform_site}"
    keys.each do |k|
      $redis.srem "pageset:old:#{transform_site}", k
      $redis.sadd "pageset:old:#{transform_site}", k.gsub(site, transform_site)
    end

    # Register the new site name and retire the old one.
    $redis.sadd "crawls", transform_site
    $redis.srem "crawls", site
  end
end
# --flush: after confirmation, delete every stored page for `site`, the
# site's page sets, and its entry in the "crawls" registry.
if opts[:flush]
  puts "Flush all cached data for #{site}? Y/N"
  # gets returns nil at end of input, so coerce before chomping.
  if $stdin.gets.to_s.chomp.upcase == 'Y'
    # Each member of the "old" set is itself the key of a stored page body.
    $redis.smembers("pageset:old:#{site}").each do |key|
      puts $term.color("Deleting #{key}", :red)
      $redis.del key
    end

    puts $term.color("Deleting key list for #{site}...", :red)
    $redis.del "pageset:old:#{site}"
    $redis.del "pageset:current:#{site}"
    $redis.del "pageset:new:#{site}"
    $redis.del "pageset:changed:#{site}"
    $redis.srem "crawls", site
  end
end
# Compares a freshly crawled page with its stored copy and records the outcome.
#
# cached - the stored, normalized body for this page, or nil when none exists.
# new    - the crawled page; must respond to #body and #url.
# site   - site name used to build the Redis set keys.
#
# A page with no stored copy is added to "pageset:new:<site>"; a page whose
# normalized body differs is added to "pageset:changed:<site>".
def cachecheck(cached, new, site)
  unless cached
    puts $term.color("Not found", :red)
    return $redis.sadd("pageset:new:#{site}", new.url)
  end

  return puts($term.color('Identical', :green)) if cached == new.body.norm

  puts $term.color("Changed", :red)
  $redis.sadd "pageset:changed:#{site}", new.url
end
# Reports pages that were stored by an earlier crawl but not seen in this one.
#
# site - site name used to build the Redis set keys.
#
# The decision is based on the actual set difference between
# "pageset:old:<site>" and "pageset:current:<site>". Comparing only the set
# sizes would hide missing pages whenever the new crawl found at least as many
# pages as the old one. When nothing is missing, the "current" set is deleted.
def missingpages(site)
  missing = $redis.sdiff "pageset:old:#{site}", "pageset:current:#{site}"

  if missing.empty?
    puts $term.color('All pages found previously were checked', :green)
    $redis.del "pageset:current:#{site}"
  else
    rputs $term.color("The following links were found last time, but not this time:", :red)
    missing.each { |url| rputs url }
    puts $term.color("Check the pages above, or see report #{@filename}", :red)
  end
end
# Reports the pages recorded in "pageset:changed:<site>".
#
# site - site name used to build the Redis set key.
#
# When the set has members, a heading and each URL go through rputs (console
# and report buffer). Otherwise a confirmation is printed and the set key is
# deleted.
def changedpages(site)
  set_key = "pageset:changed:#{site}"

  if $redis.scard(set_key).zero?
    puts $term.color('No found pages have changed', :green)
    return $redis.del(set_key)
  end

  rputs $term.color("The following pages were found and have changed:", :red)
  $redis.smembers(set_key).each { |changed_url| rputs changed_url }
  puts $term.color("Check the pages above, or see report #{@filename}", :red)
end
# Reports the pages recorded in "pageset:new:<site>".
#
# site - site name used to build the Redis set key.
#
# When the set has members, a heading and each URL go through rputs (console
# and report buffer). Otherwise a confirmation is printed and the set key is
# deleted.
def newpages(site)
  fresh_key = "pageset:new:#{site}"
  total = $redis.scard fresh_key

  unless total > 0
    puts $term.color('No new pages were found.', :green)
    return $redis.del(fresh_key)
  end

  rputs $term.color("The following pages found this time didn't exist last time:", :red)
  fresh_urls = $redis.smembers fresh_key
  fresh_urls.each { |fresh_url| rputs fresh_url }
  puts $term.color("Check the pages above, or see report #{@filename}", :red)
end
# Prints a line to the console and also appends it, newline-terminated, to
# the @report buffer.
#
# arg - the text to emit (defaults to an empty line).
#
# Returns the @report buffer.
def rputs(arg='')
  $stdout.puts arg
  @report.concat(arg + "\n")
end
# Crawl section: runs only when a site was given and none of the maintenance
# options (--flush, --list, --transform) was selected.
unless opts[:flush] || opts[:list] || opts[:transform] || site.nil?
  $redis.sadd "crawls", site

  # The alternation is grouped so the leading dot and the end anchor apply to
  # every extension. Ungrouped, the dot bound only to the first extension and
  # the anchor only to the last, so any link merely containing e.g. "js" or
  # "doc" was skipped.
  skip_pattern = /\.(?:#{ext.join('|')})$/

  Anemone.crawl(site, :delay => 1) do |anemone|
    anemone.skip_links_like skip_pattern
    anemone.on_every_page do |page|
      if opts[:compare]
        # Compare mode: note the page as seen and check it against the stored copy.
        puts "Looking for pre-stored #{page.url}"
        $redis.sadd "pageset:current:#{site}", page.url
        cached = $redis.get page.url
        cachecheck(cached, page, site)
      else
        # Store mode: save the normalized body under the page URL.
        puts "Storing #{page.url}"
        $redis.sadd "pageset:old:#{site}", page.url
        $redis.set page.url, page.body.norm
      end
    end
  end

  if opts[:compare]
    puts; puts $term.color("Results:", :blue); puts
    missingpages(site)
    newpages(site)
    changedpages(site)
    # The report functions only fill @report when they have something to list.
    unless @report.empty?
      puts; puts $term.color("Writing output file #{@filename}", :blue); puts
      File.open(@filename, 'w+') { |f| f.write(@report) }
    end
  end

  # Remove the per-run working sets; the stored "old" data is kept.
  print "Tidying up"
  $redis.del "pageset:new:#{site}"
  print '.'
  $redis.del "pageset:changed:#{site}"
  print '.'
  $redis.del "pageset:current:#{site}"
  puts '.'
end
puts $term.color("Complete", :yellow)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment