Created
January 11, 2012 22:51
-
-
Save rob-mcgrail/1597271 to your computer and use it in GitHub Desktop.
Site Upgrade Comparisonator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'anemone'
require 'redis'
require 'trollop'
require 'highline'

# Terminal helper used for coloured status output throughout the script.
$term = HighLine.new

# Command-line options. Positional arguments left in ARGV after parsing are
# read below: ARGV[0] is the site, ARGV[1] the target site for --transform.
opts = Trollop::options do
  opt :redis, "Select redis store", :default => 1
  opt :compare, "Enter compare mode", :default => nil
  opt :flush, "Flush out data for a site", :default => nil
  opt :output, "Path for output report file", :default => "report-#{Time.now.strftime("%d-%m-%y-%s")}.txt"
  opt :list, "List sites stored in the database", :default => nil
  opt :transform, "Specify a site whose keyset needs to be transformed..."
end

#Trollop::die "I need a full path from which to start my crawl - like http://site.com" if ARGV.empty?

# Redis connection, switched to the database number chosen with --redis.
$redis = Redis.new
$redis.select opts[:redis].to_i

# Report state shared with the top-level reporting helpers: the output path
# and the text accumulated by rputs.
@filename = opts[:output]
@report = ""

site = ARGV[0]
transform_site = ARGV[1]

# File extensions whose links the crawler should not follow.
ext = %w(flv swf zip rar tar 7z gz jar js raw mp3 mp4 wav wmv ape aac ac3 wma aiff mpg mpeg avi mov ogg mkv mka asx asf mp2 m1v m3u f4v pdf doc xls ppt pps bin exe)
class String # put any desired page source normalizations as string#norm.
  # Normalizes page source so two fetches of the same page compare equal.
  #
  # Drops the leading lines (which hold a common variable element) and then
  # removes all whitespace from what is left.
  #
  # @param skip_lines [Integer] number of leading lines to discard
  # @return [String] the normalized source; "" when the string has no more
  #   than +skip_lines+ lines
  def norm(skip_lines = 445)
    # each_line replaces String#to_a, which no longer exists in Ruby 1.9+;
    # drop returns [] for short input where [445..-1] returned nil.
    each_line.drop(skip_lines).join.gsub(/\s+/, "") # kill whitespace
  end
end
# Start from a clean slate: drop the per-run working sets left over from a
# previous (possibly interrupted) run. Skipped when no site was given, since
# the key names would then carry an empty site.
if site
  $redis.del "pageset:current:#{site}", "pageset:new:#{site}", "pageset:changed:#{site}"
end
# --list: print every site name recorded in the "crawls" set.
if opts[:list]
  puts "Listing sites with stored data:"
  $redis.smembers("crawls").each { |name| puts name }
end
# --transform: re-key everything stored for `site` under `transform_site`,
# rewriting references to the old site inside each stored page source.
if opts[:transform]
  unless site && transform_site
    abort "--transform needs two arguments: the stored site and the site to rename it to"
  end

  puts "Change all data for #{site} to #{transform_site}? Y/N"
  # to_s covers EOF on stdin, where gets returns nil.
  if $stdin.gets.to_s.chomp.upcase == 'Y'
    old_set = "pageset:old:#{site}"
    new_set = "pageset:old:#{transform_site}"
    keys = $redis.smembers old_set

    keys.each do |key|
      new_key = key.gsub(site, transform_site)
      source = $redis.get key
      if source.nil?
        # The key is listed in the set but has no stored page; nothing to move.
        puts $term.color("No stored page for #{key}, skipping", :red)
        next
      end
      puts $term.color("#{key} => #{new_key}", :red)
      $redis.set new_key, source.gsub(site, transform_site).gsub('preprod.', '') # add any source transforms here
      # Only remove the old key when the name actually changed.
      $redis.del key unless new_key == key
      puts $term.color("Changing url references to #{transform_site}", :red)
    end

    # RENAME raises when the source key does not exist, so only move a set
    # that has members, and never rename a key onto itself.
    $redis.rename old_set, new_set unless keys.empty? || old_set == new_set
    keys.each do |key|
      $redis.srem new_set, key
      $redis.sadd new_set, key.gsub(site, transform_site)
    end

    $redis.sadd "crawls", transform_site
    # Guard against removing the entry just added when both names are equal.
    $redis.srem "crawls", site unless site == transform_site
  end
end
# --flush: after confirmation, delete every stored page for `site`, its key
# sets, and its entry in the "crawls" set.
if opts[:flush]
  abort "--flush needs the site whose data should be removed" unless site

  puts "Flush all cached data for #{site}? Y/N"
  # to_s covers EOF on stdin, where gets returns nil.
  if $stdin.gets.to_s.chomp.upcase == 'Y'
    $redis.smembers("pageset:old:#{site}").each do |key|
      puts $term.color("Deleting #{key}", :red)
      $redis.del key
    end
    puts $term.color("Deleting key list for #{site}...", :red)
    %w(old current new changed).each do |set_name|
      $redis.del "pageset:#{set_name}:#{site}"
    end
    $redis.srem "crawls", site
  end
end
# Compares a freshly crawled page against its stored source and records the
# outcome in Redis.
#
# @param cached [String, nil] stored normalized source, or nil when nothing
#   was stored for this page
# @param new [#body, #url] the crawled page; its body is normalized with
#   String#norm before comparing
# @param site [String] site name used to build the result set keys
#
# A page with no stored source is added to "pageset:new:<site>"; a page whose
# normalized body differs is added to "pageset:changed:<site>".
def cachecheck(cached, new, site)
  unless cached
    puts $term.color("Not found", :red)
    $redis.sadd "pageset:new:#{site}", new.url
    return
  end

  if cached == new.body.norm
    puts $term.color('Identical', :green)
  else
    puts $term.color("Changed", :red)
    $redis.sadd "pageset:changed:#{site}", new.url
  end
end
# Reports pages that were stored on an earlier crawl but not seen this time.
#
# The decision is made on the set difference itself rather than on set sizes:
# a missing page must still be reported when the crawl also found new pages
# that keep the two sets the same size.
#
# @param site [String] site name used to build the set keys
#
# Missing URLs go to the report via rputs. When nothing is missing the
# "pageset:current:<site>" set is deleted.
def missingpages(site)
  current_set = "pageset:current:#{site}"
  missing = $redis.sdiff "pageset:old:#{site}", current_set

  if missing.empty?
    puts $term.color('All pages found previously were checked', :green)
    $redis.del current_set
  else
    rputs $term.color("The following links were found last time, but not this time:", :red)
    missing.each { |url| rputs url }
    puts $term.color("Check the pages above, or see report #{@filename}", :red)
  end
end
# Reports pages recorded in "pageset:changed:<site>", i.e. pages that were
# found in the store but whose source differs.
#
# @param site [String] site name used to build the set key
#
# Changed URLs go to the report via rputs. When there are none the set is
# deleted.
def changedpages(site)
  changed_set = "pageset:changed:#{site}"
  # One SMEMBERS call replaces the separate SCARD + SMEMBERS pair.
  links = $redis.smembers changed_set

  if links.empty?
    puts $term.color('No found pages have changed', :green)
    $redis.del changed_set
  else
    rputs $term.color("The following pages were found and have changed:", :red)
    links.each { |url| rputs url }
    puts $term.color("Check the pages above, or see report #{@filename}", :red)
  end
end
# Reports pages recorded in "pageset:new:<site>", i.e. pages crawled this
# time for which nothing was stored.
#
# @param site [String] site name used to build the set key
#
# New URLs go to the report via rputs. When there are none the set is
# deleted.
def newpages(site)
  new_set = "pageset:new:#{site}"
  # One SMEMBERS call replaces the separate SCARD + SMEMBERS pair.
  links = $redis.smembers new_set

  if links.empty?
    puts $term.color('No new pages were found.', :green)
    $redis.del new_set
  else
    rputs $term.color("The following pages found this time didn't exist last time:", :red)
    links.each { |url| rputs url }
    puts $term.color("Check the pages above, or see report #{@filename}", :red)
  end
end
# Prints a line to stdout and appends the same line to the in-memory report.
#
# @param arg [#to_s] text to print and record; defaults to an empty line
# @return [String] the accumulated report text
def rputs(arg='')
  puts arg
  # Interpolation accepts non-String arguments, which `arg + "\n"` rejected;
  # the buffer is created on first use in case it was never initialized.
  (@report ||= "") << "#{arg}\n"
end
# Main crawl. Runs only when a site was given and no maintenance option
# (--flush, --list, --transform) was selected.
#
# Without --compare every crawled page is stored under its URL and listed in
# "pageset:old:<site>". With --compare each page is checked against the store
# and the missing / new / changed reports are produced afterwards.
if site && !(opts[:flush] || opts[:list] || opts[:transform])
  $redis.sadd "crawls", site

  # The alternation must be grouped: without the group the pattern read as
  # "\.flv" OR "swf" OR ... OR "exe$", so any link merely containing "doc",
  # "js", "bin" etc. was skipped.
  skip_pattern = /\.(?:#{ext.join('|')})\z/

  Anemone.crawl(site, :delay => 1) do |anemone|
    anemone.skip_links_like skip_pattern
    anemone.on_every_page do |page|
      # NOTE(review): Anemone appears to yield pages whose fetch failed with a
      # nil body, which would crash String#norm -- confirm against Anemone::Page.
      if page.body.nil?
        puts $term.color("Skipping #{page.url} (no body returned)", :red)
        next
      end

      if opts[:compare]
        puts "Looking for pre-stored #{page.url}"
        $redis.sadd "pageset:current:#{site}", page.url
        cachecheck($redis.get(page.url), page, site)
      else
        puts "Storing #{page.url}"
        $redis.sadd "pageset:old:#{site}", page.url
        $redis.set page.url, page.body.norm
      end
    end
  end

  if opts[:compare]
    puts; puts $term.color("Results:", :blue); puts
    missingpages(site)
    newpages(site)
    changedpages(site)
    unless @report.empty?
      puts; puts $term.color("Writing output file #{@filename}", :blue); puts
      # Write-only mode is enough; the report is never read back.
      File.open(@filename, 'w') { |f| f.write(@report) }
    end
  end

  # Remove the per-run working sets, printing one dot per deleted set.
  print "Tidying up"
  %w(new changed).each do |set_name|
    $redis.del "pageset:#{set_name}:#{site}"
    print '.'
  end
  $redis.del "pageset:current:#{site}"
  puts '.'
end
puts $term.color("Complete", :yellow)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment