@stilist
Created April 30, 2009 23:48
#!/usr/bin/env ruby -w
# This script checks if a given GeoCities page has been saved by the Archive
# Team. Its goal is to reduce duplication of effort by only grabbing new pages.
#
# Input: plaintext list of URLs to check
# Output: plaintext list of URLs not in the cache
#
# MIT license, stilist, 02009
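#
# Example invocation (filenames here are only illustrative):
#   ./listvscache.rb geocities_urls.txt uncached_urls.txt
# where geocities_urls.txt holds one URL per line, e.g.
#   http://www.geocities.com/example_user/index.html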
require 'cgi'
require 'socket'
require 'timeout'
unless ARGV[0] && ARGV[1]
  puts 'Usage: listvscache inputfile outputfile'
  exit(-1)
end
unless File.exist?(ARGV[0]) && File.ftype(ARGV[0]) == 'file'
  puts 'Please specify a valid input file'
  exit(-1)
end
$inputFile = File.open(ARGV[0], 'r')
$outputFile = File.new(ARGV[1], 'w')
# Archive Team duplicate-check service
$baseURL = 'www.geneb.org'
$extURL = '/cgi-bin/at_dupecheck.cgi?sitemode=geocities&url='
def dupeCheck(address)
  retries = 5
  begin
    address.gsub!('http://', '')
    result = Timeout::timeout(1) {
      s = TCPSocket.open($baseURL, 80)
      # "Connection: close" asks the server to close the socket after
      # responding, so s.read returns instead of waiting out the timeout.
      s.write("GET #{$extURL}./#{address} HTTP/1.1\r\nHost: #{$baseURL}\r\nConnection: close\r\n\r\n")
      response = s.read[-1,1].to_sym # only need a single character
      s.close
      response
    }
    # The dupe-check CGI replies with a single flag character:
    # N = not cached, Y = cached, X = invalid URL.
    case result
    when :N
      puts "#{address} is not cached"
      $outputFile.puts address
    when :Y
      puts "#{address} is cached"
    when :X
      puts "#{address} is invalid"
    else
      puts "Bad URL: #{address}"
    end
  rescue Timeout::Error
    retries -= 1
    if retries > 0
      puts "Will retry #{address} (#{retries} attempts remain)"
      sleep 0.001
      retry
    else
      puts "ERROR: Giving up on #{address}"
      return true
    end
  end
end
def listVsCache
  $inputFile.each_line do |address|
    # clean things up a little
    address = CGI.unescape(address)
    address.gsub!(/(\s+|\.\.\/)/, '') # space or ../
    dupeCheck(address)
    # sleep 0.001 # rate-limit to 1000/sec
  end
  $inputFile.close
  $outputFile.close
end
listVsCache