Skip to content

Instantly share code, notes, and snippets.

Forked from qtproduction/
Created January 4, 2013 15:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Adirael/4453525 to your computer and use it in GitHub Desktop.
Save Adirael/4453525 to your computer and use it in GitHub Desktop.
#Retrieve an old website from the Google cache. Requests are spaced out with random sleep times to avoid 504 errors (Google blocks IPs that send too many requests).
#Programmer: Kien Nguyen - QTPros
#change search_site and search_term to match your requirement
import urllib, urllib2
import re
import socket
import os, errno, os.path
import time
import random, math
#import MySQLdb
import imp;
#adjust the site here
# NOTE(review): the assignment of search_site was missing from this copy of the
# script, so the next line raised NameError. "example.com" is only a
# placeholder -- replace it with the domain whose cached pages you want.
search_site = "example.com"
# Search query that restricts results to pages from search_site.
search_term = "site:" + search_site
#mysql = imp.load_source("MySQLConnector", "").MySQLConnector()
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    Succeeds silently when *path* already exists as a directory.

    Raises:
        OSError: if the directory cannot be created for any other reason,
            including when *path* exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # An already-existing directory is the expected "nothing to do" case;
        # every other failure (permissions, a file in the way, ...) propagates.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            return
        raise
# Crawl the search result pages for search_term and, for every "Cached" link
# found, download the cached page into a file named after the original URL
# under the search_site directory.
# NOTE(review): this copy of the function has lost its indentation and several
# lines (at least the `try:` that pairs with the `except` below, and parts of
# the URL string literals). The comments below describe only what the
# remaining lines show; restore the missing pieces before running.
def main():
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv: Gecko/20070515 Firefox/'}
# NOTE(review): the literal is empty here; presumably it held the search
# endpoint prefix that search_term is appended to -- confirm.
url = ""+search_term
# Captures the href of each anchor whose text is "Cached".
regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
# Captures the href of the "Next" pagination anchor.
regex_next = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')
# Captures the text following "search?q=cache:<id>:" up to the first '%'.
regex_url = re.compile(r'search\?q=cache:[\d\w-]+:([^%]*)')
# regex_title = re.compile('<title>([\w\W]+)</title>')
# regex_time = re.compile('page as it appeared on ([\d\w\s:]+)')
# Captures (href, page number) pairs from the numbered pagination anchors.
regex_pagenum = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span>([\d]+)')
#this is the directory we will save files to
# Note: the separator is a hard-coded backslash, and os.listdir(path) below
# raises if this directory does not exist yet.
path = os.path.dirname(os.path.abspath(__file__)) + '\\' + search_site
# path = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): counter is printed below but never incremented in the visible lines.
counter = 0
# Result page to resume from: one page per 10 entries already in `path`
# (presumably 10 results per search page -- confirm).
pagenum = int(math.floor(len([name for name in os.listdir(path)]) / 10) + 1)
# Highest page number reached so far while seeking towards pagenum.
max_goto = 0;
more = True
# When resuming, follow numbered pagination links until pagenum is reached.
if (pagenum > 1):
while (max_goto < pagenum):
req = urllib2.Request(url, None, headers)
page = urllib2.urlopen(req).read()
goto = regex_pagenum.findall(page)
# print goto
for goto_url, goto_pagenum in goto:
goto_pagenum = int(goto_pagenum)
# Exact hit: jump straight to the wanted page.
if (goto_pagenum == pagenum):
# NOTE(review): empty prefix literal; presumably the site origin was here -- confirm.
url = "" + goto_url.replace('&amp;', '&')
max_goto = pagenum
# Otherwise advance to the furthest linked page that is still before pagenum.
elif (goto_pagenum < pagenum and max_goto < goto_pagenum):
max_goto = goto_pagenum
url = "" + goto_url.replace('&amp;', '&')
# Random delay of 5..19 seconds between requests.
# NOTE(review): only the message is printed; no sleep call is visible here.
random_interval = random.randrange(5, 20, 1)
print "sleeping for: " + str(random_interval) + " seconds"
print "going to page: " + str(max_goto)
print url
#Send search request to google with pre-defined headers
req = urllib2.Request(url, None, headers)
#open the response page
page = urllib2.urlopen(req).read()
#find all cache in the page
matches = regex_cache.findall(page)
#loop through the matches
for match in matches:
#find the url of the page cached by google
# NOTE: the_url[0] raises IndexError when regex_url finds nothing in the href.
the_url = regex_url.findall(match)
the_url = the_url[0]
# Turn the URL into a flat file name: drop the scheme, trim outer slashes,
# and replace inner slashes with dashes.
the_url = the_url.replace('http://', '')
the_url = the_url.strip('/')
the_url = the_url.replace('/', '-')
#if href doesn't start with http insert http before
if not match.startswith("http"):
match = "http:" + match
# Make sure the saved file name ends in "html".
if (not the_url.endswith('html')):
the_url = the_url + ".html"
#if filename "$url"[.html] does not exists
# Note: this is a path relative to the working directory, not `path` above.
if not os.path.exists(search_site + "/" + the_url):
tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
tmp_page = urllib2.urlopen(tmp_req).read()
# NOTE(review): the file is opened for writing but no write or close of
# tmp_page is visible in this copy.
f = open(search_site + "/" + the_url, 'w')
print counter, ": " + the_url
#comment out the code below if you expect to crawl less than 50 pages
# Random delay of 15..19 seconds; again no sleep call is visible.
random_interval = random.randrange(15, 20, 1)
print "sleeping for: " + str(random_interval) + " seconds"
# Python 2 except syntax. NOTE(review): the matching `try:` is not visible.
except urllib2.HTTPError, e:
print 'Error code: ', e.code
#now check if there is more pages
# NOTE(review): the right-hand side is missing; presumably a regex_next
# search over `page` -- confirm.
match =
# No "Next" link found: flag that there are no more result pages.
if match == None:
more = False
# NOTE(review): this line is truncated; presumably it rebuilt url from the
# "Next" href with '&amp;' replaced by '&' -- confirm.
url = ""'&amp;', '&')
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment