This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Retrieve an old website from Google Cache. Throttled with sleep intervals to avoid 504 errors (Google blocks IPs that send too many requests).
#Programmer: Kien Nguyen - QTPros http://qtpros.info/kiennguyen | |
#Change search_site and search_term to match your requirements.
#Original: http://www.guyrutenberg.com/2008/10/02/retrieving-googles-cache-for-a-whole-website/ | |
#!/usr/bin/python | |
import urllib, urllib2 | |
import re | |
import socket | |
import os, errno, os.path | |
import time | |
import random, math | |
#import MySQLdb | |
import imp; | |
# Give every socket created from here on a 30-second default timeout, so a
# stalled HTTP request raises instead of blocking indefinitely.
socket.setdefaulttimeout(30)

#adjust the site here
# search_site doubles as the name of the directory the pages are saved to.
search_site = "qtpros.info"
# Google query restricting the results to search_site.
search_term = "site:" + search_site

#mysql = imp.load_source("MySQLConnector", "mysql.py").MySQLConnector()
#mysql.connect('localhost','root','','webscrape',True)
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Every other failure is
    re-raised as the original ``OSError``, including the case where ``path``
    exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # "as" form needs Python >= 2.6
        # EEXIST is also reported when a regular file occupies the path;
        # only an existing *directory* counts as success.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def main():
    """Save Google's cached copy of every result found for ``search_term``.

    Walks the Google result pages for the module-level ``search_term``,
    follows each "Cached" link and writes the response into the
    ``search_site`` directory (relative to the current working directory),
    one file per cached page. Pages whose file already exists are skipped.

    The number of files already present is used to jump ahead to the
    result page where an earlier run presumably stopped, assuming ten
    results per page. Random sleeps are inserted between requests to keep
    the request rate low.

    Raises:
        urllib2.URLError: if a result page cannot be fetched. HTTP errors
            on individual cached pages are reported and skipped instead.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'}
    google = "http://www.google.com"
    url = google + "/search?q=" + search_term

    regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
    regex_next = re.compile(r'<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')
    regex_url = re.compile(r'search\?q=cache:[\d\w-]+:([^%]*)')
    regex_pagenum = re.compile(r'<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span>([\d]+)')

    # One directory for creating, counting and writing, so the resume
    # logic counts exactly the files this script saves.
    save_dir = search_site
    mkdir_p(save_dir)

    counter = 0
    # Ten results per page: the saved-file count gives the page to resume at.
    pagenum = len(os.listdir(save_dir)) // 10 + 1
    max_goto = 0

    if pagenum > 1:
        # Hop through the pagination links until the resume page is reached.
        while max_goto < pagenum:
            page = urllib2.urlopen(urllib2.Request(url, None, headers)).read()
            reached_before = max_goto
            for goto_url, goto_pagenum in regex_pagenum.findall(page):
                goto_pagenum = int(goto_pagenum)
                if goto_pagenum == pagenum:
                    # hrefs are HTML-escaped; decode the ampersands.
                    url = google + goto_url.replace('&amp;', '&')
                    max_goto = pagenum
                    break
                elif goto_pagenum < pagenum and max_goto < goto_pagenum:
                    # Closest page so far that is still before the target.
                    max_goto = goto_pagenum
                    url = google + goto_url.replace('&amp;', '&')
            if max_goto == reached_before:
                # No link got us any closer; re-requesting the same URL
                # would loop forever, so crawl from the page we are on.
                print("cannot advance beyond page: " + str(max_goto))
                break
            random_interval = random.randrange(5, 20, 1)
            print("sleeping for: " + str(random_interval) + " seconds")
            print("going to page: " + str(max_goto))
            print(url)
            time.sleep(random_interval)

    more = True
    while more:
        # Send the search request to Google with the pre-defined headers.
        page = urllib2.urlopen(urllib2.Request(url, None, headers)).read()

        for match in regex_cache.findall(page):
            counter += 1
            # Extract the URL of the page Google cached.
            found = regex_url.findall(match)
            if not found:
                # Not a recognisable cache link; nothing to name the file by.
                continue
            # Flatten the URL into a single file name.
            the_url = found[0].replace('http://', '').strip('/').replace('/', '-')
            # If the href doesn't start with http, put the scheme in front.
            if not match.startswith("http"):
                match = "http:" + match
            if not the_url.endswith('html'):
                the_url += ".html"

            target = os.path.join(save_dir, the_url)
            if os.path.exists(target):
                continue

            tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
            try:
                tmp_page = urllib2.urlopen(tmp_req).read()
            except urllib2.HTTPError as e:
                print("Error code:  " + str(e.code))
                continue

            # Binary mode stores the response bytes untouched; "with"
            # closes the file even if the write fails.
            with open(target, 'wb') as f:
                f.write(tmp_page)
            print(str(counter) + " : " + the_url)

            #comment out the code below if you expect to crawl less than 50 pages
            random_interval = random.randrange(15, 20, 1)
            print("sleeping for: " + str(random_interval) + " seconds")
            time.sleep(random_interval)

        # Now check whether there are more result pages.
        next_link = regex_next.search(page)
        if next_link is None:
            more = False
        else:
            url = google + next_link.group(1).replace('&amp;', '&')
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment