Retrieve a website from Google Cache without getting your IP blocked

scrape.py
Python
#!/usr/bin/python
#Retrieve an old website from Google Cache. Uses randomized sleep times to avoid
#the 503 errors Google returns when it blocks an IP for sending too many requests.
#Programmer: Kien Nguyen - QTPros http://qtpros.info/kiennguyen
#change search_site and search_term to match your requirement
#Original: http://www.guyrutenberg.com/2008/10/02/retrieving-googles-cache-for-a-whole-website/

import urllib, urllib2
import re
import socket
import os, errno, os.path
import time
import random, math
#import MySQLdb
import imp  #only needed for the commented-out MySQL loader below
 
socket.setdefaulttimeout(30)
#adjust the site here
search_site="qtpros.info"
search_term="site:" + search_site
 
#mysql = imp.load_source("MySQLConnector", "mysql.py").MySQLConnector()
#mysql.connect('localhost','root','','webscrape',True)
 
 
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else:
            raise
 
 
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'}
    url = "http://www.google.com/search?q=" + search_term

    regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
    regex_next = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')
    regex_url = re.compile(r'search\?q=cache:[\d\w-]+:([^%]*)')
    # regex_title = re.compile('<title>([\w\W]+)</title>')
    # regex_time = re.compile('page as it appeared on ([\d\w\s:]+)')
    regex_pagenum = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span>([\d]+)')

    #this is the directory we will save files to
    mkdir_p(search_site)
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), search_site)
    counter = 0
    #resume from the result page matching the number of files already saved (10 results per page)
    pagenum = int(math.floor(len([name for name in os.listdir(path)]) / 10) + 1)
    max_goto = 0
    more = True
    if pagenum > 1:
        #walk the pagination links until we reach the page to resume from
        while max_goto < pagenum:
            req = urllib2.Request(url, None, headers)
            page = urllib2.urlopen(req).read()
            goto = regex_pagenum.findall(page)
            # print goto
            for goto_url, goto_pagenum in goto:
                goto_pagenum = int(goto_pagenum)
                if goto_pagenum == pagenum:
                    url = "http://www.google.com" + goto_url.replace('&amp;', '&')
                    max_goto = pagenum
                    break
                elif goto_pagenum < pagenum and max_goto < goto_pagenum:
                    max_goto = goto_pagenum
                    url = "http://www.google.com" + goto_url.replace('&amp;', '&')
            random_interval = random.randrange(5, 20, 1)
            print "sleeping for: " + str(random_interval) + " seconds"
            print "going to page: " + str(max_goto)
            print url
            time.sleep(random_interval)

    while more:
        #Send search request to google with pre-defined headers
        req = urllib2.Request(url, None, headers)
        #open the response page
        page = urllib2.urlopen(req).read()
        #find all cache links in the page
        matches = regex_cache.findall(page)
        #loop through the matches
        for match in matches:
            counter += 1
            #find the url of the page cached by google
            the_url = regex_url.findall(match)
            the_url = the_url[0]
            the_url = the_url.replace('http://', '')
            the_url = the_url.strip('/')
            the_url = the_url.replace('/', '-')
            #if href doesn't start with http, insert http before
            if not match.startswith("http"):
                match = "http:" + match
            if not the_url.endswith('html'):
                the_url = the_url + ".html"
            #skip files that have already been downloaded
            if not os.path.exists(search_site + "/" + the_url):
                tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
                try:
                    tmp_page = urllib2.urlopen(tmp_req).read()
                    f = open(search_site + "/" + the_url, 'w')
                    f.write(tmp_page)
                    f.close()
                    print counter, ": " + the_url
                    #comment out the code below if you expect to crawl fewer than 50 pages
                    random_interval = random.randrange(15, 20, 1)
                    print "sleeping for: " + str(random_interval) + " seconds"
                    time.sleep(random_interval)
                except urllib2.HTTPError, e:
                    print 'Error code: ', e.code
                    pass
        #now check if there are more result pages
        match = regex_next.search(page)
        if match is None:
            more = False
        else:
            url = "http://www.google.com" + match.group(1).replace('&amp;', '&')

if __name__ == "__main__":
    main()

Thank you for this useful script.
I'm trying to run it; it works fine for a while, then I get a 503 error and get banned from the Google cache for hours...
Any ideas how to avoid that? I tried with a higher interval value, but still the same result...

++
Khelil
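
One thing that sometimes helps (no guarantees, Google can still ban the IP for a while) is to treat the 503 as a signal to back off exponentially instead of retrying at the same pace. Here is a minimal sketch in the same Python 2 style as the script above; fetch_with_backoff and the delay values are just illustrative, not part of the original script:

import time, random, urllib2

def fetch_with_backoff(req, max_retries=5, base_delay=60):
    #open a urllib2 request, backing off exponentially whenever Google answers 503
    for attempt in range(max_retries):
        try:
            return urllib2.urlopen(req).read()
        except urllib2.HTTPError, e:
            if e.code != 503:
                raise
            #wait 60s, 120s, 240s, ... plus some jitter before trying again
            delay = base_delay * (2 ** attempt) + random.randrange(5, 30)
            print "got 503, sleeping for: " + str(delay) + " seconds"
            time.sleep(delay)
    raise IOError("still getting 503 after " + str(max_retries) + " retries")

You would call fetch_with_backoff(req) wherever the script currently does urllib2.urlopen(req).read().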

Hello! I tried to run this script, but it only creates an empty directory. Maybe the script is out of date? Do you have a more recent version? Thanks

The problem is that Google no longer provides cache: links in search results. The script needs to be rewritten to take a single URL at webcache.* and then rewrite the URLs in the returned HTML to find its next pages to fetch.
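
For anyone who wants to try that route, here is a rough sketch of the idea in the same Python 2 style as the script; the webcache.googleusercontent.com URL format, the helper names, and the link regex are assumptions for illustration and have not been tested against Google's current behaviour:

import re, urllib2

CACHE_PREFIX = "http://webcache.googleusercontent.com/search?q=cache:"
headers = {'User-Agent': 'Mozilla/5.0'}

def fetch_cached(url):
    #fetch the cached copy of a single URL
    req = urllib2.Request(CACHE_PREFIX + url, None, headers)
    return urllib2.urlopen(req).read()

def extract_site_links(html, site):
    #pull same-site hrefs out of the cached HTML so they can be queued as the next fetches
    pattern = r'href="(https?://(?:www\.)?' + re.escape(site) + '[^"]*)"'
    return set(re.findall(pattern, html))

#start from one seed page and walk the links found in its cached copy
seed_html = fetch_cached("qtpros.info/")
for link in extract_site_links(seed_html, "qtpros.info"):
    print link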

I just switched to using Warrick:
https://code.google.com/p/warrick/wiki/About_Warrick
