Skip to content

@qtproduction /scrape.py
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Retrive website from Google Cache without blocking IP
#Retrive old website from Google Cache. Optimized with sleep time, and avoid 504 error (Google block Ip send many request).
#Programmer: Kien Nguyen - QTPros http://qtpros.info/kiennguyen
#change search_site and search_term to match your requirement
#Original: http://www.guyrutenberg.com/2008/10/02/retrieving-googles-cache-for-a-whole-website/
#!/usr/bin/python
import urllib, urllib2
import re
import socket
import os, errno, os.path
import time
import random, math
#import MySQLdb
import imp;
socket.setdefaulttimeout(30)
#adjust the site here
search_site="qtpros.info"
search_term="site:" + search_site
#mysql = imp.load_source("MySQLConnector", "mysql.py").MySQLConnector()
#mysql.connect('localhost','root','','webscrape',True)
def mkdir_p(path):
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST:
pass
else: raise
def main():
headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'}
url = "http://www.google.com/search?q="+search_term
regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
regex_next = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')
regex_url = re.compile(r'search\?q=cache:[\d\w-]+:([^%]*)')
# regex_title = re.compile('<title>([\w\W]+)</title>')
# regex_time = re.compile('page as it appeared on ([\d\w\s:]+)')
regex_pagenum = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span>([\d]+)')
#this is the directory we will save files to
mkdir_p(search_site)
path = os.path.dirname(os.path.abspath(__file__)) + '\\' + search_site
# path = os.path.dirname(os.path.abspath(__file__))
counter = 0
pagenum = int(math.floor(len([name for name in os.listdir(path)]) / 10) + 1)
max_goto = 0;
more = True
if (pagenum > 1):
while (max_goto < pagenum):
req = urllib2.Request(url, None, headers)
page = urllib2.urlopen(req).read()
goto = regex_pagenum.findall(page)
# print goto
for goto_url, goto_pagenum in goto:
goto_pagenum = int(goto_pagenum)
if (goto_pagenum == pagenum):
url = "http://www.google.com" + goto_url.replace('&amp;', '&')
max_goto = pagenum
break
elif (goto_pagenum < pagenum and max_goto < goto_pagenum):
max_goto = goto_pagenum
url = "http://www.google.com" + goto_url.replace('&amp;', '&')
random_interval = random.randrange(5, 20, 1)
print "sleeping for: " + str(random_interval) + " seconds"
print "going to page: " + str(max_goto)
print url
time.sleep(random_interval)
while(more):
#Send search request to google with pre-defined headers
req = urllib2.Request(url, None, headers)
#open the response page
page = urllib2.urlopen(req).read()
#find all cache in the page
matches = regex_cache.findall(page)
#loop through the matches
for match in matches:
counter+=1
#find the url of the page cached by google
the_url = regex_url.findall(match)
the_url = the_url[0]
the_url = the_url.replace('http://', '')
the_url = the_url.strip('/')
the_url = the_url.replace('/', '-')
#if href doesn't start with http insert http before
if not match.startswith("http"):
match = "http:" + match
if (not the_url.endswith('html')):
the_url = the_url + ".html"
#if filename "$url"[.html] does not exists
if not os.path.exists(search_site + "/" + the_url):
tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
try:
tmp_page = urllib2.urlopen(tmp_req).read()
f = open(search_site + "/" + the_url, 'w')
f.write(tmp_page)
f.close()
print counter, ": " + the_url
#comment out the code below if you expect to crawl less than 50 pages
random_interval = random.randrange(15, 20, 1)
print "sleeping for: " + str(random_interval) + " seconds"
time.sleep(random_interval)
except urllib2.HTTPError, e:
print 'Error code: ', e.code
pass
#now check if there is more pages
match = regex_next.search(page)
if match == None:
more = False
else:
url = "http://www.google.com"+match.group(1).replace('&amp;', '&')
if __name__=="__main__":
main()
@khelil

Thank you for this useful script.
I'm trying to run it, it works fine for a while and then i get a 503 Error and get banned from google cache during hours...
Any ideas how to avoid that ? I tried with higher interval value, but still the same result...

++
Khelil

@mav1

Hello! I tried to run this script, but it only makes directory (empty). Maybe, script is old? Have you any more actual version? Thx

@jjeffus

The problem is that Google no longer provides cache: links. It needs to be re-written to take a single URL at webcache.* and then rewrite the URLs in the html to get it's next thing to fetch.

I just switched to using Warrick:
https://code.google.com/p/warrick/wiki/About_Warrick

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.