#!/usr/bin/python
# Retrieve an old website from the Google Cache. Optimized with sleep time to
# avoid 504 errors (Google blocks IPs that send too many requests).
# Programmer: Kien Nguyen - QTPros http://qtpros.info/kiennguyen
# Change search_site and search_term to match your requirement.
# Original: http://www.guyrutenberg.com/2008/10/02/retrieving-googles-cache-for-a-whole-website/

import urllib, urllib2
import re
import socket
import os, errno, os.path
import time
import random, math
#import MySQLdb
import imp

socket.setdefaulttimeout(30)

# adjust the site here
search_site = "qtpros.info"
search_term = "site:" + search_site

#mysql = imp.load_source("MySQLConnector", "mysql.py").MySQLConnector()
#mysql.connect('localhost', 'root', '', 'webscrape', True)


def mkdir_p(path):
    # create the directory, ignoring the error if it already exists
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else:
            raise


def main():
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'}
    url = "http://www.google.com/search?q=" + search_term

    regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
    regex_next = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')
    regex_url = re.compile(r'search\?q=cache:[\d\w-]+:([^%]*)')
    # regex_title = re.compile('<title>([\w\W]+)</title>')
    # regex_time = re.compile('page as it appeared on ([\d\w\s:]+)')
    regex_pagenum = re.compile('<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span>([\d]+)')

    # this is the directory we will save files to
    mkdir_p(search_site)
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), search_site)
    # path = os.path.dirname(os.path.abspath(__file__))

    counter = 0
    # each results page lists 10 hits, so count the files already saved to work
    # out which results page to resume from
    pagenum = int(math.floor(len([name for name in os.listdir(path)]) / 10) + 1)
    max_goto = 0
    more = True

    # if we already downloaded some pages, walk the pagination links until we
    # reach the results page we should resume from
    if pagenum > 1:
        while max_goto < pagenum:
            req = urllib2.Request(url, None, headers)
            page = urllib2.urlopen(req).read()
            goto = regex_pagenum.findall(page)
            # print goto
            for goto_url, goto_pagenum in goto:
                goto_pagenum = int(goto_pagenum)
                if goto_pagenum == pagenum:
                    url = "http://www.google.com" + goto_url.replace('&amp;', '&')
                    max_goto = pagenum
                    break
                elif goto_pagenum < pagenum and max_goto < goto_pagenum:
                    max_goto = goto_pagenum
                    url = "http://www.google.com" + goto_url.replace('&amp;', '&')
            random_interval = random.randrange(5, 20, 1)
            print "sleeping for: " + str(random_interval) + " seconds"
            print "going to page: " + str(max_goto)
            print url
            time.sleep(random_interval)

    while more:
        # send the search request to Google with the pre-defined headers
        req = urllib2.Request(url, None, headers)
        # open the response page
        page = urllib2.urlopen(req).read()
        # find all "Cached" links in the page
        matches = regex_cache.findall(page)
        # loop through the matches
        for match in matches:
            counter += 1
            # find the URL of the page cached by Google
            the_url = regex_url.findall(match)
            the_url = the_url[0]
            the_url = the_url.replace('http://', '')
            the_url = the_url.strip('/')
            the_url = the_url.replace('/', '-')
            # if the href doesn't start with http, insert http before it
            if not match.startswith("http"):
                match = "http:" + match
            if not the_url.endswith('html'):
                the_url = the_url + ".html"
            # only fetch if the file "$url"[.html] does not exist yet
            if not os.path.exists(search_site + "/" + the_url):
                tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
                try:
                    tmp_page = urllib2.urlopen(tmp_req).read()
                    f = open(search_site + "/" + the_url, 'w')
                    f.write(tmp_page)
                    f.close()
                    print counter, ": " + the_url
                    # comment out the code below if you expect to crawl fewer than 50 pages
                    random_interval = random.randrange(15, 20, 1)
                    print "sleeping for: " + str(random_interval) + " seconds"
                    time.sleep(random_interval)
                except urllib2.HTTPError as e:
                    print 'Error code: ', e.code
                    pass
        # now check whether there are more result pages
        match = regex_next.search(page)
        if match is None:
            more = False
        else:
            url = "http://www.google.com" + match.group(1).replace('&amp;', '&')


if __name__ == "__main__":
    main()
Hello! I tried to run this script, but it only creates an empty directory. Maybe the script is outdated? Do you have a more up-to-date version? Thanks
The problem is that Google no longer provides cache: links in its search results. The script needs to be rewritten to take a single URL at webcache.* and then rewrite the URLs in the HTML to find its next page to fetch.
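A rough Python 3 sketch of that rewrite idea: fetch one cached copy from webcache.googleusercontent.com and queue further pages by turning the site's own links back into cache lookups. The helper names (`fetch_cached`, `crawl`) and the crawl logic are made up for illustration, and the endpoint itself has since become unreliable, so treat this as an untested outline only:

```python
import re
import urllib.request
from urllib.parse import quote, urljoin

CACHE_PREFIX = "https://webcache.googleusercontent.com/search?ie=UTF-8&q=cache:"
HEADERS = {"User-Agent": "Mozilla/5.0"}

def fetch_cached(page_url):
    """Fetch the Google-cached copy of page_url and return its HTML."""
    req = urllib.request.Request(CACHE_PREFIX + quote(page_url, safe=""), headers=HEADERS)
    return urllib.request.urlopen(req, timeout=30).read().decode("utf-8", "replace")

def crawl(start_url, site, limit=50):
    """Yield (url, html) for cached pages, following same-site links."""
    seen, queue = set(), [start_url]
    while queue and len(seen) < limit:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        try:
            html = fetch_cached(url)
        except Exception as exc:  # cache miss, rate limit, ...
            print("skipping", url, exc)
            continue
        # rewrite the site's own links into the next cache URLs to fetch
        for href in re.findall(r'href="([^"]+)"', html):
            absolute = urljoin(url, href)
            if site in absolute:
                queue.append(absolute)
        yield url, html
```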
I just switched to using Warrick:
https://code.google.com/p/warrick/wiki/About_Warrick
Any updated script?
direct access the web-cache + alternatives

you can still access pages manually (not all websites are available!):

- remove the protocol and `://`, apply whatever your language's equivalent of JavaScript's `encodeURIComponent` is, and append the result to `https://webcache.googleusercontent.com/search?ie=UTF-8&q=cache:` (see the sketch after this list).
  + you will never get blocked,
  + no need to query google.com,
  + no need to filter out the HTML/DOM structure tree,
  + there is no captcha, and very few restrictions (mostly just rate limiting, as with most google servers),
  + if you get an error code (5**/4**) or an empty response body, it simply means there is no entry in the cache for that website.
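For illustration, here is a small Python 3 sketch of that URL construction; `urllib.parse.quote` plays the role of `encodeURIComponent`, and `google_cache_url` is a made-up helper name (note also that Google has been winding this endpoint down, so it may no longer return anything):

```python
from urllib.parse import quote, urlsplit

def google_cache_url(url):
    # strip the protocol and "://", URL-encode the rest, and prepend the cache endpoint
    parts = urlsplit(url)
    stripped = parts.netloc + parts.path + (("?" + parts.query) if parts.query else "")
    return ("https://webcache.googleusercontent.com/search?ie=UTF-8&q=cache:"
            + quote(stripped, safe=""))

print(google_cache_url("http://qtpros.info/kiennguyen"))
# -> https://webcache.googleusercontent.com/search?ie=UTF-8&q=cache:qtpros.info%2Fkiennguyen
```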
as this is essentially a mirror/archive, there are alternatives you can fall back to (or switch to as your main scraping target, as both are superior to the google-cache):

- `encodeURIComponent` your URL (as is) and append it to `https://web.archive.org/web/2019*/` (you can change `2019` for earlier results; various APIs are available to narrow down results).
- same as above with `https://archive.is/`.
- you can also use google translate (although it is not intended to be used this way). you don't have to actually translate anything; your target language can always be English, even if the original website is already in English. the result will be the website served under a different (google) domain.
  start with `https://translate.google.com/translate?hl=en&sl=auto&tl=en&u=` and add your website (escaped, as before); nowadays it will redirect to the new interface, `https://____the_original_hostname______.translate.goog/___any_path_from_the_original_url____/?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en`.
  for example: `https://programmerall.com/article/7877189061/` will be `https://translate.google.com/translate?hl=en&sl=auto&tl=en&u=https%3A%2F%2Fprogrammerall.com%2Farticle%2F7877189061%2F`, which will (nowadays) redirect to `https://programmerall-com.translate.goog/article/7877189061/?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en`.

there is a bonus, in which you will browse an old HTTP website through the mirror server's HTTPS ;]
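A short Python 3 sketch of these three URL constructions, with made-up helper names; the `translate.goog` hostname mapping (dots become dashes, existing dashes appear to be doubled) is inferred from the example above, so verify it against your own URLs:

```python
from urllib.parse import quote, urlsplit

def wayback_url(url, year="2019"):
    # Wayback Machine listing of captures for the (escaped) URL
    return "https://web.archive.org/web/" + year + "*/" + quote(url, safe="")

def archive_is_url(url):
    return "https://archive.is/" + quote(url, safe="")

def translate_goog_url(url):
    # new-interface Google Translate proxy: dots in the hostname become dashes
    parts = urlsplit(url)
    host = parts.netloc.replace("-", "--").replace(".", "-")
    return ("https://" + host + ".translate.goog" + parts.path
            + "?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en")

print(translate_goog_url("https://programmerall.com/article/7877189061/"))
# -> https://programmerall-com.translate.goog/article/7877189061/?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en
```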
if you can't find the article/website you are looking for, just save it yourself:

- `https://archive.is/?url=_____escape_your_url_here`
- `https://web.archive.org/save`

then (you need to wait until those finish mirroring the website) simply try accessing the website again.
(you can't trigger the google cache archiving process yourself..)
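If you want to script those save requests, here is a minimal Python 3 sketch (hypothetical helper names; both services queue captures, so expect a delay and possible rate limiting):

```python
import urllib.request
from urllib.parse import quote

HEADERS = {"User-Agent": "Mozilla/5.0"}

def save_to_wayback(url):
    # web.archive.org/save/<url> triggers a "Save Page Now" capture
    req = urllib.request.Request("https://web.archive.org/save/" + url, headers=HEADERS)
    return urllib.request.urlopen(req, timeout=60).getcode()

def archive_is_submit_link(url):
    # archive.is submission form; open this link (or POST to it) to request a capture
    return "https://archive.is/?url=" + quote(url, safe="")
```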
Thank you for this useful script.
I'm trying to run it; it works fine for a while, then I get a 503 error and get banned from the Google cache for hours...
Any ideas how to avoid that? I tried with a higher interval value, but still the same result...
++
Khelil