#Script to extract data from Google cache

This is a simple Python script which retrieves content such as your blog posts from Google's cache. You can use it if your server's hard drive crashes or if a meteor hits your data center.

Remove the `time.sleep` code if you don't have more than 50 pages to retrieve.

Added a check for whether the file already exists before downloading it, in case you need to rerun the script.

Added "-u" param to pass the website URL from the command line

##Original Source

```python
#!/usr/bin/python
import urllib
import urllib2
import re
import socket
import os
import time
import random
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-u', required=True, help="usage: -u website-url.ext, for example: -u google.com\n")
args = parser.parse_args()

socket.setdefaulttimeout(30)

# the site to retrieve, taken from the -u argument
search_site = args.u
search_term = "site:" + search_site

def main():
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'}
    url = "http://www.google.com/search?q=" + search_term
    # matches the "Cached" links and the "Next" pagination link in the results HTML
    regex_cache = re.compile(r'<a href="([^"]*)"[^>]*>Cached</a>')
    regex_next = re.compile(r'<a href="([^"]*)"[^>]*><span[^>]*>[^<]*</span><span[^>]*>Next</span></a>')

    # this is the directory we will save files to
    try:
        os.mkdir(search_site)
    except OSError:
        # directory already exists
        pass

    counter = 0
    pagenum = 0
    more = True
    while more:
        pagenum += 1
        print "PAGE " + str(pagenum) + ": " + url
        req = urllib2.Request(url, None, headers)
        page = urllib2.urlopen(req).read()
        matches = regex_cache.findall(page)
        print matches
        for match in matches:
            counter += 1
            if not match.startswith("http"):
                match = "http:" + match
            # skip pages that were already downloaded on a previous run
            if not os.path.exists(search_site + "/" + str(counter) + '.html'):
                tmp_req = urllib2.Request(match.replace('&amp;', '&'), None, headers)
                tmp_page = urllib2.urlopen(tmp_req).read()
                print counter, ": " + match
                f = open(search_site + "/" + str(counter) + '.html', 'w')
                f.write(tmp_page)
                f.close()
                # comment out the code below if you expect to crawl less than 50 pages
                random_interval = random.randrange(1, 10, 1)
                print "sleeping for: " + str(random_interval) + " seconds"
                time.sleep(random_interval)
        # now check if there are more result pages
        match = regex_next.search(page)
        if match is None:
            more = False
        else:
            url = "http://www.google.com" + match.group(1).replace('&amp;', '&')

if __name__ == "__main__":
    main()

# vim: ai ts=4 sts=4 et sw=4
```
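
If you only need a handful of known URLs rather than everything Google indexed for the site, you can skip the search-results scraping and request each cached copy directly. The sketch below is not part of the original script: it assumes Google serves cached pages at the `webcache.googleusercontent.com` endpoint, keeps the same Python 2 / `urllib2` style as the script above, and uses placeholder URLs and filenames.

```python
#!/usr/bin/python
# Sketch only: fetch the cached copy of a single, known URL directly.
# Assumes Google serves cached pages at webcache.googleusercontent.com;
# the page URL and output filename below are placeholders.
import urllib2

def fetch_cached(page_url, out_file):
    headers = {'User-Agent': 'Mozilla/5.0'}
    cache_url = "http://webcache.googleusercontent.com/search?q=cache:" + page_url
    req = urllib2.Request(cache_url, None, headers)
    html = urllib2.urlopen(req).read()
    f = open(out_file, 'w')
    f.write(html)
    f.close()

if __name__ == "__main__":
    fetch_cached("example.com/some-post", "some-post.html")
```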