@akx · Created October 19, 2012 18:21
### Google cache link crawler
### Public domain and all
### requires the Requests library for Python (pip install requests / easy_install requests)
### requires Python 2.6+ (Python 2 only: uses print statements and xrange)
###
### TL;DR:
### python gcache2.py -p somesite.shelf -a get-links -s some-site.fi
### (get several tons of coffee while it runs - the delays are long)
### python gcache2.py -p somesite.shelf -a list-queue
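### python gcache2.py -p somesite.shelf -a run-wget
### (feeds the queued cache URLs to wget; wget must be on your PATH)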
from requests.defaults import defaults as req_defaults  # requests 0.x API; removed in requests 1.0
import argparse
import contextlib
import random
import re
import requests
import shelve
import subprocess
import time
USER_AGENTS = """
Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1
Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4
Opera/9.00 (Windows NT 5.1; U; en)
Opera/9.20 (Windows NT 6.0; U; en)
""".strip().splitlines()
USER_AGENT = random.choice(USER_AGENTS)
CACHE_LINK_RE = re.compile(r'<a href="(//webcache\.google.+?)">')
req_defaults["base_headers"]["User-Agent"] = USER_AGENT
shelf = None  # opened in main(); module-level so the actions can share it
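
# Walk Google result pages for the given query, harvesting cache links into
# shelf["queue"]. The last page fetched is saved per query, so an interrupted
# run resumes where it left off.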
def get_google_cache_links(kw, max_pages=500):
    sess = requests.session()
    last_page_key = "last-page:%s" % kw.encode("UTF-8")
    first_page = shelf.get(last_page_key, 0)  # resume where the last run left off
    print "Starting from page %d" % first_page
    for page in xrange(first_page, first_page + max_pages):
        shelf[last_page_key] = page
        print "Getting page %d" % page
        resp = sess.get("http://www.google.com/search", params={"q": kw, "start": page * 10})
        print "==> %r" % resp.url
        content = resp.content
        cache_links_page = set(m.group(1) for m in CACHE_LINK_RE.finditer(content))
        shelf["queue"] |= cache_links_page  # getitem/ior/setitem round-trip, so shelve persists it
        print "%d cached links in queue." % len(shelf["queue"])
        delay = random.randint(10, 40)  # long random delays, to look less like a bot
        print "Waiting for %d seconds..." % delay
        time.sleep(delay)
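
# Turn the queue set into a sorted list of absolute URLs; the scraped
# hrefs are protocol-relative ("//webcache.google...").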
def reformat_queue():
    q = sorted(shelf["queue"])
    q = [("http:%s" % u if u.startswith("//") else u) for u in q]
    return q
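
# Command-line entry point: open the project shelf, make sure the queue
# exists, then dispatch on the requested action.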
def main():
    global shelf
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", dest="project", help="project file (will be used to save state)", required=True)
    ap.add_argument("-a", dest="action", help="action (get-links, list-queue, run-wget)", required=True)
    ap.add_argument("-s", dest="site", help="site (for get-links)")
    args = ap.parse_args()
    # shelve objects are not context managers in Python 2, hence contextlib.closing
    with contextlib.closing(shelve.open(args.project)) as shelf:
        if "queue" not in shelf:
            shelf["queue"] = set()
        if args.action == "get-links":
            if not args.site:
                ap.error("-s is required for get-links (should be a domain)")
            get_google_cache_links("site:%s" % args.site)
        elif args.action == "list-queue":
            print "\n".join(reformat_queue())
        elif args.action == "run-wget":
            # "-i -" makes wget read the URL list from stdin; -w 30 waits 30 s between fetches
            proc = subprocess.Popen(["wget", "-i", "-", "-U", USER_AGENT, "-w", "30"], stdin=subprocess.PIPE)
            q = reformat_queue()
            random.shuffle(q)  # fetch in random order rather than alphabetically
            proc.stdin.write("\n".join(q))
            proc.stdin.close()
            proc.wait()
        else:
            ap.error("Unknown action %s" % args.action)

if __name__ == '__main__':
    main()