@akx · Created October 19, 2012 18:21
### Google cache link crawler
### Public domain and all
### requires the Requests library for Python (pip install requests / easy_install requests)
### requires Python 2.6+ (Python 2 only: uses print statements and xrange)
###
### TL;DR:
### python gcache2.py -p somesite.shelf -a get-links -s some-site.fi
### (get several tons of coffee while it runs - the delays are long)
### python gcache2.py -p somesite.shelf -a list-queue
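### python gcache2.py -p somesite.shelf -a run-wget
### (feeds the queued cache URLs to wget; wget must be on your PATH)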
from requests.defaults import defaults as req_defaults  # requests 0.x API; removed in requests 1.0
import argparse
import contextlib
import random
import re
import requests
import shelve
import subprocess
import time
USER_AGENTS = """
Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1
Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4
Opera/9.00 (Windows NT 5.1; U; en)
Opera/9.20 (Windows NT 6.0; U; en)
""".strip().splitlines()
USER_AGENT = random.choice(USER_AGENTS)
CACHE_LINK_RE = re.compile(r'<a href="(//webcache\.google.+?)">')
req_defaults["base_headers"]["User-Agent"] = USER_AGENT
shelf = None  # opened in main(); module-level so the actions can share it
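
# Walk Google result pages for the given query, harvesting cache links into
# shelf["queue"]. The last page fetched is saved per query, so an interrupted
# run resumes where it left off.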
def get_google_cache_links(kw, max_pages=500):
    sess = requests.session()
    last_page_key = "last-page:%s" % kw.encode("UTF-8")
    first_page = shelf.get(last_page_key, 0)  # resume where the last run left off
    print "Starting from page %d" % first_page
    for page in xrange(first_page, first_page + max_pages):
        shelf[last_page_key] = page
        print "Getting page %d" % page
        resp = sess.get("http://www.google.com/search", params={"q": kw, "start": page * 10})
        print "==> %r" % resp.url
        content = resp.content
        cache_links_page = set(m.group(1) for m in CACHE_LINK_RE.finditer(content))
        shelf["queue"] |= cache_links_page  # getitem/ior/setitem round-trip, so shelve persists it
        print "%d cached links in queue." % len(shelf["queue"])
        delay = random.randint(10, 40)  # long random delays, to look less like a bot
        print "Waiting for %d seconds..." % delay
        time.sleep(delay)
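
# Turn the queue set into a sorted list of absolute URLs; the scraped
# hrefs are protocol-relative ("//webcache.google...").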
def reformat_queue():
    q = sorted(shelf["queue"])
    q = [("http:%s" % u if u.startswith("//") else u) for u in q]
    return q
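
# Command-line entry point: open the project shelf, make sure the queue
# exists, then dispatch on the requested action.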
def main():
    global shelf
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", dest="project", help="project file (will be used to save state)", required=True)
    ap.add_argument("-a", dest="action", help="action (get-links, list-queue, run-wget)", required=True)
    ap.add_argument("-s", dest="site", help="site (for get-links)")
    args = ap.parse_args()
    # shelve objects are not context managers in Python 2, hence contextlib.closing
    with contextlib.closing(shelve.open(args.project)) as shelf:
        if "queue" not in shelf:
            shelf["queue"] = set()
        if args.action == "get-links":
            if not args.site:
                ap.error("-s is required for get-links (should be a domain)")
            get_google_cache_links("site:%s" % args.site)
        elif args.action == "list-queue":
            print "\n".join(reformat_queue())
        elif args.action == "run-wget":
            # "-i -" makes wget read the URL list from stdin; -w 30 waits 30 s between fetches
            proc = subprocess.Popen(["wget", "-i", "-", "-U", USER_AGENT, "-w", "30"], stdin=subprocess.PIPE)
            q = reformat_queue()
            random.shuffle(q)  # fetch in random order rather than alphabetically
            proc.stdin.write("\n".join(q))
            proc.stdin.close()
            proc.wait()
        else:
            ap.error("Unknown action %s" % args.action)

if __name__ == '__main__':
    main()