Created
October 19, 2012 18:21
-
-
Save akx/3919793 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Google cache link crawler
### Public domain and all
### requires the Requests library for Python (pip install requests / easy_install requests)
### requires Python 2.6+, maybe Python 2.7. Who knows!
###
### TL;DR:
### python gcache2.py -p somesite.shelf -a get-links -s some-site.fi
### (get several tons of coffee while it runs - the delays are long)
### python gcache2.py -p somesite.shelf -a list-queue
from requests.defaults import defaults as req_defaults | |
import argparse | |
import contextlib | |
import random | |
import re | |
import requests | |
import shelve | |
import subprocess | |
import time | |
# Pool of desktop-browser User-Agent strings, one per line. A single entry is
# picked per run so requests do not go out with the library's default agent.
USER_AGENTS = """
Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1
Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4
Opera/9.00 (Windows NT 5.1; U; en)
Opera/9.20 (Windows NT 6.0; U; en)
""".strip().splitlines()

# Chosen once at import time; used for the Requests default header below and
# passed to wget via -U in main().
USER_AGENT = random.choice(USER_AGENTS)

# Captures the protocol-relative cache URL from an anchor tag. The dots in the
# host name are escaped so they match a literal "." rather than any character.
CACHE_LINK_RE = re.compile(r'<a href="(//webcache\.google.+?)">')

# Install the chosen agent in Requests' default base headers.
req_defaults['base_headers']["User-Agent"] = USER_AGENT

# Project shelf holding the crawl state; main() opens it and rebinds this name.
shelf = None
def get_google_cache_links(kw, max_pages=500):
    """Collect Google cache links for the search query *kw* into the shelf.

    Requests Google result pages (``start`` advances by 10 per page), beginning
    at the page number stored in the shelf for this query, and merges every URL
    captured by ``CACHE_LINK_RE`` into ``shelf["queue"]``. Sleeps a random
    10-40 seconds after each page.

    The crawl stops early when a response is not HTTP 200. Such a response
    carries no usable results, and continuing would advance the saved page
    pointer past pages that were never actually read.

    Args:
        kw: Search query string, e.g. ``"site:example.com"``.
        max_pages: Maximum number of result pages to request in this call.
    """
    sess = requests.session()
    last_page_key = "last-page:%s" % kw.encode("UTF-8")
    first_page = shelf.get(last_page_key, 0)
    print("Starting from page %d" % first_page)
    for page in xrange(first_page, first_page + max_pages):
        # Persist the page number before fetching so that an interrupted or
        # failed run resumes at this same page.
        shelf[last_page_key] = page
        print("Getting page %d" % page)
        resp = sess.get(
            "http://www.google.com/search",
            params={"q": kw, "start": page * 10},
        )
        print("==> %r" % resp.url)
        if resp.status_code != 200:
            print("Got HTTP %d; stopping (re-run to resume from page %d)."
                  % (resp.status_code, page))
            break
        content = resp.content
        cache_links_page = set(
            m.group(1) for m in CACHE_LINK_RE.finditer(content)
        )
        # Augmented assignment reads the stored set, updates it, and writes it
        # back under the same key, so the shelf sees the change.
        shelf["queue"] |= cache_links_page
        print("%d Cached links in queue." % len(shelf["queue"]))
        delay = random.randint(10, 40)
        print("Waiting for %d seconds..." % delay)
        time.sleep(delay)
def reformat_queue():
    """Return the queued links as a sorted list of fetchable URLs.

    The entries of ``shelf["queue"]`` are sorted as stored; any entry that
    starts with ``//`` then gets an ``http:`` prefix, and all other entries
    are passed through unchanged.
    """
    urls = []
    for link in sorted(shelf["queue"]):
        if link.startswith("//"):
            link = "http:%s" % link
        urls.append(link)
    return urls
def main():
    """Command-line entry point: parse arguments and run the chosen action.

    Actions (``-a``):
        get-links   crawl Google for ``site:<-s value>`` and queue cache links
        list-queue  print the queued URLs, one per line
        run-wget    feed the queued URLs, shuffled, to ``wget`` on stdin

    The action and its required options are validated before the project
    shelf is opened, so a mistyped command line does not create or touch the
    project file. ``ap.error`` prints the message and exits.
    """
    global shelf
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", dest="project", help="project file (will be used to save state)", required=True)
    ap.add_argument("-a", dest="action", help="action (get-links, list-queue, run-wget)", required=True)
    ap.add_argument("-s", dest="site", help="site (for get-links)")
    args = ap.parse_args()

    # Validate up front, before shelve.open() gets a chance to create the file.
    if args.action not in ("get-links", "list-queue", "run-wget"):
        ap.error("Unknown action %s" % args.action)
    if args.action == "get-links" and not args.site:
        ap.error("-s is required for get-links (should be domain)")

    # closing() ensures the shelf is closed even if an action raises.
    with contextlib.closing(shelve.open(args.project)) as shelf:
        if "queue" not in shelf:
            shelf["queue"] = set()
        if args.action == "get-links":
            get_google_cache_links("site:%s" % args.site)
        elif args.action == "list-queue":
            print("\n".join(reformat_queue()))
        else:  # run-wget
            # "-i -" makes wget read the URL list from stdin; "-w 30" waits
            # 30 seconds between retrievals.
            proc = subprocess.Popen(
                ["wget", "-i", "-", "-U", USER_AGENT, "-w", "30"],
                stdin=subprocess.PIPE,
            )
            q = reformat_queue()
            random.shuffle(q)
            proc.stdin.write("\n".join(q))
            proc.stdin.close()
            proc.wait()
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment