@CharlesCai930
Forked from rharter/google.py
Created April 16, 2014 18:43
#!/usr/local/bin/python
import sys
import logging
from collections import deque
import requests
from lxml import html
from urlparse import urljoin  # Python 2; on Python 3 this is urllib.parse.urljoin
visited_links = set()  # pages already crawled; a set keeps membership tests O(1)

def get_links(url):
    '''Gets all the links on a page. Along the way, scans the page text for
    goo.gl short links and logs them, with a bell character so a tail -f beeps.'''
    page = requests.get(url, verify=False)  # verify=False skips TLS certificate checks
    try:
        tree = html.fromstring(page.text)
    except Exception:
        logging.error("Couldn't parse the HTML at %s", url)
        return []  # always return a list so the caller can iterate safely
    for spec in tree.xpath("//text()[contains(.,'goo.gl')]"):
        logging.info('\afound shortlink: ' + spec)
    return tree.xpath('//a/@href')
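
# A minimal sketch of what the two XPath queries above return; the sample
# HTML is invented for illustration:
#
#   doc = html.fromstring('<p>See <a href="http://goo.gl/x1">http://goo.gl/x1</a></p>')
#   doc.xpath("//text()[contains(.,'goo.gl')]")  # -> [u'http://goo.gl/x1']
#   doc.xpath('//a/@href')                       # -> ['http://goo.gl/x1']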
def crawl(seed):
    '''Crawls the seed page, then every page linked from it, breadth-first.'''
    queue = deque([seed])  # popleft() gives FIFO order, i.e. a breadth-first crawl
    while queue:
        link = queue.popleft()
        if link.startswith('/'):
            link = urljoin(seed, link)  # resolve site-relative paths against the seed
        logging.debug("Crawling " + link)
        if link in visited_links:
            continue
        visited_links.add(link)
        for l in get_links(link):
            # Parentheses matter here: without them the trailing 'or' would
            # re-queue already-visited relative links.
            if l and l not in visited_links and (l.startswith('https://developers.google.com') or l.startswith('/')):
                queue.append(l)
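
# The prefix filter above is this script's own scope check. A stricter
# same-host check (an alternative sketch, not what this gist does) could
# compare hostnames instead:
#
#   from urlparse import urlparse
#   urlparse(l).netloc in ('', 'developers.google.com')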
# Read the seed URL from the command line; default to the Google developers site
start = 'https://developers.google.com'
if len(sys.argv) > 1:
    start = sys.argv[1]

# Set up logging: write to the file named by the second argument, or to stderr if omitted
log_file = sys.argv[2] if len(sys.argv) > 2 else None
logging.basicConfig(filename=log_file, level=logging.DEBUG)
logging.getLogger('requests').setLevel(logging.WARNING)  # quiet per-request noise
# Known-working example seed:
# crawl("https://developers.google.com/analytics/community/index?home")
crawl(start)
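
A minimal usage sketch, assuming the script is saved as google.py (the filename is an assumption, taken from the rharter/google.py gist this was forked from):

    python google.py https://developers.google.com crawl.log
    tail -f crawl.log

The first argument is the seed URL (optional; it defaults to https://developers.google.com) and the second is the log file. The \a bell character in the shortlink log line makes the tail beep whenever a goo.gl link is found.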