@CharlesCai930
Forked from rharter/google.py
Created April 16, 2014 18:43
#!/usr/local/bin/python
import sys
import logging
from collections import deque
import requests
from lxml import html
from urlparse import urljoin  # Python 2; on Python 3 this is urllib.parse.urljoin
visited_links = set()  # pages already crawled; a set keeps membership tests O(1)

def get_links(url):
    '''Gets all the links on a page. Along the way, scans the page text for
    goo.gl short links and logs them, with a bell character so a tail -f beeps.'''
    page = requests.get(url, verify=False)  # verify=False skips TLS certificate checks
    try:
        tree = html.fromstring(page.text)
    except Exception:
        logging.error("Couldn't parse the HTML at %s", url)
        return []  # always return a list so the caller can iterate safely
    for spec in tree.xpath("//text()[contains(.,'goo.gl')]"):
        logging.info('\afound shortlink: ' + spec)
    return tree.xpath('//a/@href')
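
# A minimal sketch of what the two XPath queries above return; the sample
# HTML is invented for illustration:
#
#   doc = html.fromstring('<p>See <a href="http://goo.gl/x1">http://goo.gl/x1</a></p>')
#   doc.xpath("//text()[contains(.,'goo.gl')]")  # -> [u'http://goo.gl/x1']
#   doc.xpath('//a/@href')                       # -> ['http://goo.gl/x1']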
def crawl(seed):
    '''Crawls the seed page, then every page linked from it, breadth-first.'''
    queue = deque([seed])  # popleft() gives FIFO order, i.e. a breadth-first crawl
    while queue:
        link = queue.popleft()
        if link.startswith('/'):
            link = urljoin(seed, link)  # resolve site-relative paths against the seed
        logging.debug("Crawling " + link)
        if link in visited_links:
            continue
        visited_links.add(link)
        for l in get_links(link):
            # Parentheses matter here: without them the trailing 'or' would
            # re-queue already-visited relative links.
            if l and l not in visited_links and (l.startswith('https://developers.google.com') or l.startswith('/')):
                queue.append(l)
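
# The prefix filter above is this script's own scope check. A stricter
# same-host check (an alternative sketch, not what this gist does) could
# compare hostnames instead:
#
#   from urlparse import urlparse
#   urlparse(l).netloc in ('', 'developers.google.com')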
# Read the seed URL from the command line; default to the Google developers site
start = 'https://developers.google.com'
if len(sys.argv) > 1:
    start = sys.argv[1]

# Set up logging: write to the file named by the second argument, or to stderr if omitted
log_file = sys.argv[2] if len(sys.argv) > 2 else None
logging.basicConfig(filename=log_file, level=logging.DEBUG)
logging.getLogger('requests').setLevel(logging.WARNING)  # quiet per-request noise
# Known-working example seed:
# crawl("https://developers.google.com/analytics/community/index?home")
crawl(start)
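
A minimal usage sketch, assuming the script is saved as google.py (the filename is an assumption, taken from the rharter/google.py gist this was forked from):

    python google.py https://developers.google.com crawl.log
    tail -f crawl.log

The first argument is the seed URL (optional; it defaults to https://developers.google.com) and the second is the log file. The \a bell character in the shortlink log line makes the tail beep whenever a goo.gl link is found.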