Created
March 7, 2017 15:57
-
-
Save graham-thomson/e9bf65ff17d214b144f91680cb81d438 to your computer and use it in GitHub Desktop.
Function to get top Alexa.com domains by category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from urlparse import urlparse | |
def get_top_alexa_sites(category):
    """Scrape alexa.com for the top sites in one browse category.

    Fetches http://www.alexa.com/topsites/category/Top/<Category> and
    collects the link text of each div whose class is exactly
    ``td DescriptionCell``.

    Args:
        category: Category name, matched case-insensitively against
            Alexa's browse categories (e.g. "news", "Kids and Teens").

    Returns:
        A list of lower-cased site URLs; bare hostnames are prefixed
        with "http://".

    Raises:
        ValueError: If `category` is not a recognised Alexa category.

    NOTE(review): this code imports `urlparse` at module level, which is
    Python 2 only — on Python 3 the module import must become
    `from urllib.parse import urlparse`.
    """
    categories = ['Adult', 'Arts', 'Business', 'Computers', 'Games', 'Health',
                  'Home', 'Kids and Teens', 'News', 'Recreation', 'Reference',
                  'Regional', 'Science', 'Shopping', 'Society', 'Sports',
                  'World']
    # Case-insensitive lookup to the canonical spelling. The original
    # `category.title()` could never produce 'Kids and Teens' (title-casing
    # yields 'Kids And Teens'), so that category was unreachable.
    canonical = {c.lower(): c for c in categories}
    try:
        category = canonical[category.lower()]
    except KeyError:
        # Raise instead of assert: asserts are stripped under `python -O`.
        raise ValueError("Category {} not in category list: {}".format(
            category.title(), ", ".join(categories)))

    alexa_url = """http://www.alexa.com/topsites/category/Top/{}""".format(category)
    soup = BeautifulSoup(requests.get(alexa_url).content, "html.parser")

    top_sites = []
    # Iterate the divs directly; .get("class") returns None for class-less
    # divs, replacing the original try/except KeyError control flow.
    for div in soup.find_all("div"):
        if div.get("class") == [u'td', u'DescriptionCell']:
            div_links = div.find_all('a')
            if div_links:  # guard: a description cell with no anchor
                top_sites += div_links[0].contents

    def format_url(url):
        """Lower-case `url`, prefixing "http://" when it is a bare host."""
        parsed_url = urlparse(url)
        if len(parsed_url.scheme) > 0 and len(parsed_url.netloc) > 0:
            return str(url.lower())
        elif len(parsed_url.scheme) == 0 and len(parsed_url.netloc) == 0 and len(parsed_url.path) > 0:
            return "http://{}".format(parsed_url.path.lower())
        else:
            return str(url.lower())

    return [format_url(ts_url) for ts_url in top_sites]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment