Skip to content

Instantly share code, notes, and snippets.

@graham-thomson
Created March 7, 2017 15:57
Show Gist options
  • Save graham-thomson/e9bf65ff17d214b144f91680cb81d438 to your computer and use it in GitHub Desktop.
Save graham-thomson/e9bf65ff17d214b144f91680cb81d438 to your computer and use it in GitHub Desktop.
Function to get top Alexa.com domains by category
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
# Top-level Alexa categories accepted by get_top_alexa_sites, in the casing
# used to build the category URL.
_ALEXA_CATEGORIES = (
    'Adult', 'Arts', 'Business', 'Computers', 'Games', 'Health', 'Home',
    'Kids and Teens', 'News', 'Recreation', 'Reference', 'Regional',
    'Science', 'Shopping', 'Society', 'Sports', 'World',
)


class _InvalidCategoryError(ValueError, AssertionError):
    """Raised when the requested category is not a known Alexa category.

    Derives from AssertionError as well as ValueError because this check
    used to be an ``assert`` statement; callers that caught AssertionError
    keep working, while the check is no longer stripped under ``python -O``.
    """


def _format_url(url):
    """Return *url* lower-cased, prefixing ``http://`` when it has no scheme.

    A bare name such as ``Google.com`` is parsed by ``urlparse`` as a path
    with empty scheme and netloc; only that shape gets the prefix. Anything
    else is returned lower-cased and otherwise unchanged.
    """
    parsed_url = urlparse(url)
    if not parsed_url.scheme and not parsed_url.netloc and parsed_url.path:
        return "http://{}".format(parsed_url.path.lower())
    return str(url.lower())


def get_top_alexa_sites(category):
    """Scrape the Alexa "top sites" page for *category* and return site URLs.

    Args:
        category: Name of a top-level Alexa category (see
            ``_ALEXA_CATEGORIES``). Matching is case-insensitive, so
            ``"kids and teens"`` and ``"Kids and Teens"`` are equivalent.

    Returns:
        A list of lower-cased URL strings, one per site found on the page,
        in page order. Entries without a scheme are prefixed with
        ``http://``. The list is empty when the page contains no matching
        cells.

    Raises:
        ValueError: If *category* is not a known category. The exception is
            also an AssertionError for backward compatibility.
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    # Match case-insensitively against the canonical names. str.title()
    # would turn "kids and teens" into "Kids And Teens", which is not a
    # valid category name.
    canonical_by_lower = {name.lower(): name for name in _ALEXA_CATEGORIES}
    canonical = canonical_by_lower.get(category.lower())
    if canonical is None:
        raise _InvalidCategoryError(
            "Category {} not in category list: {}".format(
                category, ", ".join(_ALEXA_CATEGORIES)))

    # NOTE(review): Alexa category paths appear to follow the DMOZ convention
    # of underscores instead of spaces (e.g. Kids_and_Teens) -- confirm.
    alexa_url = "http://www.alexa.com/topsites/category/Top/{}".format(
        canonical.replace(" ", "_"))

    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(alexa_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    top_sites = []
    for div in soup.find_all("div"):
        # Only the description cells hold the site link; divs without a
        # class attribute yield None here and are skipped.
        if div.get("class") != [u'td', u'DescriptionCell']:
            continue
        link = div.find("a")
        if link is None:
            # A description cell without an anchor has nothing to report.
            continue
        # Use the anchor's text rather than its raw children so nested tags
        # inside the link cannot leak non-string nodes into the result.
        site = link.get_text().strip()
        if site:
            top_sites.append(site)

    return [_format_url(site) for site in top_sites]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment