A scraper that grabs URLs for the top 500 sites in each Alexa category. Requires the Python packages `dill`, `requests`, `bs4` (BeautifulSoup), and `lxml`.
from collections import defaultdict

import dill
import requests
from bs4 import BeautifulSoup

alexa_categories = defaultdict(list)
BASE_URL = 'http://www.alexa.com/topsites/category'

# Grab the list of top-level categories from the Alexa landing page.
print("Grabbing categories of top sites from %s" % BASE_URL)
resp = requests.get(BASE_URL)
soup = BeautifulSoup(resp.text, 'lxml')
categories = soup.find('div', {'class': 'categories'})

for item in categories.find_all('a'):
    suffix = item.get('href').split('/')[-1]
    category = suffix.lower()
    if category == 'world':  # requires a sub-category
        continue

    # Each category page lists 25 sites; 20 pages cover the top 500.
    # Page 0 has no suffix; pages 1-19 are addressed as ';1', ';2', etc.
    for i in range(20):
        if i == 0:
            num = ''
        else:
            num = ';' + str(i)
        url = BASE_URL + num + '/Top/' + suffix
        print("Requesting %s" % url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, 'lxml')

        # Each listed site sits inside a 'DescriptionCell' div; strip the
        # scheme so only the bare hostname is stored.
        links = soup.find_all('div', {'class': 'DescriptionCell'})
        for link in links:
            site_url = link.a.text.lower()
            if site_url.startswith('http://'):
                site_url = site_url[7:]
            elif site_url.startswith('https://'):
                site_url = site_url[8:]
            alexa_categories[category].append(site_url)

    print("The top 500 sites in Alexa category %s are:" % category)
    for i, site in enumerate(alexa_categories[category]):
        print("%i: %s" % (i, site))

# Serialize the full category -> site-list mapping for later use.
with open('alexa_categories.dill', 'wb') as f:
    dill.dump(alexa_categories, f)
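
Once the script has run, the pickled mapping can be loaded back in a later session. A minimal sketch, assuming `alexa_categories.dill` was written to the current working directory (the `'news'` key below is just an example category name, not a guaranteed entry):

import dill

# Load the category -> site-list mapping written by the scraper above.
with open('alexa_categories.dill', 'rb') as f:
    alexa_categories = dill.load(f)

# Example lookup; 'news' is one of the lowercased Alexa category names.
print(alexa_categories['news'][:5])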