Alexa top sites crawler in Python (requests + beautifulsoup4)

import requests
import time
import sys
import json
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
}

BASE_URL = "http://www.alexa.com"

# To avoid making the target server mad at us, we'll sleep a bit after
# each country we crawl; the amount is in seconds
WAIT_TIME = 5
COUNTRIES_PAGE_URL = "http://www.alexa.com/topsites/countries"

# There are about 25 results per page; let's get the top 100, so we only need to crawl 4 pages
PAGES_PER_COUNTRY = 4
# get the initial page where the countries are listed
countries_page = requests.get(COUNTRIES_PAGE_URL, headers=HEADERS)
if countries_page.status_code != 200:
    sys.exit("Oopsie, page returned %s" % countries_page.status_code)

soup = BeautifulSoup(countries_page.text, "html.parser")
# get all country listings from the page and go through them
for link in soup.select(".countries li a"):
    # because we will build our own country page URL (with the page number
    # added), we just need the country code from the href,
    # e.g. topsites/countries/AT -> split at / and take the last element
    country_code = link.get('href').split('/')[-1]
    print("Getting country: {}".format(country_code))

    # we will store the top sites here; a list will do, since we don't need
    # any info other than the address
    country_top_sites = []

    # go through the pages, from zero up to PAGES_PER_COUNTRY
    # (I know, shitty naming)
    for current_page_number in range(PAGES_PER_COUNTRY):
        # example URL: http://www.alexa.com/topsites/countries;2/AF
        country_page_url = "{}/topsites/countries;{}/{}".format(BASE_URL, current_page_number, country_code)
        print(country_page_url)

        # could pull the request out of BeautifulSoup and check the status
        # code, but eh (see the sketch after the script)
        soup = BeautifulSoup(requests.get(country_page_url, headers=HEADERS).text, 'html.parser')

        # go through all of the site listings and add them to our list
        for _country in soup.select("div.listings li.site-listing .desc-container .desc-paragraph a"):
            site = _country.text.lower()
            country_top_sites.append(site)

    # after we've gone through all of the country's pages, write the list as
    # JSON to its own file, to be imported into the database later
    # (one could also import it directly from here; see the sketch after the script)
    with open('{}.json'.format(country_code), 'w+') as outfile:
        json.dump(country_top_sites, outfile)

    # don't make the target server mad at us; sleep, you beauty
    time.sleep(WAIT_TIME)
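
If you do want the status check the comments wave away, a minimal sketch of pulling the request out of BeautifulSoup could look like the helper below; get_soup is a hypothetical name, not part of the original gist.

def get_soup(url):
    # hypothetical helper (not in the original gist): fetch a page and
    # return its parsed soup, or None if the server didn't answer with 200
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print("Skipping {}, got {}".format(url, response.status_code))
        return None
    return BeautifulSoup(response.text, 'html.parser')

The inner loop could then call get_soup(country_page_url) and skip pages that fail, instead of parsing whatever an error response happens to contain.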
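
As for the "import into the database" step the comments mention, here is one hedged sketch of loading the per-country JSON dumps into SQLite; the top_sites.db filename, table name, and schema are all assumptions, since the gist doesn't specify a database.

import glob
import json
import sqlite3

conn = sqlite3.connect('top_sites.db')
# assumed schema: one row per (country, rank, site)
conn.execute("CREATE TABLE IF NOT EXISTS top_sites (country TEXT, rank INTEGER, site TEXT)")
for path in glob.glob('*.json'):
    country = path[:-len('.json')]  # e.g. 'AT.json' -> 'AT'
    with open(path) as f:
        sites = json.load(f)
    # ranks are 1-based, in the order the sites were scraped
    conn.executemany("INSERT INTO top_sites VALUES (?, ?, ?)",
                     [(country, rank + 1, site) for rank, site in enumerate(sites)])
conn.commit()
conn.close()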