Last active
December 3, 2022 14:29
-
-
Save risico/e08e1d9c0228928c6fd0416d213fcb3f to your computer and use it in GitHub Desktop.
Alexa top sites crawler in Python (requests + beautifulsoup4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import time | |
import sys | |
import json | |
from bs4 import BeautifulSoup | |
# Pretend to be a desktop Firefox so the site serves the normal markup.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
}

BASE_URL = "http://www.alexa.com"

# Seconds to sleep after finishing each country, so the target server
# doesn't get mad at us.
WAIT_TIME = 5

COUNTRIES_PAGE_URL = "http://www.alexa.com/topsites/countries"

# Roughly 25 results per page, so 4 pages cover the top ~100 sites.
PAGES_PER_COUNTRY = 4
# Crawl the Alexa per-country top-sites listings and write one JSON file
# per country (e.g. "AT.json" containing a list of site names).

# Get the initial page where the countries are listed.
countries_page = requests.get(COUNTRIES_PAGE_URL, headers=HEADERS, timeout=30)
if countries_page.status_code != 200:
    sys.exit("Ooopsie, page returned %s" % countries_page.status_code)

soup = BeautifulSoup(countries_page.text, "html.parser")

# Go through every country link on the listing page.
for link in soup.select(".countries li a"):
    # The href looks like "topsites/countries/AT"; we only need the trailing
    # country code because we build the paginated URLs ourselves.
    country_code = link.get('href').split('/')[-1]
    print("Getting country: {}".format(country_code))

    # Collected site names for this country; a plain list is enough since
    # we keep nothing but the (lower-cased) site address.
    country_top_sites = []

    # Fetch pages 0 .. PAGES_PER_COUNTRY-1 for this country.
    for current_page_number in range(PAGES_PER_COUNTRY):
        # Example URL: http://www.alexa.com/topsites/countries;2/AF
        country_page_url = "{}/topsites/countries;{}/{}".format(
            BASE_URL, current_page_number, country_code)
        print(country_page_url)

        # Check the response status instead of blindly parsing whatever
        # came back; a timeout keeps one dead connection from hanging
        # the whole crawl.
        country_page = requests.get(country_page_url, headers=HEADERS, timeout=30)
        if country_page.status_code != 200:
            print("Skipping {} (HTTP {})".format(
                country_page_url, country_page.status_code))
            continue
        soup = BeautifulSoup(country_page.text, 'html.parser')

        # Each listed site is an anchor inside the description block.
        for _country in soup.select("div.listings li.site-listing .desc-container .desc-paragraph a"):
            country_top_sites.append(_country.text.lower())

    # After all pages, dump the list as JSON to its own per-country file,
    # to be imported into the database later (one could also import it
    # directly from here). Plain 'w' is enough: we never read it back.
    with open('{}.json'.format(country_code), 'w') as outfile:
        json.dump(country_top_sites, outfile)

    # Don't make the target server mad at us — sleep between countries.
    time.sleep(WAIT_TIME)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment