Last active
December 3, 2022 14:29
-
-
Save risico/e08e1d9c0228928c6fd0416d213fcb3f to your computer and use it in GitHub Desktop.
Alexa top sites crawler in Python (requests + beautifulsoup4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import time | |
import sys | |
import json | |
from bs4 import BeautifulSoup | |
# Pretend to be a desktop Firefox so the site serves the normal markup.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
}

BASE_URL = "http://www.alexa.com"

# Seconds to sleep after finishing each country, so the target server
# doesn't get mad at us.
WAIT_TIME = 5

COUNTRIES_PAGE_URL = "http://www.alexa.com/topsites/countries"

# Roughly 25 results per page, so 4 pages cover the top ~100 sites.
PAGES_PER_COUNTRY = 4
# Crawl the Alexa per-country top-sites listings and write one JSON file
# per country (e.g. "AT.json" containing a list of site names).

# Get the initial page where the countries are listed.
countries_page = requests.get(COUNTRIES_PAGE_URL, headers=HEADERS, timeout=30)
if countries_page.status_code != 200:
    sys.exit("Ooopsie, page returned %s" % countries_page.status_code)

soup = BeautifulSoup(countries_page.text, "html.parser")

# Go through every country link on the listing page.
for link in soup.select(".countries li a"):
    # The href looks like "topsites/countries/AT"; we only need the trailing
    # country code because we build the paginated URLs ourselves.
    country_code = link.get('href').split('/')[-1]
    print("Getting country: {}".format(country_code))

    # Collected site names for this country; a plain list is enough since
    # we keep nothing but the (lower-cased) site address.
    country_top_sites = []

    # Fetch pages 0 .. PAGES_PER_COUNTRY-1 for this country.
    for current_page_number in range(PAGES_PER_COUNTRY):
        # Example URL: http://www.alexa.com/topsites/countries;2/AF
        country_page_url = "{}/topsites/countries;{}/{}".format(
            BASE_URL, current_page_number, country_code)
        print(country_page_url)

        # Check the response status instead of blindly parsing whatever
        # came back; a timeout keeps one dead connection from hanging
        # the whole crawl.
        country_page = requests.get(country_page_url, headers=HEADERS, timeout=30)
        if country_page.status_code != 200:
            print("Skipping {} (HTTP {})".format(
                country_page_url, country_page.status_code))
            continue
        soup = BeautifulSoup(country_page.text, 'html.parser')

        # Each listed site is an anchor inside the description block.
        for _country in soup.select("div.listings li.site-listing .desc-container .desc-paragraph a"):
            country_top_sites.append(_country.text.lower())

    # After all pages, dump the list as JSON to its own per-country file,
    # to be imported into the database later (one could also import it
    # directly from here). Plain 'w' is enough: we never read it back.
    with open('{}.json'.format(country_code), 'w') as outfile:
        json.dump(country_top_sites, outfile)

    # Don't make the target server mad at us — sleep between countries.
    time.sleep(WAIT_TIME)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment