Skip to content

Instantly share code, notes, and snippets.

Last active November 10, 2017 11:03
Show Gist options
  • Save Gabrock94/66e6717eb583a55555c7597a58f558fa to your computer and use it in GitHub Desktop.
Save Gabrock94/66e6717eb583a55555c7597a58f558fa to your computer and use it in GitHub Desktop.
Script that downloads votes and tags from AWWWARDS
# -*- coding: utf-8 -*-
"""
AWWWARDS Web Scraper
Created on Thu Nov 9 14:32:36 2017
@author: Giulio Gabrieli
"""
# #
# Libraries #
# #
import os #used for reading files and directories
import requests #used to get web pages
from bs4 import BeautifulSoup #used to do scraping on web pages
import re #for regular expressions
import numpy as np
import pickle
# #
# #
# Here you can change the parameters used in this script
""" PATHS """
# Directory that contains this script (symlinks resolved by realpath).
basepath = os.path.dirname(os.path.realpath(__file__))

# Base URLs used by the scraper. They are empty strings here and have to be
# filled in before the script can fetch anything.
winnersURL = ""      # base url of winners' pages
nominationURL = ""   # base url of nominations' pages
siteURL = ""         # base url of websites

# Number of listing pages to scrape for each category.
numberOfPages = 30
# #
# MAIN #
# #
# This section handles our MAIN process
# Vote categories, in the order they are stored under websiteData["Votes"].
VOTE_CATEGORIES = ("Design", "Usability", "Creativity", "Content")

# Regular expressions that isolate the author name (capture group 1) from the
# raw text of the author element. The nominee variant has an extra "." after
# the group, so it drops the last character before the line break.
NOMINEE_AUTHOR_PATTERN = r'.*?\n(.*).\n.*'
WINNER_AUTHOR_PATTERN = r'.*?\n(.*)\n.*'


def summarize_votes(votes):
    """Return the mean, standard deviation and raw list for one category.

    votes -- list of scores as scraped; every item must be accepted by
             float(), otherwise ValueError/TypeError propagates.

    The statistics are computed from the list that is passed in, so every
    category gets its own standard deviation. For an empty list both
    statistics are NaN, which is what np.mean/np.std yield for empty input,
    but without triggering their RuntimeWarning.
    """
    scores = [float(x) for x in votes]
    if not scores:
        return {"mean": np.nan, "std": np.nan, "votes": votes}
    return {"mean": np.mean(scores), "std": np.std(scores), "votes": votes}


def build_vote_summary(scores):
    """Build the "Votes" dictionary from a {category: list of scores} mapping.

    Raises KeyError when one of VOTE_CATEGORIES is missing from scores.
    """
    return {category: summarize_votes(scores[category])
            for category in VOTE_CATEGORIES}


def extract_author(text, pattern):
    """Return capture group 1 of pattern in text.

    When the pattern does not match, the whitespace-stripped text is returned
    instead of failing on a missing match object.
    """
    match = re.search(pattern, text)
    if match is None:
        return text.strip()
    return match.group(1)


def extract_scores(vote):
    """Return the scores of one voter list as a {category: score} mapping.

    NOTE(review): the statements that read the individual scores out of a
    voter list are not present in the reviewed source, so this hook returns
    no scores and every category stays empty. Restore the original
    extraction here, using the keys of VOTE_CATEGORIES.
    """
    return {}


def collect_votes(soup):
    """Gather the scores of all voter lists on a site page, per category."""
    scores = {category: [] for category in VOTE_CATEGORIES}
    # TODO: Currently it takes up to 20 voters. Need a fix to load all the votes
    for vote in soup.find_all('ul', {'class': 'list-circle-notes js-circle-notes'}):
        for category, score in extract_scores(vote).items():
            scores[category].append(score)
    return scores


def collect_site_slugs(listing_url, pages):
    """Return the set of site identifiers linked from the listing pages.

    listing_url -- base url; the page number (1..pages) is appended to it
    pages       -- number of listing pages to visit

    Only the first link of each list item is considered, and only when it
    contains "/sites/"; that marker is removed from the stored value.
    Collecting into a set eliminates duplicates.
    """
    slugs = set()
    for page_number in range(1, pages + 1):
        print("Page number: ", page_number)  # print the current page number
        page = requests.get(listing_url + str(page_number))
        soup = BeautifulSoup(page.text, 'html.parser')
        for ultag in soup.find_all('ul', {'class': 'list-items list-flex'}):
            for litag in ultag.find_all('li'):
                links = litag.find_all("a")
                if not links:
                    continue  # list item without any link
                href = links[0].get("href") or ""  # anchor may lack an href
                if "/sites/" in href:
                    slugs.add(href.replace("/sites/", ""))
    return slugs


def nominee_author_text(soup):
    """Return the raw author text of a nominee page (first <p class="">)."""
    return soup.find_all('p', {'class': ''})[0].get_text()


def winner_author_text(soup):
    """Return the raw author text of a winner page (<div class="by"> <strong>)."""
    return soup.find_all('div', {'class': 'by'})[0].find_all('strong')[0].get_text()


def scrape_site(page_url, heading_class, author_text, author_pattern):
    """Download one site page and return its data as a dictionary.

    page_url       -- full url of the site page
    heading_class  -- class of the <h1> element holding the site name
    author_text    -- callable taking the parsed page and returning the raw
                      author text
    author_pattern -- regular expression passed to extract_author()

    The result has the keys "Name", "Author", "Link", "Tag" and "Votes".
    Raises IndexError when one of the expected elements is not on the page.
    """
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    tag_items = soup.find_all("div", {'class': 'list-tags'})[0].find_all('li')
    return {
        "Name": soup.find_all('h1', {'class': heading_class})[0].get_text(),
        "Author": extract_author(author_text(soup), author_pattern),
        "Link": soup.find_all('a', {'class': 'item-link'})[0].get("href"),
        # the "sites" entry of the tag list is skipped
        "Tag": [li.get_text() for li in tag_items if li.get_text() != "\nsites\n"],
        "Votes": build_vote_summary(collect_votes(soup)),
    }


if __name__ == "__main__":
    # NOMINATIONS: list the nominated sites, parse each one, store the result
    nominationsWebsites = collect_site_slugs(nominationURL, numberOfPages)
    nominationDatabase = [
        scrape_site(siteURL + website, 'heading-medium',
                    nominee_author_text, NOMINEE_AUTHOR_PATTERN)
        for website in nominationsWebsites
    ]
    with open(os.path.join(basepath, 'nominationWebsites.pkl'), 'wb') as f:
        pickle.dump(nominationDatabase, f)

    # WINNERS: same steps with the winners' listing and page selectors
    winnersWebsites = collect_site_slugs(winnersURL, numberOfPages)
    winnersDatabase = [
        scrape_site(siteURL + website, 'heading-large',
                    winner_author_text, WINNER_AUTHOR_PATTERN)
        for website in winnersWebsites
    ]
    with open(os.path.join(basepath, 'winnersDatabase.pkl'), 'wb') as f:
        pickle.dump(winnersDatabase, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment