Script that downloads votes and tags from AWWWARDS
# -*- coding: utf-8 -*-
"""
AWWWARDS Web Scraper
Created on Thu Nov 9 14:32:36 2017
@author: Giulio Gabrieli
"""
###############################################################################
# #
# Libraries #
# #
###############################################################################
"""
"""
import os #used for reading files and directories
import requests #used to get web pages
from bs4 import BeautifulSoup #used to parse and scrape web pages
import re #for regular expressions
import numpy as np #used to compute the mean and std of the votes
import pickle #used to save the scraped databases to disk
###############################################################################
# #
# PARAMETERS #
# #
###############################################################################
"""
Here you can change the parameters used in this script
"""
""" PATHS """
basepath = os.path.dirname(os.path.realpath(__file__)) #This gets the basepath of the script
winnersURL = "https://www.awwwards.com/websites/sites_of_the_day/?page=" #base url of winners' pages
nominationURL = "https://www.awwwards.com/websites/nominees/?page=" #base url of nominations' pages
siteURL = "https://www.awwwards.com/sites/" #base url of websites
""" Here you can set the numner of pages to scrap for each category"""
numberOfPages = 30 #number of pages to scrap for each category
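# For example, with these settings page 3 of the nominees is fetched from
# nominationURL + "3", i.e. "https://www.awwwards.com/websites/nominees/?page=3"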
###############################################################################
# #
# MAIN #
# #
###############################################################################
"""
This sections handles our MAIN process
"""
if(__name__ == "__main__"):
    """ NOMINATIONS """
    """ First we create a list of all the nominated websites """
    nominationsWebsites = []
    for numberOfPage in range(1, numberOfPages + 1):
        print("Page number: ", numberOfPage) #print the current page number
        page = requests.get(nominationURL + str(numberOfPage)) #get a page
        soup = BeautifulSoup(page.text, 'html.parser') #parse the HTML of the page
        """ Here we look for the link to each nomination page """
        for ultag in soup.find_all('ul', {'class': 'list-items list-flex'}): #look for the unordered list containing the nominations
            for litag in ultag.find_all('li'): #look for each element of the list
                links = litag.find_all("a") #get the links
                thisLink = links[0].get("href") #get the URL of the link
                if("/sites/" in thisLink):
                    thisLink = thisLink.replace("/sites/", "")
                    nominationsWebsites.append(thisLink)
    nominationsWebsites = set(nominationsWebsites) #eliminate duplicates
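    # e.g. a list item linking to "/sites/some-website" is stored as the slug
    # "some-website"; siteURL + slug then rebuilds the full page URL below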
    nominationDatabase = []
    """ Now we parse each website and get its basic information and the votes """
    for website in nominationsWebsites:
        print(website)
        try:
            pageURL = siteURL + website
            page = requests.get(pageURL) #get a page
            soup = BeautifulSoup(page.text, 'html.parser') #parse the HTML of the page
            websiteData = {} #initialize a dictionary
            websiteData["Name"] = soup.find_all('h1', {'class': 'heading-medium'})[0].get_text() #get the name of the website
            authorParagraph = soup.find_all('p', {'class': ''})[0].get_text() #get the paragraph containing the author of the website
            """ we need to clean it up """
            pattern = r'.*?\n(.*).\n.*'
            match = re.search(pattern, authorParagraph)
            websiteData["Author"] = match.group(1)
websiteData["Link"] = soup.find_all('a', {'class': 'item-link'})[0].get("href") #get the URL of the website
websiteData["Tag"] = [li.get_text() for li in soup.find_all("div",{'class': 'list-tags'})[0].find_all('li') if li.get_text() != "\nsites\n"] #get the tags associated to the website
""" Then we get the users votes """
design = []
usability = []
creativity = []
content = []
#TODO: Currently it takes up to 20 voters. Need a fix to load all the votes
votes = soup.find_all('ul',{'class': 'list-circle-notes js-circle-notes'}) #get the list of Voters
for vote in votes: #for each vote we take the four paramaters and we store it inot a list
design.append(vote.find_all('li',{'class':'design'})[0].get('data-note'))
usability.append(vote.find_all('li',{'class':'usability'})[0].get('data-note'))
creativity.append(vote.find_all('li',{'class':'creativity'})[0].get('data-note'))
content.append(vote.find_all('li',{'class':'content'})[0].get('data-note'))
""" for each parameter, we save the mean, std and list """
websiteData["Votes"] = {}
websiteData["Votes"]["Design"] = {"mean": np.mean([float(x) for x in design]), "std":np.std([float(x) for x in design]),"votes": design}
websiteData["Votes"]["Usability"] = {"mean": np.mean([float(x) for x in usability]), "std":np.std([float(x) for x in design]),"votes": usability}
websiteData["Votes"]["Creativity"] = {"mean": np.mean([float(x) for x in creativity]), "std":np.std([float(x) for x in design]),"votes": creativity}
websiteData["Votes"]["Content"] = {"mean": np.mean([float(x) for x in content]), "std":np.std([float(x) for x in design]),"votes": content}
            nominationDatabase.append(websiteData)
        except Exception: #skip websites whose pages don't match the expected markup
            print("Skipped", website)
    with open(basepath + '/nominationWebsites.pkl', 'wb') as f:
        pickle.dump(nominationDatabase, f)
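    # The winners pages use slightly different markup (heading-large titles and
    # a div.by block for the author), so the parsing below differs accordingly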
""" WINNERS """
""" First we create a List of all the nominations websites """
winnersWebsites = []
for numberOfPage in range(1,numberOfPages+1):
print("Page number: ",numberOfPage) #print the current page number
page = requests.get(winnersURL + str(numberOfPage)) #get a page
soup = BeautifulSoup(page.text, 'html.parser') #get the HTML of the page
""" Here we look for the link to each nomination page"""
for ultag in soup.find_all('ul', {'class': 'list-items list-flex'}): #look for the unordered list containing the nominations
for litag in ultag.find_all('li'): #look for each element of the list
links = litag.find_all("a") #get the links
thisLink = ((links[0].get("href"))) #get the url of the links
if("/sites/" in thisLink):
thisLink = thisLink.replace("/sites/","")
winnersWebsites.append(thisLink)
winnersWebsites = set(winnersWebsites) #eliminate duplicates
    winnersDatabase = []
    """ Now we parse each website and get its basic information and the votes """
    for website in winnersWebsites:
        print(website)
        try:
            pageURL = siteURL + website
            page = requests.get(pageURL) #get a page
            soup = BeautifulSoup(page.text, 'html.parser') #parse the HTML of the page
            websiteData = {} #initialize a dictionary
            websiteData["Name"] = soup.find_all('h1', {'class': 'heading-large'})[0].get_text() #get the name of the website
            authorParagraph = soup.find_all('div', {'class': 'by'})[0].find_all('strong')[0].get_text() #get the author of the website
            """ we need to clean it up """
            pattern = r'.*?\n(.*)\n.*'
            match = re.search(pattern, authorParagraph)
            websiteData["Author"] = match.group(1)
            websiteData["Link"] = soup.find_all('a', {'class': 'item-link'})[0].get("href") #get the URL of the website
            websiteData["Tag"] = [li.get_text() for li in soup.find_all("div", {'class': 'list-tags'})[0].find_all('li') if li.get_text() != "\nsites\n"] #get the tags associated with the website
""" Then we get the users votes """
design = []
usability = []
creativity = []
content = []
#TODO: Currently it takes up to 20 voters. Need a fix to load all the votes
votes = soup.find_all('ul',{'class': 'list-circle-notes js-circle-notes'}) #get the list of Voters
for vote in votes: #for each vote we take the four paramaters and we store it inot a list
design.append(vote.find_all('li',{'class':'design'})[0].get('data-note'))
usability.append(vote.find_all('li',{'class':'usability'})[0].get('data-note'))
creativity.append(vote.find_all('li',{'class':'creativity'})[0].get('data-note'))
content.append(vote.find_all('li',{'class':'content'})[0].get('data-note'))
""" for each parameter, we save the mean, std and list """
websiteData["Votes"] = {}
websiteData["Votes"]["Design"] = {"mean": np.mean([float(x) for x in design]), "std":np.std([float(x) for x in design]),"votes": design}
websiteData["Votes"]["Usability"] = {"mean": np.mean([float(x) for x in usability]), "std":np.std([float(x) for x in design]),"votes": usability}
websiteData["Votes"]["Creativity"] = {"mean": np.mean([float(x) for x in creativity]), "std":np.std([float(x) for x in design]),"votes": creativity}
websiteData["Votes"]["Content"] = {"mean": np.mean([float(x) for x in content]), "std":np.std([float(x) for x in design]),"votes": content}
winnersDatabase.append(websiteData)
        except Exception: #skip websites whose pages don't match the expected markup
            print("Skipped", website)
    with open(basepath + '/winnersDatabase.pkl', 'wb') as f:
        pickle.dump(winnersDatabase, f)
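    """ Quick sanity check: reload the two databases we just wrote and print
    one entry from each (a minimal sketch; the keys match the dictionaries
    built above) """
    with open(basepath + '/nominationWebsites.pkl', 'rb') as f:
        reloadedNominations = pickle.load(f)
    with open(basepath + '/winnersDatabase.pkl', 'rb') as f:
        reloadedWinners = pickle.load(f)
    if(len(reloadedNominations) > 0):
        print(reloadedNominations[0]["Name"], reloadedNominations[0]["Votes"]["Design"]["mean"])
    if(len(reloadedWinners) > 0):
        print(reloadedWinners[0]["Name"], reloadedWinners[0]["Votes"]["Design"]["mean"])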