Last active
August 29, 2015 14:03
-
-
Save MichaelCurrie/ad61f6fe9c09ba999bb5 to your computer and use it in GitHub Desktop.
Lennon vs McCartney Screen-Scraping Utility
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 22 22:26:53 2014

Scrape a website to find content on Beatles Rankings

@author: @MichaelCurrie

http://michaelcurrie.com/blog/2014/7/7/john-lennon-vs-paul-mccartney-a-quantitative-analysis
"""
import csv
import operator
import urllib  # for URL loading
# urlopen lives in the urllib.request submodule, which a bare
# ``import urllib`` does not load on Python 3.
import urllib.request

# On a tip from a StackOverflow question, I am using Leonard Richardson's
# free HTML scraping utility, BeautifulSoup.
# http://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup
# I found this from a Google search for "greatest beatles songs ranked".
# The page number is appended to this prefix to address each results page.
# Adjacent string literals inside parentheses are joined at compile time,
# which avoids a fragile backslash line continuation.
url = (
    'http://www.ranker.com/crowdranked-list/best-beatles-songs'
    '?var=2&utm_expid=16418821-27.IWS5qTkdT1y0VVg72wOjSw.1&page='
)

# Maps song name -> rank; filled in while the result pages are scraped.
song_rank_dictionary = {}
# The list of all Beatles songs is spread over four pages.  The URL for
# each page just has page=1, page=2, etc. appended to the end of the string.
for page in range(1, 5):
    # Use the response as a context manager so the HTTP connection is
    # released as soon as the body has been read, even if reading fails.
    with urllib.request.urlopen(url + str(page)) as response:
        html = response.read()
    hs = html.decode('utf-8')

    # The file has a bunch of unaccompanied </br> tags in the middle of the
    # song list, so we have to remove these or Beautiful Soup will cut off
    # the song list in the middle of the first entry, when it encounters
    # a </br> tag.
    hs = hs.replace('</br>', '')

    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the resulting tree
    # could differ from machine to machine.
    soup = BeautifulSoup(hs, 'html.parser')

    # I looked through the source code by hitting F12 in Chrome and I
    # found that the songs are all contained in the Ordered List
    # (i.e. 'ol' tag) identified as 'mainList'.
    # NOTE(review): a bare string passed as ``attrs`` is matched against the
    # CSS class, not the id attribute -- confirm the <ol> carries
    # class="mainList" and not only id="mainList".
    song_list = soup.find_all('ol', attrs='mainList')[0]

    # Traverse the list of songs.  Only direct child *tags* are visited:
    # the whitespace/text nodes between entries cannot be indexed by
    # attribute name and would raise a TypeError.
    for song in song_list.find_all(True, recursive=False):
        # If the entry is anything other than having an id of n_64717298,
        # n_64248301, etc., go to the next item.  ``get`` tolerates tags
        # that have no id attribute at all.
        id_code = str(song.get('id', '')).strip()
        if not id_code.startswith('n_'):
            continue

        rank_element = song.find_all(
            'span', attrs={'class': 'rank tableAlign'})[0]
        rank = int(rank_element.string)

        song_name_element = song.find_all(
            'span', attrs={'class': 'oNode'})[0]
        song_name = song_name_element.string

        # Add the song only if it's not a duplicate, i.e. avoid the case of
        # 'And I Love Her', which appears twice (at ranks 60 and 91).
        # setdefault keeps the first (best) rank that was recorded.
        song_rank_dictionary.setdefault(song_name, rank)
# Using the approach from http://stackoverflow.com/questions/613183/
# we build a list of (song, rank) pairs from the dictionary, ordered by
# rank.  The rank is the dict's value, not its key, so the sort key is
# itemgetter(1).  sorted() accepts the items view directly, so no iter()
# wrapper is needed.
sorted_song_list = sorted(song_rank_dictionary.items(),
                          key=operator.itemgetter(1))
# Write the results to a CSV file.
# newline='' hands control of line endings to the csv module, as its
# documentation requires; otherwise the text layer may translate them again
# on some platforms.  Every value written below is pure ASCII, so the
# encoding is pinned instead of depending on the locale default.
with open('beatles rankings.csv', 'w', newline='',
          encoding='ascii') as csv_file:
    # Terminate rows with a bare '\n' rather than the writer's default
    # '\r\n', so Excel does not show a blank row between all our rows.
    writer = csv.writer(csv_file, lineterminator='\n')

    # Write our column headers
    writer.writerow(['Rank', 'Song'])

    for song, rank in sorted_song_list:
        # Suppress unrecognizable characters by forcing a conversion to
        # ASCII with the instruction to ignore any non-ASCII characters,
        # so the output stays plain ASCII and writing can never fail with
        # a UnicodeEncodeError.
        ascii_song = str(song).encode('ascii', 'ignore').decode('ascii')

        # The rank is an integer, so its text form is already ASCII.
        # Write the dictionary entry as a row.
        writer.writerow([str(rank), ascii_song])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment