Skip to content

Instantly share code, notes, and snippets.

@MichaelCurrie
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MichaelCurrie/ad61f6fe9c09ba999bb5 to your computer and use it in GitHub Desktop.
Lennon vs McCartney Screen-Scraping Utility
# -*- coding: utf-8 -*-
"""
Scrape ranker.com's crowd-ranked list of Beatles songs and write the
results to 'beatles rankings.csv' (columns: Rank, Song), sorted by rank.

Created on Sun Jun 22 22:26:53 2014
@author: @MichaelCurrie
http://michaelcurrie.com/blog/2014/7/7/john-lennon-vs-paul-mccartney-a-quantitative-analysis
"""
import csv
import operator
import urllib.request  # 'import urllib' alone does not expose urllib.request in Python 3

# On a tip from a StackOverflow question, I am using Leonard Richardson's
# free HTML scraping utility, BeautifulSoup.
# http://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup

# I found this from a Google search for "greatest beatles songs ranked"
BASE_URL = ('http://www.ranker.com/crowdranked-list/best-beatles-songs'
            '?var=2&utm_expid=16418821-27.IWS5qTkdT1y0VVg72wOjSw.1&page=')
OUTPUT_FILENAME = 'beatles rankings.csv'


def _fetch_page_html(page):
    """Download one page of the ranking list and return it as a str."""
    with urllib.request.urlopen(BASE_URL + str(page)) as response:
        html = response.read().decode('utf-8')
    # The page has a bunch of unaccompanied </br> tags in the middle of
    # the song list; remove them, or BeautifulSoup cuts the song list off
    # in the middle of the first entry when it encounters one.
    return html.replace('</br>', '')


def scrape_song_ranks():
    """Scrape all pages and return a dict mapping song name -> rank.

    For songs listed twice, only the first (best) rank is kept.
    """
    song_rank_dictionary = {}
    # The list of all Beatles songs is spread across four pages; the URL
    # for each page just has page=1, page=2, etc. appended.
    for page in range(1, 5):
        # Pin the stdlib parser so bs4 doesn't pick one nondeterministically.
        soup = BeautifulSoup(_fetch_page_html(page), 'html.parser')
        # Looking through the page source (F12 in Chrome), the songs are
        # all contained in the ordered list ('ol' tag) marked 'mainList'.
        # NOTE(review): attrs='mainList' filters on the CSS *class*; the
        # original inspection notes say id='mainList' -- confirm which
        # attribute the page actually uses before changing this lookup.
        song_list = soup.find_all('ol', attrs='mainList')[0]
        for song in song_list.children:
            # Skip whitespace-only text nodes between the real entries.
            if len(str(song)) < 2:
                continue
            # Real song entries carry ids like n_64717298, n_64248301...
            # anything else is navigation chrome we skip.
            id_code = str(song['id']).strip()
            if not id_code.startswith('n_'):
                continue
            rank_element = song.find_all(
                'span', attrs={'class': 'rank tableAlign'})[0]
            rank = int(rank_element.string)
            song_name = song.find_all(
                'span', attrs={'class': 'oNode'})[0].string
            # Add the song only if it's not a duplicate, e.g. 'And I
            # Love Her' appears twice (at ranks 60 and 91).
            if song_name not in song_rank_dictionary:
                song_rank_dictionary[song_name] = rank
    return song_rank_dictionary


def write_rankings_csv(song_rank_dictionary, filename=OUTPUT_FILENAME):
    """Write (rank, song) rows, ordered by rank, to a CSV file."""
    # A dict is inherently orderless, so materialize an ordered list of
    # its items, sorted by rank -- the dict's *values*, hence itemgetter(1).
    sorted_song_list = sorted(song_rank_dictionary.items(),
                              key=operator.itemgetter(1))
    # newline='' is the documented way to stop the csv module's '\r\n'
    # terminator being expanded to '\r\r\n', which Excel would display
    # as a blank row between all our rows; the explicit '\n'
    # lineterminator preserves the original file's line endings.
    with open(filename, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, lineterminator='\n')
        # Write our column headers
        writer.writerow(['Rank', 'Song'])
        for song, rank in sorted_song_list:
            # Suppress unrecognizable characters by forcing a conversion
            # to ASCII, ignoring anything unencodable; otherwise the CSV
            # writer could raise UnicodeEncodeError and halt the program.
            song = str(song).encode('ascii', 'ignore').decode('ascii')
            rank = str(rank).encode('ascii', 'ignore').decode('ascii')
            # Write the dictionary entry as a row
            writer.writerow([rank, song])


if __name__ == '__main__':
    write_rankings_csv(scrape_song_ranks())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment