MichaelCurrie/BeatlesWebScrape.py

## BeatlesWebScrape.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 22 22:26:53 2014

Scrape a website to find content on Beatles Rankings

@author: @MichaelCurrie

http://michaelcurrie.com/blog/2014/7/7/john-lennon-vs-paul-mccartney-a-quantitative-analysis

"""
import csv
import operator
import urllib      # for URL loading
import urllib.request  # urlopen() lives in this submodule
# On a tip from a StackOverflow question, I am using Leonard Richardson's
# free HTML scraping utility, BeautifulSoup.
# http://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup

# I found this from a Google search for "greatest beatles songs ranked".
# The page number is appended to this prefix for every request.
url  = 'http://www.ranker.com/crowdranked-list/best-beatles-songs' + \
       '?var=2&utm_expid=16418821-27.IWS5qTkdT1y0VVg72wOjSw.1&page='

# Maps song name -> rank; filled in while walking the pages below.
song_rank_dictionary = {}

# The list of all beatles songs is spread through four pages.  The url
# for each page just has page=1, page=2, etc, at the end of the string
for page in range(1, 5):
  # The with-statement closes the response once the body has been read.
  with urllib.request.urlopen(url + str(page)) as response:
    hs = response.read().decode('utf-8')

  # The file has a bunch of unaccompanied </br> tags in the middle of the
  # song list, so we have to remove these or Beautiful Soup will cut off
  # the song list in the middle of the first entry, when it encounters
  # a </br> tag.
  hs = hs.replace('</br>', '')

  # Name the parser explicitly so the parse does not depend on which
  # optional parser libraries happen to be installed.
  soup = BeautifulSoup(hs, 'html.parser')

  # The songs are all contained in one Ordered List (i.e. 'ol' tag)
  # selected by 'mainList'.
  # NOTE(review): Beautiful Soup treats a bare string passed as attrs as a
  # CSS class filter, so this matches class="mainList", not id="mainList"
  # -- confirm against the page markup.
  song_list = soup.find('ol', attrs='mainList')
  if song_list is None:
    # Same exception type as indexing an empty result list, but with a
    # message that says what was missing.
    raise IndexError('no <ol> with class "mainList" on page %d' % page)

  # Traverse the list of songs.  Only direct child *tags* are visited, so
  # text nodes between the entries are never subscripted.
  for song in song_list.find_all(True, recursive=False):
    # Real entries carry an id such as n_64717298 or n_64248301; skip
    # anything else, including tags that have no id attribute at all.
    id_code = str(song.get('id', '')).strip()
    if not id_code.startswith('n_'):
      continue

    rank_element = song.find_all('span', attrs={'class':'rank tableAlign'})[0]
    rank = int(rank_element.string)

    song_name_element = song.find_all('span', attrs={'class':'oNode'})[0]
    # Convert to a plain str so the dictionary key does not keep this
    # page's whole parse tree alive.
    song_name = str(song_name_element.string)

    # Add the song only if it's not a duplicate
    #   i.e. avoid the case of 'And I Love Her', which appears twice (at ranks
    #        60 and 91); the first rank seen wins.
    song_rank_dictionary.setdefault(song_name, rank)

# Build a list of (song, rank) pairs ordered by rank.  The rank is the
# dict's value, i.e. element 1 of each pair, hence itemgetter(1).
sorted_song_list = sorted(song_rank_dictionary.items(),
                          key=operator.itemgetter(1))

# Write the results to a CSV file.  newline='' stops the text layer from
# translating line endings (the csv module documentation asks for it);
# without it, platforms that write '\r\n' can end up with an extra '\r',
# which Excel shows as a blank row between all our rows.
with open('beatles rankings.csv', 'w', newline='',
          encoding='ascii') as csv_file:
  writer = csv.writer(csv_file, lineterminator='\n')

  # Write our column headers
  writer.writerow(['Rank', 'Song'])

  for song, rank in sorted_song_list:
    # Deliberately drop any non-ASCII characters from the song name so
    # the output is plain ASCII and writing can never raise a
    # UnicodeEncodeError.  The rank is an int and needs no such treatment.
    song = song.encode('ascii', 'ignore').decode('ascii')

    # Write the dictionary entry as a row
    writer.writerow([rank, song])
	# -*- coding: utf-8 -*-
	"""
	Created on Sun Jun 22 22:26:53 2014

	Scrape a website to find content on Beatles Rankings

	@author: @MichaelCurrie

	http://michaelcurrie.com/blog/2014/7/7/john-lennon-vs-paul-mccartney-a-quantitative-analysis

	"""
	import operator
	import urllib # for URL loading
	import csv
	# On a tip from a StackOverflow question, I am using Leonard Richardson's
	# free HTML scraping utility, BeautifulSoup.
	# http://www.crummy.com/software/BeautifulSoup/
	from bs4 import BeautifulSoup

	# I found this from a Google search for "greatest beatles songs ranked".
	# The page number is appended to this prefix for every request.
	url = 'http://www.ranker.com/crowdranked-list/best-beatles-songs' + \
	      '?var=2&utm_expid=16418821-27.IWS5qTkdT1y0VVg72wOjSw.1&page='

	# Maps song name -> rank; filled in while walking the pages below.
	song_rank_dictionary = {}

	# The list of all beatles songs is spread through four pages. The url
	# for each page just has page=1, page=2, etc, at the end of the string
	for page in range(1, 5):
	  # The with-statement closes the response once the body has been read.
	  with urllib.request.urlopen(url + str(page)) as response:
	    hs = response.read().decode('utf-8')

	  # The file has a bunch of unaccompanied </br> tags in the middle of the
	  # song list, so we have to remove these or Beautiful Soup will cut off
	  # the song list in the middle of the first entry, when it encounters
	  # a </br> tag.
	  hs = hs.replace('</br>', '')

	  # Name the parser explicitly so the parse does not depend on which
	  # optional parser libraries happen to be installed.
	  soup = BeautifulSoup(hs, 'html.parser')

	  # The songs are all contained in one Ordered List (i.e. 'ol' tag)
	  # selected by 'mainList'.
	  # NOTE(review): Beautiful Soup treats a bare string passed as attrs as a
	  # CSS class filter, so this matches class="mainList", not id="mainList"
	  # -- confirm against the page markup.
	  song_list = soup.find('ol', attrs='mainList')
	  if song_list is None:
	    # Same exception type as indexing an empty result list, but with a
	    # message that says what was missing.
	    raise IndexError('no <ol> with class "mainList" on page %d' % page)

	  # Traverse the list of songs. Only direct child *tags* are visited, so
	  # text nodes between the entries are never subscripted.
	  for song in song_list.find_all(True, recursive=False):
	    # Real entries carry an id such as n_64717298 or n_64248301; skip
	    # anything else, including tags that have no id attribute at all.
	    id_code = str(song.get('id', '')).strip()
	    if not id_code.startswith('n_'):
	      continue

	    rank_element = song.find_all('span', attrs={'class':'rank tableAlign'})[0]
	    rank = int(rank_element.string)

	    song_name_element = song.find_all('span', attrs={'class':'oNode'})[0]
	    # Convert to a plain str so the dictionary key does not keep this
	    # page's whole parse tree alive.
	    song_name = str(song_name_element.string)

	    # Add the song only if it's not a duplicate
	    # i.e. avoid the case of 'And I Love Her', which appears twice (at ranks
	    # 60 and 91); the first rank seen wins.
	    song_rank_dictionary.setdefault(song_name, rank)

	# Build a list of (song, rank) pairs ordered by rank. The rank is the
	# dict's value, i.e. element 1 of each pair, hence itemgetter(1).
	sorted_song_list = sorted(song_rank_dictionary.items(),
	                          key=operator.itemgetter(1))

	# Write the results to a CSV file. newline='' stops the text layer from
	# translating line endings (the csv module documentation asks for it);
	# without it, platforms that write '\r\n' can end up with an extra '\r',
	# which Excel shows as a blank row between all our rows.
	with open('beatles rankings.csv', 'w', newline='',
	          encoding='ascii') as csv_file:
	  writer = csv.writer(csv_file, lineterminator='\n')

	  # Write our column headers
	  writer.writerow(['Rank', 'Song'])

	  for song, rank in sorted_song_list:
	    # Deliberately drop any non-ASCII characters from the song name so
	    # the output is plain ASCII and writing can never raise a
	    # UnicodeEncodeError. The rank is an int and needs no such treatment.
	    song = song.encode('ascii', 'ignore').decode('ascii')

	    # Write the dictionary entry as a row
	    writer.writerow([rank, song])