Skip to content

Instantly share code, notes, and snippets.

@nrathnam
Last active August 25, 2016 20:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nrathnam/708d04ac31ef714f19ef412e1a0cc277 to your computer and use it in GitHub Desktop.
Save nrathnam/708d04ac31ef714f19ef412e1a0cc277 to your computer and use it in GitHub Desktop.
#Python code for scraping Economist website and fetching Business school ranking data
#using Beautiful Soup
import re
import urllib2

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape the Economist full-time MBA ranking pages and assemble the school
# name, overall rank and location of every listed school into a DataFrame
# (school_rank_location), one row per school.

# Number of ranking pages requested from the site.
PAGE_COUNT = 10
BASE_URL = "http://www.economist.com/whichmba/full-time-mba-ranking"

# Exact class attribute values of the table cells holding each field.
RANK_CELL_CLASS = "views-field views-field-field-wmba-school-rank-overall-value active"
SCHOOL_CELL_CLASS = "views-field views-field-field-wmba-school-name-alpha-value"
LOCATION_CELL_CLASS = "views-field views-field-name"


def _cell_text(tag):
    """Return the whitespace-stripped, UTF-8 encoded text inside *tag*."""
    return tag.get_text().strip().encode('utf-8')


opener = urllib2.build_opener()
# Send a browser-like User-agent header with every request.
opener.addheaders = [('User-agent', 'Chrome/52.0')]

# Extract the first level of information about the university B'schools,
# ranks and their locations: download and parse every ranking page.
university = []
for page_index in range(PAGE_COUNT):
    # The first page is requested without a query string; the remaining
    # pages are addressed with ?page=N.
    if page_index == 0:
        url = BASE_URL
    else:
        url = BASE_URL + "?page=" + str(page_index)
    response = opener.open(url)
    try:
        page = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed.
    university.append(BeautifulSoup(page, "html.parser"))

# Extract the details regarding the university Business school name, its rank
# and location. First extract all the tags, one list of cells per page.
univRankList = [soup.find_all('td', {'class': RANK_CELL_CLASS})
                for soup in university]
univSchoolList = [soup.find_all('td', {'class': SCHOOL_CELL_CLASS})
                  for soup in university]
univLocationList = [soup.find_all('td', {'class': LOCATION_CELL_CLASS})
                    for soup in university]

# Once the tags are extracted, fetch the value contained within those tags.
# The nested comprehensions flatten the per-page lists into one simple list.
univ_school = [_cell_text(tag) for cells in univSchoolList for tag in cells]
univ_rank = [_cell_text(tag) for cells in univRankList for tag in cells]
univ_location = [_cell_text(tag) for cells in univLocationList for tag in cells]

# Collect the link of every school cell. Iterate over the cells actually
# found instead of assuming a fixed number of rows per page, and record None
# for a cell without an <a> element so the list stays aligned with
# univ_school.
univ_school_url = []
for cells in univSchoolList:
    for tag in cells:
        link = tag.a
        univ_school_url.append(link.get('href') if link is not None else None)

# Concatenate the data into a single dataframe. The frame is built with one
# row per field and then transposed so that each school becomes a row.
school_rank_location = pd.DataFrame(
    [univ_school, univ_rank, univ_location],
    index=['School Name', 'Rank', 'Location'],
)
school_rank_location = school_rank_location.T
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment