Last active
August 25, 2016 20:05
-
-
Save nrathnam/708d04ac31ef714f19ef412e1a0cc277 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Python code for scraping Economist website and fetching Business school ranking data | |
#using Beautiful Soup | |
import itertools
import re
import urllib2

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Build a urllib2 opener with a browser-like User-Agent header so the
# Economist site serves the ranking pages to the script.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Chrome/52.0')]

# Fetch the 10 pages of the full-time MBA ranking and parse each one into
# a BeautifulSoup document.  Page 0 is the bare ranking URL; pages 1-9
# are addressed with the ?page=N query parameter.
university = []
for i in range(10):
    if i == 0:
        url = "http://www.economist.com/whichmba/full-time-mba-ranking"
    else:
        url = "http://www.economist.com/whichmba/full-time-mba-ranking?page=" + str(i)
    response = opener.open(url)
    try:
        page = response.read()
    finally:
        # Close the HTTP response explicitly instead of leaking the socket.
        response.close()
    # Name the parser explicitly; with no argument bs4 picks whichever
    # parser happens to be installed, which can change the parse tree.
    university.append(BeautifulSoup(page, 'html.parser'))
# Extract the <td> cells holding each school's overall rank, name and
# location from every parsed page.  One sub-list is appended per page so
# the page grouping is preserved for the later text/URL extraction.
univRankList = []
univSchoolList = []
univLocationList = []
# Iterate the parsed pages directly rather than a hard-coded range(10),
# so this stays correct if the number of fetched pages changes.
for soup in university:
    univRankList.append(soup.find_all(
        'td', {'class': "views-field views-field-field-wmba-school-rank-overall-value active"}))
    univSchoolList.append(soup.find_all(
        'td', {'class': "views-field views-field-field-wmba-school-name-alpha-value"}))
    univLocationList.append(soup.find_all(
        'td', {'class': "views-field views-field-name"}))
univ_school = []
univ_rank = []
univ_location = []
univ_school_url = []

# Pull the visible text out of every extracted tag (one sub-list per
# page) and collect the school-profile URL from the <a> inside each
# school-name cell.
for i in range(len(univSchoolList)):
    univ_school.append([tag.get_text().strip().encode('utf-8') for tag in univSchoolList[i]])
    univ_rank.append([tag.get_text().strip().encode('utf-8') for tag in univRankList[i]])
    univ_location.append([tag.get_text().strip().encode('utf-8') for tag in univLocationList[i]])
    # Iterate the actual cells instead of assuming exactly 10 rows per
    # page -- a short final page no longer raises IndexError.
    # NOTE(review): assumes every school cell contains an <a>; a cell
    # without one would raise AttributeError -- confirm against the markup.
    for tag in univSchoolList[i]:
        univ_school_url.append(tag.a.get('href'))

# Flatten the per-page lists of lists into flat lists.  chain.from_iterable
# is linear, unlike sum(lists, []) which re-copies the accumulator on
# every step (quadratic).
univ_school = list(itertools.chain.from_iterable(univ_school))
univ_rank = list(itertools.chain.from_iterable(univ_rank))
univ_location = list(itertools.chain.from_iterable(univ_location))
# Concatenate the three flat lists into a single DataFrame: build it with
# one row per field, then transpose so each output row is one school with
# columns 'School Name', 'Rank' and 'Location'.
school_rank_location = pd.DataFrame(
    [univ_school, univ_rank, univ_location],
    index=['School Name', 'Rank', 'Location'])
school_rank_location = school_rank_location.T
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment