Last active
August 25, 2016 20:05
-
-
Save nrathnam/708d04ac31ef714f19ef412e1a0cc277 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Python code for scraping Economist website and fetching Business school ranking data | |
#using Beautiful Soup | |
import itertools
import re
import urllib2

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Build a urllib2 opener with a browser-like User-Agent header so the
# Economist site serves the ranking pages to the script.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Chrome/52.0')]

# Fetch the 10 pages of the full-time MBA ranking and parse each one into
# a BeautifulSoup document.  Page 0 is the bare ranking URL; pages 1-9
# are addressed with the ?page=N query parameter.
university = []
for i in range(10):
    if i == 0:
        url = "http://www.economist.com/whichmba/full-time-mba-ranking"
    else:
        url = "http://www.economist.com/whichmba/full-time-mba-ranking?page=" + str(i)
    response = opener.open(url)
    try:
        page = response.read()
    finally:
        # Close the HTTP response explicitly instead of leaking the socket.
        response.close()
    # Name the parser explicitly; with no argument bs4 picks whichever
    # parser happens to be installed, which can change the parse tree.
    university.append(BeautifulSoup(page, 'html.parser'))
# Extract the <td> cells holding each school's overall rank, name and
# location from every parsed page.  One sub-list is appended per page so
# the page grouping is preserved for the later text/URL extraction.
univRankList = []
univSchoolList = []
univLocationList = []
# Iterate the parsed pages directly rather than a hard-coded range(10),
# so this stays correct if the number of fetched pages changes.
for soup in university:
    univRankList.append(soup.find_all(
        'td', {'class': "views-field views-field-field-wmba-school-rank-overall-value active"}))
    univSchoolList.append(soup.find_all(
        'td', {'class': "views-field views-field-field-wmba-school-name-alpha-value"}))
    univLocationList.append(soup.find_all(
        'td', {'class': "views-field views-field-name"}))
univ_school = []
univ_rank = []
univ_location = []
univ_school_url = []

# Pull the visible text out of every extracted tag (one sub-list per
# page) and collect the school-profile URL from the <a> inside each
# school-name cell.
for i in range(len(univSchoolList)):
    univ_school.append([tag.get_text().strip().encode('utf-8') for tag in univSchoolList[i]])
    univ_rank.append([tag.get_text().strip().encode('utf-8') for tag in univRankList[i]])
    univ_location.append([tag.get_text().strip().encode('utf-8') for tag in univLocationList[i]])
    # Iterate the actual cells instead of assuming exactly 10 rows per
    # page -- a short final page no longer raises IndexError.
    # NOTE(review): assumes every school cell contains an <a>; a cell
    # without one would raise AttributeError -- confirm against the markup.
    for tag in univSchoolList[i]:
        univ_school_url.append(tag.a.get('href'))

# Flatten the per-page lists of lists into flat lists.  chain.from_iterable
# is linear, unlike sum(lists, []) which re-copies the accumulator on
# every step (quadratic).
univ_school = list(itertools.chain.from_iterable(univ_school))
univ_rank = list(itertools.chain.from_iterable(univ_rank))
univ_location = list(itertools.chain.from_iterable(univ_location))
# Concatenate the three flat lists into a single DataFrame: build it with
# one row per field, then transpose so each output row is one school with
# columns 'School Name', 'Rank' and 'Location'.
school_rank_location = pd.DataFrame(
    [univ_school, univ_rank, univ_location],
    index=['School Name', 'Rank', 'Location'])
school_rank_location = school_rank_location.T
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment