Skip to content

Instantly share code, notes, and snippets.

@vinovator
Created March 29, 2017 09:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vinovator/18c681a71c356ce42e9473c3d9ed579d to your computer and use it in GitHub Desktop.
Save vinovator/18c681a71c356ce42e9473c3d9ed579d to your computer and use it in GitHub Desktop.
# uk_mba.py
# Python 2.7.6
"""
Extract business schools in UK with AACSB, AMBA and/or EQUIS accreditation only
Scraping from http://find-mba.com/
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as op
# Site root; relative links scraped from the pages are joined onto this.
base_url = "http://find-mba.com"
# First listing page: UK & Ireland schools, accredited only, sorted by
# popularity, 50 results per page.  ("accredition" is the site's own
# query-parameter spelling -- do not "fix" it.)
start_url = "/schools/uk-ireland/uk?keyword=&rank=false&accredition=true&cities=&specs=&sort=popularity&numberperpage=50&page=1#list-school"
# Output workbook file name and worksheet name.
xl_name = "UK Bschools.xlsx"
sheet_name = "info"
# Module-level accumulator: get_basic_info() appends one dict per school
# to this shared list and returns it.
school_list = list()
def get_basic_info(url):
    """
    Fetch basic b-school information from a listing page and from every
    subsequent page reached through the "next page" link.

    url -- absolute URL of the first listing page to scrape.

    Returns the module-level school_list, extended with one dict per
    school: keys "Name", "URL", "Location", "Description" and
    "Programs offered".

    NOTE: the original implementation recursed once per pagination page,
    which risks hitting the recursion limit on long result sets; this
    version walks the pages iteratively with identical results.
    """
    while url:
        print("Fetching basic info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        for school in soup.findAll("div", {"class": "row school-list-item"}):
            school_dict = dict()
            row = school.find("div", {"class": "row"})
            title_div = row.find(
                "div", {"class": "col-xs-11 school-list-title"})
            school_dict["Name"] = title_div("a")[0].get("title").encode("utf-8")
            school_dict["URL"] = title_div("a")[0].get("href")
            detail = school.find("div", {"class": "school-list-details"})
            location_span = detail.find(
                "span", {"class": "school-list-location"})
            school_dict["Location"] = location_span.getText().encode("utf-8")
            program = school.find("div", {"class": ""})
            desc_p = program("div")[4].find("p")
            if desc_p:  # Sometimes the description could be blank
                school_dict["Description"] = desc_p.getText().encode("utf-8")
            else:
                school_dict["Description"] = ""
            offer_p = program("div")[6]
            if offer_p:
                # Text looks like "Programs: MBA, EMBA" -- keep the part
                # after the colon.
                school_dict["Programs offered"] = offer_p.getText().split(
                    ":")[1].strip().encode("utf-8")
            else:
                school_dict["Programs offered"] = ""
            school_list.append(school_dict)
        # Keep navigating to the next page until the last page; the
        # "visible-xs" div holds the next-page anchor when one exists.
        next_page_div = soup.find("div", {"class": "visible-xs"})
        if next_page_div:
            url = base_url + next_page_div("a")[0].get("href")
        else:
            url = None
    return school_list
def get_detailed_info(df):
    """
    Fetch detailed information from the page of each school.

    df -- DataFrame with a "URL" column of absolute school-page URLs.

    Returns a list of dicts, one per URL, containing "URL",
    "Other Details" and one entry per statistic label/value pair scraped
    from the school-details panel (keys come from the page itself).
    Pages whose layout lacks the details panel yield only the "URL" key.
    """
    school_det_list = list()
    for url in df["URL"]:  # df["URL"] is a pandas Series of URLs
        school_det_dict = dict()
        school_det_dict["URL"] = url
        print("Extracting info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        # Extract summary profile information
        full_div = soup.find("div", {"class": "row school-details pattern"})
        if full_div:
            stats_div = full_div.find(
                "div", {"class": "col-xs-6 school-details-stats"})
            for p in stats_div.findAll("p"):
                # Each <p> holds a label span followed by a value span.
                spans = p.findAll("span")
                label = spans[0].getText().strip().replace(
                    "\n", " ").encode("utf-8")
                value = spans[1].getText().strip().replace(
                    "\n", " ").encode("utf-8")
                school_det_dict[label] = value
            other_details_div = full_div.findAll("div", {"class": "col-xs-6"})
            school_det_dict["Other Details"] = \
                other_details_div[0].getText().encode("utf-8")
        school_det_list.append(school_det_dict)
    return school_det_list
def load_to_excel(df):
    """
    Write the school DataFrame to the module-level xl_name workbook,
    sheet_name worksheet, via the openpyxl engine.

    df -- combined school-info DataFrame.

    Writes a copy so the caller's DataFrame index is not mutated
    (the original implementation shifted df.index in place).
    """
    out = df.copy()
    out.index += 1  # So that the excel row numbering starts from 1
    xlw = pd.ExcelWriter(xl_name, engine="openpyxl")
    out.to_excel(xlw, sheet_name=sheet_name, index_label="#",
                 encoding="utf-8")
    xlw.save()
def format_excel(xl, sheet="Sheet1"):
    """
    Open the workbook at path xl and format the given worksheet in
    place: freeze panes, set column widths, colour the header row,
    border/wrap every cell and highlight column-E cells that mention
    "Executive" (schools offering an executive MBA).

    xl    -- path to the Excel file; saved back under the same name.
    sheet -- worksheet name; defaults to "Sheet1".
    """
    wb = op.load_workbook(xl)
    ws = wb.get_sheet_by_name(sheet)
    # Keep the header row and first two columns visible while scrolling.
    ws.freeze_panes = "C2"
    # Column widths tuned for the exported columns.
    for col_letter, width in zip(("A", "B", "C", "D", "E", "F"),
                                 (5, 20, 15, 50, 20, 30)):
        ws.column_dimensions[col_letter].width = width
    # Colour fills for the header row and the highlight rule.
    blue_fill = op.styles.PatternFill(start_color="00aadd",
                                      fill_type='solid')
    green_fill = op.styles.PatternFill(start_color="00ff00",
                                       fill_type='solid')
    # Thin border on all four sides of every cell.
    thin = op.styles.Side(style='thin')
    thin_border = op.styles.borders.Border(left=thin, right=thin,
                                           top=thin, bottom=thin)
    text_wrap = op.styles.Alignment(wrap_text=True)
    # Format the header row (row 1 only).
    for col in range(1, ws.max_column + 1):
        ws.cell(row=1, column=col).fill = blue_fill
    # Format all cells.
    for row in ws.iter_rows():
        for cell in row:
            cell.border = thin_border
            cell.alignment = text_wrap
            # Highlight schools offering an executive MBA.  Empty cells
            # hold None (no .find method), so guard before searching --
            # the original crashed on blank "Programs offered" cells.
            if (cell.col_idx == 5 and cell.value is not None
                    and cell.value.find("Executive") >= 0):
                cell.fill = green_fill
    # Save back under the same file name.
    wb.save(xl)
if __name__ == "__main__":
    """ Starting block """
    # Scrape the basic listing info, starting from page 1, and load it
    # straight into a DataFrame.
    basic_df = pd.DataFrame(get_basic_info(base_url + start_url))
    # Visit each school's own page for the detailed statistics.
    det_df = pd.DataFrame(get_detailed_info(basic_df))
    # Join the two data sets on the school URL.
    combined_df = pd.merge(
        left=basic_df[
            ["Name", "Location", "Description", "Programs offered", "URL"]],
        right=det_df, how="inner", on=["URL"])
    # Persist to Excel, then prettify the workbook.
    load_to_excel(combined_df)
    format_excel(xl_name, sheet_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment