# UKBschools.py
# Python 3.5
"""
Scrap the list of all accredated BSchools in UK from find-mba.com
export the list to excel
"""
import requests
from bs4 import BeautifulSoup
import urllib.parse
import sys
base_url = "http://find-mba.com/schools/uk-ireland/uk?"
filter_criteria = {
    "keywords": "",  # Filter by keywords
    "rank": "false",  # To select only schools ranked by FT etc.
    "accredition": "true",  # To select only accredited schools
    "cities": "",  # Filter schools by specific cities
    "Specs": "",  # Select schools by specialisation
    "sort": "popularity",  # Sort by popularity
    "numberperpage": 8,  # No. of search results per page
    "page": "1"  # Page number
}

params = urllib.parse.urlencode(filter_criteria)
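
# For reference, the encoded query string looks roughly like
# "keywords=&rank=false&accredition=true&cities=&Specs=&sort=popularity&numberperpage=8&page=1"
# (parameter order may vary), so the full request URL is base_url + params.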

def get_detailed_info(url):
    """
    Given a B-school url, extract the accreditation and programs information.
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "lxml")
    # NOTE: the original gist does not actually parse the detail page yet;
    # return an empty dict as a placeholder so callers can merge it safely.
    return dict()

def get_basic_info(html):
    """
    Extract basic level info such as name, location etc.
    """
    basic_info_dict = dict()
    row = html.find("div", {"class": "row"})

    # B-school name
    basic_info_dict["Name"] = row.find(
        "div",
        {"class": "col-xs-11 school-list-title"})("a")[0].get("title")

    # B-school website link
    basic_info_dict["Website"] = row.find(
        "div",
        {"class": "col-xs-11 school-list-title"})("a")[0].get("href")

    details = html.find("div", {"class": "school-list-details"})
    location = details.find("span", {"class": "school-list-location"}).text
    basic_info_dict["City"], basic_info_dict["Country"] = [
        part.strip() for part in location.split(",", 1)]

    # School programs info
    programs_div = html.find("div", {"class": "school-list-programs"})
    prog_list = list()
    for program in programs_div.find_all("p"):
        prog_name = program.find("strong").text.strip(":")
        prog_list.append(prog_name)
    basic_info_dict["Programs"] = ", ".join(prog_list)

    # Given the B-school url, merge in the other detailed information
    basic_info_dict.update(get_detailed_info(basic_info_dict["Website"]))

    return basic_info_dict
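
# The module docstring mentions exporting the list to Excel, but the original
# gist stops at printing it. Below is a minimal sketch of that step, assuming
# pandas (plus an Excel writer such as openpyxl) is available; neither is
# imported above, and this helper is not wired into the main block below.
def export_to_excel(schools, path="uk_bschools.xlsx"):
    """
    Write the list of school dicts to an Excel workbook.
    """
    import pandas as pd  # local import so the rest of the script runs without pandas

    df = pd.DataFrame(schools)
    df.to_excel(path, index=False)

# Hypothetical usage: call export_to_excel(school_list) after the scraping
# loop in the main block instead of (or in addition to) printing the list.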
if __name__ == "__main__":
""" Starting block """
try:
resp = requests.get(base_url + params)
if not resp.ok:
print(resp.status_code + ": " + resp.reason)
sys.exit(0)
soup = BeautifulSoup(resp.content, "lxml")
# print(soup.prettify())
school_list = list()
for school in soup.find_all("div", {"class": "row school-list-item"}):
school_dict = dict()
# print(school)
# Extract basic level info such as bane, location etc.
school_dict = get_basic_info(school)
school_list.append(school_dict)
print(school_list)
except Exception as e:
print(e)