# UKBschools.py
# Python 3.5
"""
Scrap the list of all accredated BSchools in UK from find-mba.com
export the list to excel
"""
import requests
from bs4 import BeautifulSoup
import urllib.parse
import sys
base_url = "http://find-mba.com/schools/uk-ireland/uk?"
filter_criteria = {
    "keywords": "",  # Filter by keywords
    "rank": "false",  # To select only schools ranked by FT etc.
    "accredition": "true",  # To select only accredited schools
    "cities": "",  # Filter schools by specific cities
    "Specs": "",  # Select schools by specialisation
    "sort": "popularity",  # Sort by popularity
    "numberperpage": 8,  # No. of search results per page
    "page": "1"  # Page number
}

params = urllib.parse.urlencode(filter_criteria)
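
# For reference, the encoded query string looks roughly like
# "keywords=&rank=false&accredition=true&cities=&Specs=&sort=popularity&numberperpage=8&page=1"
# (parameter order may vary), so the full request URL is base_url + params.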

def get_detailed_info(url):
    """
    Given a B-school url, extract the accreditation and programs information.
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "lxml")
    # NOTE: the original gist does not actually parse the detail page yet;
    # return an empty dict as a placeholder so callers can merge it safely.
    return dict()

def get_basic_info(html):
    """
    Extract basic level info such as name, location etc.
    """
    basic_info_dict = dict()
    row = html.find("div", {"class": "row"})

    # B-school name
    basic_info_dict["Name"] = row.find(
        "div",
        {"class": "col-xs-11 school-list-title"})("a")[0].get("title")

    # B-school website link
    basic_info_dict["Website"] = row.find(
        "div",
        {"class": "col-xs-11 school-list-title"})("a")[0].get("href")

    details = html.find("div", {"class": "school-list-details"})
    location = details.find("span", {"class": "school-list-location"}).text
    basic_info_dict["City"], basic_info_dict["Country"] = [
        part.strip() for part in location.split(",", 1)]

    # School programs info
    programs_div = html.find("div", {"class": "school-list-programs"})
    prog_list = list()
    for program in programs_div.find_all("p"):
        prog_name = program.find("strong").text.strip(":")
        prog_list.append(prog_name)
    basic_info_dict["Programs"] = ", ".join(prog_list)

    # Given the B-school url, merge in the other detailed information
    basic_info_dict.update(get_detailed_info(basic_info_dict["Website"]))

    return basic_info_dict
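
# The module docstring mentions exporting the list to Excel, but the original
# gist stops at printing it. Below is a minimal sketch of that step, assuming
# pandas (plus an Excel writer such as openpyxl) is available; neither is
# imported above, and this helper is not wired into the main block below.
def export_to_excel(schools, path="uk_bschools.xlsx"):
    """
    Write the list of school dicts to an Excel workbook.
    """
    import pandas as pd  # local import so the rest of the script runs without pandas

    df = pd.DataFrame(schools)
    df.to_excel(path, index=False)

# Hypothetical usage: call export_to_excel(school_list) after the scraping
# loop in the main block instead of (or in addition to) printing the list.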
if __name__ == "__main__":
""" Starting block """
try:
resp = requests.get(base_url + params)
if not resp.ok:
print(resp.status_code + ": " + resp.reason)
sys.exit(0)
soup = BeautifulSoup(resp.content, "lxml")
# print(soup.prettify())
school_list = list()
for school in soup.find_all("div", {"class": "row school-list-item"}):
school_dict = dict()
# print(school)
# Extract basic level info such as bane, location etc.
school_dict = get_basic_info(school)
school_list.append(school_dict)
print(school_list)
except Exception as e:
print(e)