Cambridge colleges admissions scraper and analyser
import colorsys
import pandas as pd
from scipy.stats import beta
from matplotlib import pyplot as plt
import math
import numpy as np
import matplotlib.patches as mpatches
# Colleges to drop from every year's data before analysis (empty by default).
excluded_colleges = []
def rgb_tuple_to_html(color):
    # Convert an RGB tuple with components in [0, 1] to an "#rrggbb" hex string,
    # e.g. rgb_tuple_to_html((1.0, 0.5, 0.0)) -> "#ff7f00".
    eight_bit_color = [int(math.floor(c * 255.)) for c in color]
    return "#{:02x}{:02x}{:02x}".format(*eight_bit_color)
def load_bound_df(year):
    # Load one year's admissions table; after transposing, rows are colleges and
    # columns are the scraped series (applications, offers, ...).
    df = pd.read_csv("{}.csv".format(year))
    df = df[list(set(df.columns) - set(excluded_colleges))]
    df = df.set_index("-")
    df = df.T
    df["Total Applications"] = df["Direct applications"] + df["Open applications"]
    df["Total Offers"] = df["Direct offers"] + df["Pool offers by other Colleges"]
    df["Total Rejections"] = df["Total Applications"] - df["Total Offers"]
    # Central 80% credible interval for each college's offer rate, using a
    # Beta(offers + 1, rejections + 1) posterior (uniform prior).
    year_intervals = [(college, beta.ppf([0.1, 0.9], row["Total Offers"] + 1, row["Total Rejections"] + 1))
                      for college, row in df.iterrows()]
    return pd.DataFrame([(c, l, u) for (c, (l, u)) in year_intervals],
                        columns=["college", "lower", "upper"])
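# A worked toy example (hypothetical numbers, not from the scraped data) of the
# interval construction above: 100 offers out of 400 applications gives a
# Beta(101, 301) posterior for the offer rate, whose central 80% interval is
# roughly (0.22, 0.28) around the point estimate 100 / 400 = 0.25.
print(beta.ppf([0.1, 0.9], 100 + 1, 300 + 1))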
# Bar-chart overview for each recent year: colleges sorted by the lower bound
# of their offer-rate interval.
for year in [2017, 2016, 2015]:
    year_intervals = load_bound_df(year)
    year_intervals.set_index("college").sort_values("lower").plot(
        kind="bar", title="{} Overview".format(year))
    plt.show()
years_to_analyze = [2013, 2014, 2015, 2016, 2017]
interval_dfs = []
for year in years_to_analyze:
    intervals = load_bound_df(year)
    intervals["year"] = str(year)
    interval_dfs.append(intervals)
interval_df = pd.concat(interval_dfs)
# Track the selected colleges over time: dotted line = lower bound of the 80%
# interval, solid line = upper bound, one hue per college.
ax = plt.axes()
# Picked these based on the overview of 2015, 2016, 2017
selected_colleges = [u'Jesus College', u'Churchill College', u'Magdalene College',
                     u'Emmanuel College', u'Gonville and Caius College',
                     u'Pembroke College', u'Clare College', u"Queens' College"]
legend_patches = []
for college, color_index in zip(selected_colleges,
                                np.linspace(0, 1., num=len(selected_colleges), endpoint=False)):
    college_vals = interval_df[interval_df["college"] == college].set_index("year")
    color = rgb_tuple_to_html(colorsys.hsv_to_rgb(color_index, 0.8, 0.8))
    college_vals["lower"].plot(style=":", ax=ax, color=color)
    college_vals["upper"].plot(style="-", ax=ax, color=color)
    legend_patches.append(mpatches.Patch(color=color, label=college))
plt.legend(handles=legend_patches)
plt.show()
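# A small follow-up (a sketch, not part of the original script): print the raw
# interval bounds behind the plot for one of the selected colleges.
print(interval_df[interval_df["college"] == "Jesus College"]
      .set_index("year")[["lower", "upper"]])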
# Scraper: fetches the admissions statistics charts from the Cambridge
# undergraduate study site and writes one CSV per year.
from bs4 import BeautifulSoup
import csv
import json
import os
for year_num in range(2010, 2018):
    year = str(year_num)
    # Replay the statistics form POST (copied from a browser session, including
    # its cookie and form_build_id values) via curl, one request per year.
    curl_req = "curl 'https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'origin: https://www.undergraduate.study.cam.ac.uk' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: en-US,en;q=0.8,ru;q=0.6,lv;q=0.4' -H 'cookie: X-Mapping-iejmlgke=D5A918F0CE6670C7B94E800FC55B2597; has_js=1' -H 'upgrade-insecure-requests: 1' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36' -H 'content-type: application/x-www-form-urlencoded' -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'cache-control: max-age=0' -H 'authority: www.undergraduate.study.cam.ac.uk' -H 'referer: https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'dnt: 1' --data 'period=year&year=" + year + "&app%5Bapplications%5D=applications&open%5Bopen%5D=open&off%5Boffers%5D=offers&winter%5Bwinter%5D=winter&acc%5Bacceptances%5D=acceptances&summer%5Bsummer%5D=summer&what=university&college=Christ%27s+College&course=Anglo-Saxon%2C+Norse%2C+and+Celtic&group=college&op=Show+graph&form_build_id=form-bRA6Yr2VG3zK5k1Fv170KMfcvDrohtLUlmxgYFDJCB0&form_id=cam_app_charts_my_form_1' --compressed"
    html = os.popen(curl_req).read()
    soup = BeautifulSoup(html, "html.parser")
    # The chart data is embedded as JSON in the "data-chart" attribute.
    json_data = soup.find("div", {"class": "chart"}).get("data-chart")
    data = json.loads(json_data)
    colleges = ["-"] + data["xAxis"][0]["categories"]
    series_names = []
    num_data = []
    for series in data["series"]:
        num_data.append(series["data"])
        series_names.append(series["name"])
    # Write one row per series (e.g. "Direct applications"), one column per college.
    with open(year + ".csv", 'w') as csvfile:
        parsed_data = csv.writer(csvfile)
        parsed_data.writerow(colleges)
        for i, data_type in enumerate(series_names):
            parsed_data.writerow([data_type] + num_data[i])
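# A quick sanity check (a sketch, assuming the series names match those used in
# the analysis script): read one CSV back the same way load_bound_df does and
# print a couple of columns.
import pandas as pd
check = pd.read_csv("2017.csv").set_index("-").T
print(check[["Direct applications", "Direct offers"]].head())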