Cambridge colleges admissions scraper and analyser
import colorsys
import pandas as pd
from scipy.stats import beta
from matplotlib import pyplot as plt
import math
import numpy as np
import matplotlib.patches as mpatches
# Colleges to drop from every year's data before analysis (empty by default).
excluded_colleges = []
def rgb_tuple_to_html(color):
    # Convert an RGB tuple with components in [0, 1] to an "#rrggbb" hex string,
    # e.g. rgb_tuple_to_html((1.0, 0.5, 0.0)) -> "#ff7f00".
    eight_bit_color = [int(math.floor(c * 255.)) for c in color]
    return "#{:02x}{:02x}{:02x}".format(*eight_bit_color)
def load_bound_df(year):
    # Load one year's admissions table; after transposing, rows are colleges and
    # columns are the scraped series (applications, offers, ...).
    df = pd.read_csv("{}.csv".format(year))
    df = df[list(set(df.columns) - set(excluded_colleges))]
    df = df.set_index("-")
    df = df.T
    df["Total Applications"] = df["Direct applications"] + df["Open applications"]
    df["Total Offers"] = df["Direct offers"] + df["Pool offers by other Colleges"]
    df["Total Rejections"] = df["Total Applications"] - df["Total Offers"]
    # Central 80% credible interval for each college's offer rate, using a
    # Beta(offers + 1, rejections + 1) posterior (uniform prior).
    year_intervals = [(college, beta.ppf([0.1, 0.9], row["Total Offers"] + 1, row["Total Rejections"] + 1))
                      for college, row in df.iterrows()]
    return pd.DataFrame([(c, l, u) for (c, (l, u)) in year_intervals],
                        columns=["college", "lower", "upper"])
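# A worked toy example (hypothetical numbers, not from the scraped data) of the
# interval construction above: 100 offers out of 400 applications gives a
# Beta(101, 301) posterior for the offer rate, whose central 80% interval is
# roughly (0.22, 0.28) around the point estimate 100 / 400 = 0.25.
print(beta.ppf([0.1, 0.9], 100 + 1, 300 + 1))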
# Bar-chart overview for each recent year: colleges sorted by the lower bound
# of their offer-rate interval.
for year in [2017, 2016, 2015]:
    year_intervals = load_bound_df(year)
    year_intervals.set_index("college").sort_values("lower").plot(
        kind="bar", title="{} Overview".format(year))
    plt.show()
years_to_analyze = [2013, 2014, 2015, 2016, 2017]
interval_dfs = []
for year in years_to_analyze:
    intervals = load_bound_df(year)
    intervals["year"] = str(year)
    interval_dfs.append(intervals)
interval_df = pd.concat(interval_dfs)
# Track the selected colleges over time: dotted line = lower bound of the 80%
# interval, solid line = upper bound, one hue per college.
ax = plt.axes()
# Picked these based on the overview of 2015, 2016, 2017
selected_colleges = [u'Jesus College', u'Churchill College', u'Magdalene College',
                     u'Emmanuel College', u'Gonville and Caius College',
                     u'Pembroke College', u'Clare College', u"Queens' College"]
legend_patches = []
for college, color_index in zip(selected_colleges,
                                np.linspace(0, 1., num=len(selected_colleges), endpoint=False)):
    college_vals = interval_df[interval_df["college"] == college].set_index("year")
    color = rgb_tuple_to_html(colorsys.hsv_to_rgb(color_index, 0.8, 0.8))
    college_vals["lower"].plot(style=":", ax=ax, color=color)
    college_vals["upper"].plot(style="-", ax=ax, color=color)
    legend_patches.append(mpatches.Patch(color=color, label=college))
plt.legend(handles=legend_patches)
plt.show()
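# A small follow-up (a sketch, not part of the original script): print the raw
# interval bounds behind the plot for one of the selected colleges.
print(interval_df[interval_df["college"] == "Jesus College"]
      .set_index("year")[["lower", "upper"]])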
# Scraper: fetches the admissions statistics charts from the Cambridge
# undergraduate study site and writes one CSV per year.
from bs4 import BeautifulSoup
import csv
import json
import os
for year_num in range(2010, 2018):
    year = str(year_num)
    # Replay the statistics form POST (copied from a browser session, including
    # its cookie and form_build_id values) via curl, one request per year.
    curl_req = "curl 'https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'origin: https://www.undergraduate.study.cam.ac.uk' -H 'accept-encoding: gzip, deflate, br' -H 'accept-language: en-US,en;q=0.8,ru;q=0.6,lv;q=0.4' -H 'cookie: X-Mapping-iejmlgke=D5A918F0CE6670C7B94E800FC55B2597; has_js=1' -H 'upgrade-insecure-requests: 1' -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36' -H 'content-type: application/x-www-form-urlencoded' -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'cache-control: max-age=0' -H 'authority: www.undergraduate.study.cam.ac.uk' -H 'referer: https://www.undergraduate.study.cam.ac.uk/apply/statistics' -H 'dnt: 1' --data 'period=year&year=" + year + "&app%5Bapplications%5D=applications&open%5Bopen%5D=open&off%5Boffers%5D=offers&winter%5Bwinter%5D=winter&acc%5Bacceptances%5D=acceptances&summer%5Bsummer%5D=summer&what=university&college=Christ%27s+College&course=Anglo-Saxon%2C+Norse%2C+and+Celtic&group=college&op=Show+graph&form_build_id=form-bRA6Yr2VG3zK5k1Fv170KMfcvDrohtLUlmxgYFDJCB0&form_id=cam_app_charts_my_form_1' --compressed"
    html = os.popen(curl_req).read()
    soup = BeautifulSoup(html, "html.parser")
    # The chart data is embedded as JSON in the "data-chart" attribute.
    json_data = soup.find("div", {"class": "chart"}).get("data-chart")
    data = json.loads(json_data)
    colleges = ["-"] + data["xAxis"][0]["categories"]
    series_names = []
    num_data = []
    for series in data["series"]:
        num_data.append(series["data"])
        series_names.append(series["name"])
    # Write one row per series (e.g. "Direct applications"), one column per college.
    with open(year + ".csv", 'w') as csvfile:
        parsed_data = csv.writer(csvfile)
        parsed_data.writerow(colleges)
        for i, data_type in enumerate(series_names):
            parsed_data.writerow([data_type] + num_data[i])
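# A quick sanity check (a sketch, assuming the series names match those used in
# the analysis script): read one CSV back the same way load_bound_df does and
# print a couple of columns.
import pandas as pd
check = pd.read_csv("2017.csv").set_index("-").T
print(check[["Direct applications", "Direct offers"]].head())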