#!/usr/bin/env python3
# leg_sizes.py | |
# Christopher Phan | |
# cphan@chrisphan.com | |
# https://chrisphan.com/ | |
# | |
# Takes data from the Wikipedia article "List of legislatures by number of
# members", and performs a linear regression of log(n) on log(P) to fit
# n = kP^alpha, where n is the number of members of the lower house of a
# nation's legislature and P is the population.
# | |
# This script was written for Python 3.6. | |
# | |
# To run this script, save | |
# https://en.wikipedia.org/w/index.php?title=List_of_legislatures_by_number_of_members&oldid=951703531 | |
# to legislatures.html | |
# | |
################################################################################ | |
# | |
# Copyright 2020 Christopher Phan | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
# | |
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
import scipy.stats as stats
# The Wikipedia page must already be saved locally as legislatures.html (the
# header comment of this file gives the exact revision URL).  read_html
# returns one DataFrame per HTML table; the first table is the one used.
data = pd.read_html("legislatures.html")[0]
def contains_min_or_max(x):
    """Return True if the table cell *x* mentions "minimum" or "maximum".

    The match is a case-sensitive substring test.  Cells that are not
    strings (for example a value already parsed as a number) cannot carry
    such a qualifier, so they yield False instead of raising TypeError.
    """
    if not isinstance(x, str):
        return False
    return "minimum" in x or "maximum" in x
# Drop rows whose lower-house cell gives only a minimum or maximum size,
# since the actual size of that legislature is then unknown.
only_bound_given = data["Lowerhouse[1]"].apply(contains_min_or_max).astype(bool)
data = data[~only_bound_given]
def _leading_int(text):
    """Return the first whitespace-separated token of *text* as an int.

    Thousands separators (",") are removed first.  Raises ValueError when
    *text* holds no token or the token is not an integer.
    """
    tokens = text.replace(",", "").split()
    if not tokens:
        raise ValueError("no number found in {!r}".format(text))
    return int(tokens[0])


def number_parse(x):
    """Convert a table cell into a number.

    * Values that are already numeric are returned unchanged.
    * For strings, anything from the first "[" onward (a footnote marker)
      is discarded.
    * "a to b" yields the mean of a and b (a float).
    * Otherwise the leading token is parsed as an int, which drops trailing
      words such as "(normally 598)".

    Raises ValueError if no integer can be read from the cell.
    """
    if isinstance(x, (int, float, np.integer, np.floating)):
        return x
    text = x.split("[")[0]  # Get rid of footnotes
    # Test for the same separator that is used to split, so that a word
    # merely containing "to" (e.g. "total") is not mistaken for a range.
    if " to " in text:
        low, high = text.split(" to ")[:2]
        return (_leading_int(low) + _leading_int(high)) / 2
    return _leading_int(text)
# Pull out the three columns of interest, converting the two numeric columns
# from their raw table cells into numbers.
countries = data["Country"]
leg_size = data["Lowerhouse[1]"].apply(number_parse)
pop_size = data["Population[2]"].apply(number_parse)
# List of OECD countries
# From http://www.oecd.org/about/members-and-partners/
# Names are spelled the way the Wikipedia table spells them, because
# membership is decided by exact string comparison with its Country column.
oecd_countries = [
    "Australia",
    "Austria",
    "Belgium",
    "Canada",
    "Chile",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Iceland",
    "Ireland",
    "Israel",
    "Italy",
    "Japan",
    "Korea, South",  # Changed from "Korea" to match Wikipedia
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Mexico",
    "Netherlands",
    "New Zealand",
    "Norway",
    "Poland",
    "Portugal",
    "Slovakia",  # Changed from "Slovak Republic" to match Wikipedia
    "Slovenia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Turkey",
    "United Kingdom",
    "United States",
]
# List of NATO countries
# From https://www.nato.int/cps/en/natohq/nato_countries.htm
# Names are spelled the way the Wikipedia table spells them, because
# membership is decided by exact string comparison with its Country column.
nato_countries = [
    "Albania",
    "Belgium",
    "Bulgaria",
    "Canada",
    "Croatia",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Iceland",
    "Italy",
    "Latvia",
    "Lithuania",
    "Luxembourg",
    "Montenegro",
    "Netherlands",
    "North Macedonia",
    "Norway",
    "Poland",
    "Portugal",
    "Romania",
    "Slovakia",
    "Slovenia",
    "Spain",
    "Turkey",
    "United Kingdom",
    "United States",
]
# List of European Union countries
# From https://europa.eu/european-union/about-eu/countries_en
# Names are spelled the way the Wikipedia table spells them, because
# membership is decided by exact string comparison with its Country column.
eu_countries = [
    "Austria",
    "Italy",
    "Belgium",
    "Latvia",
    "Bulgaria",
    "Lithuania",
    "Croatia",
    "Luxembourg",
    "Cyprus",
    "Malta",
    "Czech Republic",  # Changed from "Czechia" to match Wikipedia
    "Netherlands",
    "Denmark",
    "Poland",
    "Estonia",
    "Portugal",
    "Finland",
    "Romania",
    "France",
    "Slovakia",
    "Germany",
    "Slovenia",
    "Greece",
    "Spain",
    "Hungary",
    "Sweden",
    "Ireland",
]
# Collect the parsed columns into one table, flag each country's membership
# of the three groups, and output the result to a new CSV file.
newdata = pd.DataFrame({
    "Country": countries,
    "Lower house size": leg_size,
    "Population": pop_size,
})
# Series.isin performs the same exact-match membership test as a per-row
# "x in list" lambda, but vectorized.
newdata["OECD"] = newdata["Country"].isin(oecd_countries)
newdata["NATO"] = newdata["Country"].isin(nato_countries)
newdata["EU"] = newdata["Country"].isin(eu_countries)
# A country is in the combined group when any of the three flags is set.
newdata["OECD, NATO, or EU"] = newdata[["OECD", "NATO", "EU"]].any(axis=1)
newdata.to_csv("Parsed_data.csv")
# Header of the Markdown results table; one row per group of countries is
# appended to this string as each regression is computed.
all_table = """
| Group of countries | Regression[^4] | $r^2$ | Plots |
| :------------------ | :--------------------------- |:-------------- |:--------------|
"""
def _signstr(x):
    """Return "-" if x is negative and "+" otherwise."""
    return "-" if x < 0 else "+"


def _decade_limits(lowest, highest):
    """Return (lower, upper) powers of ten that bracket [lowest, highest].

    lower is the largest power of ten <= lowest; upper is the smallest power
    of ten strictly greater than highest.  np.log10 with an explicit floor
    is used because np.log(x)/np.log(10) can land just below an integer for
    exact powers of ten, and int() truncates toward zero rather than down.
    """
    lower = 10 ** int(np.floor(np.log10(lowest)))
    upper = 10 ** (int(np.floor(np.log10(highest))) + 1)
    return lower, upper


def _write_regression_summary(condition, filename_str, m, b, r):
    """Write the fitted line, power law and r^2 for one group to a text file."""
    with open("regression_data_{}.txt".format(filename_str), "wt") as outfile:
        outfile.write("{} countries:\n".format(condition))
        outfile.write("log(n) = {:0.4f}*log(P) {} {:0.4f}\n".format(
            m, _signstr(b), abs(b)))
        outfile.write("n = {:0.4f}P^{:0.4f}\n".format(np.exp(b), m))
        outfile.write("r^2 = {:0.4f}\n".format(r**2))


def _markdown_row(condition, filename_str, m, b, r, regression_text):
    """Return the Markdown table row for one group of countries."""
    return "".join([
        r"| {} | $\log(n) = {:0.4f}\log(P) {} {:0.4f}".format(
            condition, m, _signstr(b), abs(b)),
        # Drop the leading "$": the row is already inside math mode here.
        r" \Rightarrow {}".format(regression_text[1:]),
        r"| $r^2 = {:0.4f}$ |".format(r**2),
        "[linear](/leg_sizes_2020-04/legsizes_{}_linear.png) ".format(
            filename_str),
        "[log](/leg_sizes_2020-04/legsizes_{}_log.png) |\n".format(
            filename_str),
    ])


def _plot_regression(current_data, condition, filename_str, scale,
                     m, b, regression_text, source_txt):
    """Plot one group's data, its fitted power law and n = P^(1/3).

    scale is "linear" or "log" and is applied to both axes.  The figure is
    saved as legsizes_<filename_str>_<scale>.png and .pdf, then closed.
    """
    populations = current_data["Population"]
    sizes = current_data["Lower house size"]
    if scale == "linear":
        xmin, xmax = 0, 1.1 * populations.max()
        ymin, ymax = 0, 1.1 * sizes.max()
        # Sample the curves once per million people.
        xvals = np.arange(xmin / 10**6, xmax / 10**6) * 10**6
    else:
        xmin, xmax = _decade_limits(populations.min(), populations.max())
        ymin, ymax = _decade_limits(sizes.min(), sizes.max())
        # Sample the curves at points spaced by a factor of e.
        xvals = np.exp(np.arange(np.log(xmin), np.log(xmax) + 1))
    yvals = np.exp(b) * xvals**m
    yvals2 = xvals**(1/3)

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    ax.set_xscale(scale)
    ax.set_yscale(scale)
    if scale == "linear":
        # Population ticks in millions; sizes with thousands separators.
        ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
            lambda y, f: '{:,.0f}'.format(y)))
        ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
            lambda x, f: '{:,.0f} m'.format(x / 10**6)))
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    ax.plot(populations, sizes, "b.")
    ax.plot(xvals, yvals, 'r', label=regression_text)
    ax.plot(xvals, yvals2, 'g--', label=r'$n = \sqrt[3]{P}$')
    ax.grid(True, which="both")
    ax.legend()
    title = "Population and lower legislative house size"
    title += " ({} scale, {} countries)".format(scale, condition)
    ax.set_title(title)
    ax.set_xlabel("Population ($P$)\n" + source_txt)
    ax.set_ylabel("Lower house size ($n$)")
    fig.tight_layout()
    fig.savefig("legsizes_{}_{}.png".format(filename_str, scale))
    fig.savefig("legsizes_{}_{}.pdf".format(filename_str, scale))
    plt.close(fig)


source_txt = 'Sources listed at: https://chrisphan.com/2020-04-29_what_size_are_national_legislatures/#sources'

for condition in ["all", "NATO", "OECD", "EU", "OECD, NATO, or EU"]:
    # Select the group: "all" uses every row, otherwise the rows whose
    # membership flag for this group is set.
    if condition == "all":
        current_data = newdata
    else:
        current_data = newdata[newdata[condition].astype(bool)]

    # Calculate the regression of log(n) on log(P); the slope is the
    # exponent of the power law and exp(intercept) is its coefficient.
    log_leg_size = np.log(current_data["Lower house size"])
    log_pop_size = np.log(current_data["Population"])
    fit = stats.linregress(log_pop_size, log_leg_size)
    m, b, r = fit.slope, fit.intercept, fit.rvalue

    # File-name-safe version of the group name.
    filename_str = condition.replace(" ", "_").replace(",", "")
    # LaTeX form of the fitted power law, used in the table and plot legend.
    regression_text = "$n = {:0.4f}P^{{{:0.4f}}}$".format(np.exp(b), m)

    _write_regression_summary(condition, filename_str, m, b, r)

    # Add to markdown table
    all_table += _markdown_row(
        condition, filename_str, m, b, r, regression_text)

    # Make plots
    for scale in ["linear", "log"]:
        _plot_regression(current_data, condition, filename_str, scale,
                         m, b, regression_text, source_txt)
# Write the accumulated Markdown table of regression results.
with open("regression_table.md", "wt") as markout:
    markout.write(all_table)