Skip to content

Instantly share code, notes, and snippets.

@christopherphan
Created April 29, 2020 19:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save christopherphan/1fb909448a967d37b3b345e308c8f568 to your computer and use it in GitHub Desktop.
Save christopherphan/1fb909448a967d37b3b345e308c8f568 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# leg_sizes.py
# Christopher Phan
# cphan@chrisphan.com
# https://chrisphan.com/
#
# Takes data from the Wikipedia article "List of legislatures by number of
# members", and performs a linear regression on the log-transformed data to
# fit n = kP^alpha, where n is the number of members of the lower house of a
# nation's legislature and P is the population.
#
# This script was written for Python 3.6.
#
# To run this script, save
# https://en.wikipedia.org/w/index.php?title=List_of_legislatures_by_number_of_members&oldid=951703531
# to legislatures.html
#
################################################################################
#
# Copyright 2020 Christopher Phan
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# "legislatures.html" is the saved Wikipedia page described in the header
# comment of this file; the first table parsed from it is the one used here.
legislature_tables = pd.read_html("legislatures.html")
data = legislature_tables[0]
def contains_min_or_max(x):
    """Return True if the table cell ``x`` mentions "minimum" or "maximum".

    Such cells only give a bound on the legislature size rather than the
    size itself.

    Cells that are not strings (for example a size that was already parsed
    as an integer) cannot contain either word, so they yield False instead
    of raising ``TypeError`` on the ``in`` test.
    """
    if not isinstance(x, str):
        return False
    return "minimum" in x or "maximum" in x
# Rows whose lower-house entry lists only a minimum or maximum size tell us
# nothing about the actual size, so they are dropped.
only_bounds = data["Lowerhouse[1]"].apply(contains_min_or_max).astype(bool)
data = data[~only_bounds]
def number_parse(x):
    """Convert a table cell to a number.

    Integers (Python or numpy) are returned unchanged. Strings are cleaned
    up and parsed:

    * anything from the first "[" onward (a footnote marker) is dropped;
    * thousands separators (",") and surrounding whitespace are removed;
    * "a to b" yields the mean of ``a`` and ``b`` (a float);
    * otherwise the first space-separated token is parsed as an int, which
      discards trailing words such as "usually" or "normally".

    Raises ``ValueError`` if the relevant token is not an integer.
    """
    if isinstance(x, (int, np.integer)):
        return x
    text = x.split("[")[0].replace(",", "").strip()
    # Test for the same " to " separator that is used for splitting, so that
    # a cell that merely contains the letters "to" is not taken for a range.
    if " to " in text:
        bounds = text.split(" to ")
        return (int(bounds[0]) + int(bounds[1])) / 2
    return int(text.split(" ")[0])
# Pull out the country names and parse the two numeric columns of the table.
countries = data["Country"]
leg_size, pop_size = (
    data[column_name].apply(number_parse)
    for column_name in ("Lowerhouse[1]", "Population[2]")
)
# OECD member countries.
# From http://www.oecd.org/about/members-and-partners/
# Two names were adjusted to match Wikipedia's spelling:
#   "Korea"           -> "Korea, South"
#   "Slovak Republic" -> "Slovakia"
oecd_countries = [
    "Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic",
    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
    "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan",
    "Korea, South", "Latvia", "Lithuania", "Luxembourg", "Mexico",
    "Netherlands", "New Zealand", "Norway", "Poland", "Portugal",
    "Slovakia", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey",
    "United Kingdom", "United States",
]
# NATO member countries.
# From https://www.nato.int/cps/en/natohq/nato_countries.htm
nato_countries = [
    "Albania", "Belgium", "Bulgaria", "Canada", "Croatia", "Czech Republic",
    "Denmark", "Estonia", "France", "Germany", "Greece", "Hungary",
    "Iceland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Montenegro",
    "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal",
    "Romania", "Slovakia", "Slovenia", "Spain", "Turkey",
    "United Kingdom", "United States",
]
# European Union member countries.
# From https://europa.eu/european-union/about-eu/countries_en
# "Czechia" was changed to "Czech Republic" to match Wikipedia.
eu_countries = [
    "Austria", "Italy", "Belgium", "Latvia", "Bulgaria", "Lithuania",
    "Croatia", "Luxembourg", "Cyprus", "Malta", "Czech Republic",
    "Netherlands", "Denmark", "Poland", "Estonia", "Portugal", "Finland",
    "Romania", "France", "Slovakia", "Germany", "Slovenia", "Greece",
    "Spain", "Hungary", "Sweden", "Ireland",
]
# Gather the parsed columns into one table, flag each country's membership of
# the OECD, NATO and the EU, and write the result to a new CSV file.
newdata = pd.DataFrame({
    "Country": countries,
    "Lower house size": leg_size,
    "Population": pop_size,
})
for group_label, group_members in (
    ("OECD", oecd_countries),
    ("NATO", nato_countries),
    ("EU", eu_countries),
):
    newdata[group_label] = newdata["Country"].isin(group_members)
# True when the country belongs to at least one of the three groups.
newdata["OECD, NATO, or EU"] = (
    newdata["OECD"] | newdata["NATO"] | newdata["EU"]
)
newdata.to_csv("Parsed_data.csv")
# Markdown table of results; one row per group of countries is appended below.
all_table = """
| Group of countries | Regression[^4] | $r^2$ | Plots |
| :------------------ | :--------------------------- |:-------------- |:--------------|
"""

# Attribution line printed under the x-axis label of every plot.
source_txt = 'Sources listed at: https://chrisphan.com/2020-04-29_what_size_are_national_legislatures/#sources'


def signstr(x):
    """Return "-" if ``x`` is negative and "+" otherwise."""
    return "-" if x < 0 else "+"


def _decade_exponent(value):
    """Return the exponent of the largest power of ten not above ``value``.

    Uses ``log10`` and ``floor`` rather than ``int(log(value)/log(10))``:
    the quotient of two natural logs can land just below an integer for an
    exact power of ten, which truncation would then round down a full decade.
    """
    return int(np.floor(np.log10(value)))


for condition in ["all", "NATO", "OECD", "EU", "OECD, NATO, or EU"]:
    # Select the countries in this group; "all" keeps every row.
    if condition == "all":
        current_data = newdata
    else:
        current_data = newdata[newdata[condition]]
    populations = current_data["Population"]
    house_sizes = current_data["Lower house size"]

    # Fit log(n) = m*log(P) + b, which is equivalent to n = exp(b) * P^m.
    fit = stats.linregress(np.log(populations), np.log(house_sizes))
    m, b, r = fit.slope, fit.intercept, fit.rvalue
    k = np.exp(b)

    # File-name-friendly form of the group name (spaces -> "_", no commas).
    filename_str = condition.replace(" ", "_").replace(",", "")

    # Plain-text summary of the fit for this group.
    with open("regression_data_{}.txt".format(filename_str), "wt") as outfile:
        outfile.write("{} countries:\n".format(condition))
        outfile.write("log(n) = {:0.4f}*log(P) {} {:0.4f}\n".format(
            m, signstr(b), abs(b)))
        outfile.write("n = {:0.4f}P^{:0.4f}\n".format(k, m))
        outfile.write("r^2 = {:0.4f}\n".format(r**2))

    # LaTeX form of the fitted power law; doubled braces are literal braces.
    power_law = "n = {:0.4f}P^{{{:0.4f}}}".format(k, m)
    regression_text = "${}$".format(power_law)

    # Add to markdown table
    all_table += r"| {} | $\log(n) = {:0.4f}\log(P) {} {:0.4f}".format(
        condition, m, signstr(b), abs(b))
    all_table += r" \Rightarrow {}$".format(power_law)
    all_table += r"| $r^2 = {:0.4f}$ |".format(r**2)
    all_table += "[linear](/leg_sizes_2020-04/legsizes_{}_linear.png) ".format(
        filename_str)
    all_table += "[log](/leg_sizes_2020-04/legsizes_{}_log.png) |\n".format(
        filename_str)

    # Make plots
    minpop, maxpop = min(populations), max(populations)
    minsize, maxsize = min(house_sizes), max(house_sizes)
    for scale in ["linear", "log"]:
        if scale == "linear":
            xmin, xmax = 0, 1.1*maxpop
            ymin, ymax = 0, 1.1*maxsize
            # Sample the curves once per million people.
            xvals = np.arange(xmin/10**6, xmax/10**6)*10**6
        else:
            # Extend the axes outward to whole powers of ten.
            xmin = 10**_decade_exponent(minpop)
            xmax = 10**(_decade_exponent(maxpop) + 1)
            ymin = 10**_decade_exponent(minsize)
            ymax = 10**(_decade_exponent(maxsize) + 1)
            # Sample the curves at points spaced by a factor of e.
            xvals = np.exp(np.arange(np.log(xmin), np.log(xmax) + 1))
        yvals = k*xvals**m        # fitted power law
        yvals2 = xvals**(1/3)     # cube-root reference curve

        fig = plt.figure(figsize=(8, 6))
        ax = fig.add_subplot(111)
        ax.set_xscale(scale)
        ax.set_yscale(scale)
        if scale == "linear":
            # Label populations in millions and sizes with thousands
            # separators.
            fmtx = matplotlib.ticker.FuncFormatter(
                lambda x, pos: '{:,.0f} m'.format(x/10**6))
            fmty = matplotlib.ticker.FuncFormatter(
                lambda y, pos: '{:,.0f}'.format(y))
            ax.yaxis.set_major_formatter(fmty)
            ax.xaxis.set_major_formatter(fmtx)
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)
        ax.plot(populations, house_sizes, "b.")
        ax.plot(xvals, yvals, 'r', label=regression_text)
        ax.plot(xvals, yvals2, 'g--', label=r'$n = \sqrt[3]{P}$')
        ax.grid(True, which="both")
        ax.legend()
        title = "Population and lower legislative house size"
        title += " ({} scale, {} countries)".format(scale, condition)
        ax.set_title(title)
        ax.set_xlabel("Population ($P$)\n" + source_txt)
        ax.set_ylabel("Lower house size ($n$)")
        fig.tight_layout()
        fig.savefig("legsizes_{}_{}.png".format(filename_str, scale))
        fig.savefig("legsizes_{}_{}.pdf".format(filename_str, scale))
        # Close this figure explicitly so figures do not accumulate.
        plt.close(fig)

with open("regression_table.md", "wt") as markout:
    markout.write(all_table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment