# christopherphan/leg_sizes.py

## leg_sizes.py
#!/usr/bin/env python3

# leg_sizes.py
# Christopher Phan
# cphan@chrisphan.com
# https://chrisphan.com/
#
# Takes data from the Wikipedia article "List of legislatures by number of
# members", and performs a linear regression of log(n) on log(P) to fit the
# power law n = kP^alpha, where n is the number of members of the lower house
# of a nation's legislature and P is the population.
#
# This script was written for Python 3.6.
#
# To run this script, save
# https://en.wikipedia.org/w/index.php?title=List_of_legislatures_by_number_of_members&oldid=951703531
# to legislatures.html
#
################################################################################
#
# Copyright 2020 Christopher Phan
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# See note above: legislatures.html is the saved copy of the Wikipedia page.
# pd.read_html returns a list of DataFrames, one per HTML table it finds;
# only the first one is used.
data = pd.read_html("legislatures.html")[0]

def contains_min_or_max(x):
    """Tell whether a table cell only states a minimum or maximum size.

    Parameters
    ----------
    x : a cell value from the table.  Strings are searched for the words
        "minimum" and "maximum"; any other value (for example a cell that
        was already parsed to an int) cannot contain either word.

    Returns
    -------
    bool : True if x is a string containing "minimum" or "maximum",
        False otherwise.
    """
    # Non-string cells used to raise TypeError on the `in` test.
    if not isinstance(x, str):
        return False
    return ("minimum" in x) or ("maximum" in x)

# Drop the rows whose lower-house entry gives only a minimum or a maximum
# size (the actual size is unknown for those)
data = data[~data["Lowerhouse[1]"].apply(contains_min_or_max)]

def number_parse(x):
    """Convert a table cell to a number.

    Parameters
    ----------
    x : an int, which is returned unchanged, or a string such as "650",
        "650[a]" (footnote marker), "120 to 130" (a range) or
        "500 normally" (a number followed by words).

    Returns
    -------
    int for a single value; float (the mean of the two ends) for a range
    written as "a to b".

    Raises
    ------
    ValueError if the text that should be a number is not an integer.
    """
    if isinstance(x, int):
        return x
    num = x.split("[")[0]  # Get rid of footnotes
    # Look for the separator " to " with its spaces.  A bare "to" also
    # matches words such as "total", which then failed with IndexError.
    if " to " in num:
        # Take the mean if it says "a to b"
        low, high = num.split(" to ")[:2]
        return (int(low) + int(high)) / 2
    # Keep only the leading number; drops words like "usually" or "normally"
    return int(num.split(" ")[0])


# Country names are kept as they are; the two numeric columns are parsed
# cell by cell with number_parse
countries = data["Country"]
leg_size, pop_size = (
    data[column].apply(number_parse)
    for column in ("Lowerhouse[1]", "Population[2]")
)

# OECD member countries, spelled the way the Wikipedia table spells them
# Source: http://www.oecd.org/about/members-and-partners/

oecd_countries = [
    "Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic",
    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
    "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan",
    "Korea, South",  # Changed from "Korea" to match Wikipedia
    "Latvia", "Lithuania", "Luxembourg", "Mexico", "Netherlands",
    "New Zealand", "Norway", "Poland", "Portugal",
    "Slovakia",  # Changed from "Slovak Republic" to match Wikipedia
    "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey",
    "United Kingdom", "United States",
]

# NATO member countries
# Source: https://www.nato.int/cps/en/natohq/nato_countries.htm

nato_countries = [
    "Albania", "Belgium", "Bulgaria", "Canada", "Croatia", "Czech Republic",
    "Denmark", "Estonia", "France", "Germany", "Greece", "Hungary",
    "Iceland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Montenegro",
    "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal",
    "Romania", "Slovakia", "Slovenia", "Spain", "Turkey",
    "United Kingdom", "United States",
]

# European Union member countries
# Source: https://europa.eu/european-union/about-eu/countries_en
# (the order below is unchanged from the original two-column listing)

eu_countries = [
    "Austria", "Italy", "Belgium", "Latvia", "Bulgaria", "Lithuania",
    "Croatia", "Luxembourg", "Cyprus", "Malta",
    "Czech Republic",  # Changed from "Czechia" to match Wikipedia
    "Netherlands", "Denmark", "Poland", "Estonia", "Portugal", "Finland",
    "Romania", "France", "Slovakia", "Germany", "Slovenia", "Greece",
    "Spain", "Hungary", "Sweden", "Ireland",
]

# Collect the parsed columns, flag each country's membership in the three
# organisations, and save the result to a new CSV file

newdata = pd.DataFrame(
    {
        "Country": countries,
        "Lower house size": leg_size,
        "Population": pop_size,
    }
)

# Series.isin performs the same per-row "is this name in the list" test
newdata["OECD"] = newdata["Country"].isin(oecd_countries)
newdata["NATO"] = newdata["Country"].isin(nato_countries)
newdata["EU"] = newdata["Country"].isin(eu_countries)
newdata["OECD, NATO, or EU"] = newdata["Country"].isin(
    oecd_countries + nato_countries + eu_countries
)

newdata.to_csv("Parsed_data.csv")


# Markdown table with one row per group of countries.  Only the header is
# written here; the loop below appends the rows.
all_table = """
| Group of countries  | Regression[^4]               | $r^2$          | Plots         |
| :------------------ | :--------------------------- |:-------------- |:--------------|
"""

# Credit line printed under the x-axis label of every plot
source_txt = 'Sources listed at: https://chrisphan.com/2020-04-29_what_size_are_national_legislatures/#sources'


def signstr(x):
    """Return "-" if x is negative and "+" otherwise."""
    return "-" if x < 0 else "+"


def _decade_bounds(smallest, largest):
    """Return the powers of ten used as log-axis limits for a data range.

    Both arguments must be positive.  The lower limit is the largest power
    of ten not exceeding `smallest`; the upper limit is the smallest power
    of ten strictly greater than `largest`.

    np.log10 is used instead of np.log(x)/np.log(10) because that quotient
    can fall just short of an integer (for 1000 it evaluates to
    2.9999999999999996), which puts the limit one decade off.
    """
    lower = 10**int(np.floor(np.log10(smallest)))
    upper = 10**(int(np.floor(np.log10(largest))) + 1)
    return lower, upper


for condition in ["all", "NATO", "OECD", "EU", "OECD, NATO, or EU"]:

    # Select the rows of this group: "all" means no filtering, every other
    # name is a boolean membership column of newdata
    if condition == "all":
        current_data = newdata
    else:
        current_data = newdata[newdata[condition]]
    population = current_data["Population"]
    house_size = current_data["Lower house size"]

    # Calculate the regression log(n) = m*log(P) + b, i.e. n = exp(b)*P^m
    fit = stats.linregress(np.log(population), np.log(house_size))
    m, b, r = fit.slope, fit.intercept, fit.rvalue

    # File-name fragment for this group: spaces become "_", commas vanish
    filename_str = condition.replace(" ", "_").replace(",", "")

    with open("regression_data_{}.txt".format(filename_str), "wt") as outfile:
        outfile.write("{} countries:\n".format(condition))
        outfile.write("log(n) = {:0.4f}*log(P) {} {:0.4f}\n".format(
            m, signstr(b), abs(b)))
        outfile.write("n = {:0.4f}P^{:0.4f}\n".format(np.exp(b), m))
        outfile.write("r^2 = {:0.4f}\n".format(r**2))

    # LaTeX form of the fitted power law; the doubled braces are literal
    # braces around the exponent
    regression_text = "$n = {:0.4f}P^{{{:0.4f}}}$".format(np.exp(b), m)

    # Add to markdown table

    all_table += r"| {} | $\log(n) = {:0.4f}\log(P) {} {:0.4f}".format(
        condition, m, signstr(b), abs(b))
    # regression_text[1:] drops its opening "$": the cell is already in
    # math mode and the closing "$" of regression_text ends it
    all_table += r" \Rightarrow {}".format(regression_text[1:])
    all_table += r"| $r^2 = {:0.4f}$ |".format(r**2)
    all_table += "[linear](/leg_sizes_2020-04/legsizes_{}_linear.png) ".format(
        filename_str)
    all_table += "[log](/leg_sizes_2020-04/legsizes_{}_log.png) |\n".format(
        filename_str)

    # Make plots

    for scale in ["linear", "log"]:
        if scale == "linear":
            xmin, xmax = 0, 1.1*population.max()
            ymin, ymax = 0, 1.1*house_size.max()
            # A fixed number of samples keeps the curve smooth and spanning
            # the whole axis whatever the population range is
            xvals = np.linspace(xmin, xmax, 500)
        else:
            xmin, xmax = _decade_bounds(population.min(), population.max())
            ymin, ymax = _decade_bounds(house_size.min(), house_size.max())
            xvals = np.geomspace(xmin, xmax, 50)
        yvals = np.exp(b)*xvals**m  # fitted power law
        yvals2 = xvals**(1/3)       # cube-root curve drawn for comparison

        fig = plt.figure(figsize=(8, 6))
        ax = fig.add_subplot(111)
        ax.set_xscale(scale)
        ax.set_yscale(scale)
        if scale == "linear":
            # Populations in millions, sizes with thousands separators
            fmtx = matplotlib.ticker.FuncFormatter(
                lambda x, f: '{:,.0f} m'.format(x/10**6))
            fmty = matplotlib.ticker.FuncFormatter(
                lambda y, f: '{:,.0f}'.format(y))
            ax.yaxis.set_major_formatter(fmty)
            ax.xaxis.set_major_formatter(fmtx)
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)
        ax.plot(population, house_size, "b.")
        ax.plot(xvals, yvals, 'r', label=regression_text)
        ax.plot(xvals, yvals2, 'g--', label=r'$n = \sqrt[3]{P}$')
        ax.grid(True, which="both")
        ax.legend()
        title = "Population and lower legislative house size"
        title += " ({} scale, {} countries)".format(scale, condition)
        ax.set_title(title)
        ax.set_xlabel("Population ($P$)\n" + source_txt)
        ax.set_ylabel("Lower house size ($n$)")
        fig.tight_layout()
        fig.savefig("legsizes_{}_{}.png".format(filename_str, scale))
        fig.savefig("legsizes_{}_{}.pdf".format(filename_str, scale))
        plt.close(fig)

with open("regression_table.md", "wt") as markout:
    markout.write(all_table)
	#!/usr/bin/env python3

	# leg_sizes.py
	# Christopher Phan
	# cphan@chrisphan.com
	# https://chrisphan.com/
	#
	# Takes data from the Wikipedia article "List of legislatures by number of
	# members", and performs a linear regression of log(n) on log(P) to fit the
	# power law n = kP^alpha, where n is the number of members of the lower house
	# of a nation's legislature and P is the population.
	#
	# This script was written for Python 3.6.
	#
	# To run this script, save
	# https://en.wikipedia.org/w/index.php?title=List_of_legislatures_by_number_of_members&oldid=951703531
	# to legislatures.html
	#
	################################################################################
	#
	# Copyright 2020 Christopher Phan
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	#

	import pandas as pd
	import scipy.stats as stats
	import numpy as np
	import matplotlib
	import matplotlib.pyplot as plt

	# See note above: legislatures.html is the saved copy of the Wikipedia page.
	# pd.read_html returns a list of DataFrames, one per HTML table it finds;
	# only the first one is used.
	data = pd.read_html("legislatures.html")[0]

	def contains_min_or_max(x):
	    """Tell whether a table cell only states a minimum or maximum size.

	    Parameters
	    ----------
	    x : a cell value from the table.  Strings are searched for the words
	        "minimum" and "maximum"; any other value (for example a cell that
	        was already parsed to an int) cannot contain either word.

	    Returns
	    -------
	    bool : True if x is a string containing "minimum" or "maximum",
	        False otherwise.
	    """
	    # Non-string cells used to raise TypeError on the `in` test.
	    if not isinstance(x, str):
	        return False
	    return ("minimum" in x) or ("maximum" in x)

	# Drop the rows whose lower-house entry gives only a minimum or a maximum
	# size (the actual size is unknown for those)
	data = data[~data["Lowerhouse[1]"].apply(contains_min_or_max)]

	def number_parse(x):
	    """Convert a table cell to a number.

	    Parameters
	    ----------
	    x : an int, which is returned unchanged, or a string such as "650",
	        "650[a]" (footnote marker), "120 to 130" (a range) or
	        "500 normally" (a number followed by words).

	    Returns
	    -------
	    int for a single value; float (the mean of the two ends) for a range
	    written as "a to b".

	    Raises
	    ------
	    ValueError if the text that should be a number is not an integer.
	    """
	    if isinstance(x, int):
	        return x
	    num = x.split("[")[0]  # Get rid of footnotes
	    # Look for the separator " to " with its spaces.  A bare "to" also
	    # matches words such as "total", which then failed with IndexError.
	    if " to " in num:
	        # Take the mean if it says "a to b"
	        low, high = num.split(" to ")[:2]
	        return (int(low) + int(high)) / 2
	    # Keep only the leading number; drops words like "usually" or "normally"
	    return int(num.split(" ")[0])


	# Country names are kept as they are; the two numeric columns are parsed
	# cell by cell with number_parse
	countries = data["Country"]
	leg_size, pop_size = (
	    data[column].apply(number_parse)
	    for column in ("Lowerhouse[1]", "Population[2]")
	)

	# OECD member countries, spelled the way the Wikipedia table spells them
	# Source: http://www.oecd.org/about/members-and-partners/

	oecd_countries = [
	    "Australia", "Austria", "Belgium", "Canada", "Chile", "Czech Republic",
	    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
	    "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan",
	    "Korea, South",  # Changed from "Korea" to match Wikipedia
	    "Latvia", "Lithuania", "Luxembourg", "Mexico", "Netherlands",
	    "New Zealand", "Norway", "Poland", "Portugal",
	    "Slovakia",  # Changed from "Slovak Republic" to match Wikipedia
	    "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey",
	    "United Kingdom", "United States",
	]

	# NATO member countries
	# Source: https://www.nato.int/cps/en/natohq/nato_countries.htm

	nato_countries = [
	    "Albania", "Belgium", "Bulgaria", "Canada", "Croatia", "Czech Republic",
	    "Denmark", "Estonia", "France", "Germany", "Greece", "Hungary",
	    "Iceland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Montenegro",
	    "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal",
	    "Romania", "Slovakia", "Slovenia", "Spain", "Turkey",
	    "United Kingdom", "United States",
	]

	# European Union member countries
	# Source: https://europa.eu/european-union/about-eu/countries_en
	# (the order below is unchanged from the original two-column listing)

	eu_countries = [
	    "Austria", "Italy", "Belgium", "Latvia", "Bulgaria", "Lithuania",
	    "Croatia", "Luxembourg", "Cyprus", "Malta",
	    "Czech Republic",  # Changed from "Czechia" to match Wikipedia
	    "Netherlands", "Denmark", "Poland", "Estonia", "Portugal", "Finland",
	    "Romania", "France", "Slovakia", "Germany", "Slovenia", "Greece",
	    "Spain", "Hungary", "Sweden", "Ireland",
	]

	# Collect the parsed columns, flag each country's membership in the three
	# organisations, and save the result to a new CSV file

	newdata = pd.DataFrame(
	    {
	        "Country": countries,
	        "Lower house size": leg_size,
	        "Population": pop_size,
	    }
	)

	# Series.isin performs the same per-row "is this name in the list" test
	newdata["OECD"] = newdata["Country"].isin(oecd_countries)
	newdata["NATO"] = newdata["Country"].isin(nato_countries)
	newdata["EU"] = newdata["Country"].isin(eu_countries)
	newdata["OECD, NATO, or EU"] = newdata["Country"].isin(
	    oecd_countries + nato_countries + eu_countries
	)

	newdata.to_csv("Parsed_data.csv")


	# Markdown table with one row per group of countries.  Only the header is
	# written here; the loop below appends the rows.  The pipes are plain "|"
	# characters: a backslash in front of them would end up in the output file.
	all_table = (
	    "\n"
	    "| Group of countries  | Regression[^4]               | $r^2$          | Plots         |\n"
	    "| :------------------ | :--------------------------- |:-------------- |:--------------|\n"
	)

	# Credit line printed under the x-axis label of every plot
	source_txt = 'Sources listed at: https://chrisphan.com/2020-04-29_what_size_are_national_legislatures/#sources'


	def signstr(x):
	    """Return "-" if x is negative and "+" otherwise."""
	    return "-" if x < 0 else "+"


	def _decade_bounds(smallest, largest):
	    """Return the powers of ten used as log-axis limits for a data range.

	    Both arguments must be positive.  The lower limit is the largest power
	    of ten not exceeding `smallest`; the upper limit is the smallest power
	    of ten strictly greater than `largest`.

	    np.log10 is used instead of np.log(x)/np.log(10) because that quotient
	    can fall just short of an integer (for 1000 it evaluates to
	    2.9999999999999996), which puts the limit one decade off.
	    """
	    lower = 10**int(np.floor(np.log10(smallest)))
	    upper = 10**(int(np.floor(np.log10(largest))) + 1)
	    return lower, upper


	for condition in ["all", "NATO", "OECD", "EU", "OECD, NATO, or EU"]:

	    # Select the rows of this group: "all" means no filtering, every other
	    # name is a boolean membership column of newdata
	    if condition == "all":
	        current_data = newdata
	    else:
	        current_data = newdata[newdata[condition]]
	    population = current_data["Population"]
	    house_size = current_data["Lower house size"]

	    # Calculate the regression log(n) = m*log(P) + b, i.e. n = exp(b)*P^m
	    fit = stats.linregress(np.log(population), np.log(house_size))
	    m, b, r = fit.slope, fit.intercept, fit.rvalue

	    # File-name fragment for this group: spaces become "_", commas vanish
	    filename_str = condition.replace(" ", "_").replace(",", "")

	    with open("regression_data_{}.txt".format(filename_str), "wt") as outfile:
	        outfile.write("{} countries:\n".format(condition))
	        outfile.write("log(n) = {:0.4f}*log(P) {} {:0.4f}\n".format(
	            m, signstr(b), abs(b)))
	        outfile.write("n = {:0.4f}P^{:0.4f}\n".format(np.exp(b), m))
	        outfile.write("r^2 = {:0.4f}\n".format(r**2))

	    # LaTeX form of the fitted power law; the doubled braces are literal
	    # braces around the exponent
	    regression_text = "$n = {:0.4f}P^{{{:0.4f}}}$".format(np.exp(b), m)

	    # Add to markdown table

	    all_table += r"| {} | $\log(n) = {:0.4f}\log(P) {} {:0.4f}".format(
	        condition, m, signstr(b), abs(b))
	    # regression_text[1:] drops its opening "$": the cell is already in
	    # math mode and the closing "$" of regression_text ends it
	    all_table += r" \Rightarrow {}".format(regression_text[1:])
	    all_table += r"| $r^2 = {:0.4f}$ |".format(r**2)
	    all_table += "[linear](/leg_sizes_2020-04/legsizes_{}_linear.png) ".format(
	        filename_str)
	    all_table += "[log](/leg_sizes_2020-04/legsizes_{}_log.png) |\n".format(
	        filename_str)

	    # Make plots

	    for scale in ["linear", "log"]:
	        if scale == "linear":
	            xmin, xmax = 0, 1.1*population.max()
	            ymin, ymax = 0, 1.1*house_size.max()
	            # A fixed number of samples keeps the curve smooth and spanning
	            # the whole axis whatever the population range is
	            xvals = np.linspace(xmin, xmax, 500)
	        else:
	            xmin, xmax = _decade_bounds(population.min(), population.max())
	            ymin, ymax = _decade_bounds(house_size.min(), house_size.max())
	            xvals = np.geomspace(xmin, xmax, 50)
	        yvals = np.exp(b)*xvals**m  # fitted power law
	        yvals2 = xvals**(1/3)       # cube-root curve drawn for comparison

	        fig = plt.figure(figsize=(8, 6))
	        ax = fig.add_subplot(111)
	        ax.set_xscale(scale)
	        ax.set_yscale(scale)
	        if scale == "linear":
	            # Populations in millions, sizes with thousands separators
	            fmtx = matplotlib.ticker.FuncFormatter(
	                lambda x, f: '{:,.0f} m'.format(x/10**6))
	            fmty = matplotlib.ticker.FuncFormatter(
	                lambda y, f: '{:,.0f}'.format(y))
	            ax.yaxis.set_major_formatter(fmty)
	            ax.xaxis.set_major_formatter(fmtx)
	        ax.set_xlim(xmin, xmax)
	        ax.set_ylim(ymin, ymax)
	        ax.plot(population, house_size, "b.")
	        ax.plot(xvals, yvals, 'r', label=regression_text)
	        ax.plot(xvals, yvals2, 'g--', label=r'$n = \sqrt[3]{P}$')
	        ax.grid(True, which="both")
	        ax.legend()
	        title = "Population and lower legislative house size"
	        title += " ({} scale, {} countries)".format(scale, condition)
	        ax.set_title(title)
	        ax.set_xlabel("Population ($P$)\n" + source_txt)
	        ax.set_ylabel("Lower house size ($n$)")
	        fig.tight_layout()
	        fig.savefig("legsizes_{}_{}.png".format(filename_str, scale))
	        fig.savefig("legsizes_{}_{}.pdf".format(filename_str, scale))
	        plt.close(fig)

	with open("regression_table.md", "wt") as markout:
	    markout.write(all_table)