Created
October 10, 2018 19:46
-
-
Save psobczyk/48efeaf4214b419524ccf6cda2e0eeb1 to your computer and use it in GitHub Desktop.
analysing number of parliament members worldwide
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Package dependencies. library() stops with an error when a package is
# missing, whereas require() only warns and returns FALSE, which would let
# the script continue and fail later with a less helpful message.
library(dplyr)
library(rvest)
library(ggplot2)
library(ggthemes)
# Page holding the table of legislatures, their seat counts and populations.
url <- "https://en.wikipedia.org/wiki/List_of_legislatures_by_number_of_members"

# Download and parse the page once, then reuse the parsed document below
# (previously the same URL was fetched twice and the first copy was unused).
html_downloaded <- read_html(url)

# Extract every HTML table on the page as a data frame.
tables <- html_downloaded %>%
  html_table(fill = TRUE)

# The first table on the page is the one analysed in the rest of the script.
parliament_seats <- tables[[1]]
# Normalise a scraped table cell so it can later be parsed as a number.
#
# text: character vector of raw cell contents.
# Returns a character vector of the same length in which
#   * everything up to and including the last "♠" is removed (presumably a
#     hidden sort-key prefix emitted by the source page -- verify), and
#   * all commas (thousands separators) are removed.
clean_text <- function(text) {
  without_prefix <- gsub(".*♠(.*)", "\\1", text)
  gsub(",", "", without_prefix)
}
# Turn the scraped text columns into numbers and recompute the derived ones.
#
# Steps, in order:
#   1. strip sort-key prefixes and thousands separators (clean_text);
#   2. keep only the leading run of digits in the seat-count columns, which
#      drops trailing annotations such as footnote markers;
#   3. convert to numeric (cells that are still non-numeric become NA);
#   4. treat a missing chamber as having 0 seats;
#   5. recompute the total seat count and the population per seat.
#
# Functions are passed to mutate_at() directly instead of through the
# deprecated funs() wrapper, and coalesce() (dplyr) replaces replace_na(),
# which belongs to tidyr -- a package this script never loads.
parliament_seats <- parliament_seats %>%
  mutate_at(vars(`Lowerhouse[1]`:`Population/seats`), clean_text) %>%
  mutate_at(
    vars(`Lowerhouse[1]`:`Upperhouse[1]`, Total),
    function(x) gsub("([0-9]*).*", "\\1", x)
  ) %>%
  mutate_at(vars(`Lowerhouse[1]`:`Population/seats`), as.numeric) %>%
  mutate(
    `Upperhouse[1]` = coalesce(`Upperhouse[1]`, 0),
    `Lowerhouse[1]` = coalesce(`Lowerhouse[1]`, 0),
    # mutate() evaluates sequentially, so these use the NA-free columns above.
    Total = `Lowerhouse[1]` + `Upperhouse[1]`,
    `Population/seats` = `Population[2]` / Total
  )
# Scatter plot of total seats against population, both on log10 axes,
# coloured by whether the parliament is bicameral or not.
parliament_seats %>%
  ggplot(
    aes(
      x = `Population[2]`,
      y = Total,
      color = ifelse(Type == "bicameral", "Dwuizbowy", "Jednoizbowy")
    )
  ) +
  geom_point() +
  scale_color_viridis_d() +
  scale_x_log10(labels = scales::number_format()) +
  scale_y_log10(labels = scales::number_format()) +
  theme_fivethirtyeight(base_family = "Helvetica Neue Light") +
  # Applied after theme_fivethirtyeight() so that it overrides the theme's
  # axis-title setting and the y-axis title below is drawn.
  theme(axis.title = element_text(inherit.blank = FALSE)) +
  labs(
    title = "Ludność kraju a liczba parlamentarzystów",
    x = NULL,
    y = "Miejsc w parlamencie",
    color = NULL
  )
# Fit a simple model: log(total seats) as a quadratic function of
# log(population), plus the parliament type.
#
# na.action = na.exclude makes residuals() and fitted() return vectors padded
# with NA for rows dropped from the fit, so they stay aligned with the rows of
# `parliament_seats`. Indexing `$residuals` / `$fitted.values` with a mask
# built from the full data frame could silently pick the wrong country when
# any row had been dropped.
seats_pop_lm <- lm(
  I(log(`Total`)) ~ log(`Population[2]`) + I(log(`Population[2]`)^2) + `Type`,
  data = parliament_seats,
  na.action = na.exclude
)

# Quick diagnostics (residuals vs fitted, normal Q-Q) and summary.
plot(seats_pop_lm, 1:2)
summary(seats_pop_lm)

# which() drops NA comparisons so a missing country name cannot produce an
# NA index.
poland_row <- which(parliament_seats$Country == "Poland")

# Residual for Poland on the log scale; a positive value means more seats
# than the model predicts (i.e. over-representation).
residuals(seats_pop_lm)[poland_row]

# Number of seats the model predicts for Poland; in reality it is 560.
exp(fitted(seats_pop_lm))[poland_row]
# Same scatter plot as before, but with Poland highlighted and one facet per
# population band.

# Facet bands: (0, 1e6], (1e6, 3e7], (3e7, 1e10]. The labels are written to
# match these breaks; the previous labels ("mniej niż 20 milionów",
# "powyżej 10 milionów") contradicted the 30-million cut point.
population_breaks <- c(0, 1e6, 3e7, 1e10)
population_labels <- c(
  "poniżej miliona",
  "od 1 do 30 milionów",
  "powyżej 30 milionów"
)

seats_by_population_plot <- parliament_seats %>%
  mutate(
    type = ifelse(
      Country == "Poland",
      "Polska",
      ifelse(Type == "bicameral", "Dwuizbowy parlament", "Jednoizbowy parlament")
    ),
    population_group = cut(
      `Population[2]`,
      population_breaks,
      labels = population_labels
    )
  ) %>%
  ggplot(aes(x = `Population[2]`, y = `Total`, color = type)) +
  geom_point() +
  scale_x_log10(labels = scales::number_format()) +
  scale_y_log10(labels = scales::number_format()) +
  scale_color_viridis_d() +
  facet_wrap(~population_group, scales = "free") +
  theme_fivethirtyeight(base_family = "Helvetica Neue Light") +
  theme(axis.title = element_text(inherit.blank = FALSE)) +
  # NOTE(review): the y axis shows `Total` (number of seats), so it is
  # labelled as such. The previous label described inhabitants per seat,
  # which is not what is plotted; if that quantity was intended, map
  # `Population/seats` to y instead and restore the old label.
  labs(
    x = NULL,
    color = NULL,
    y = "Miejsc w parlamencie",
    title = "Ludność kraju a liczba parlamentarzystów"
  )

# Display the plot.
seats_by_population_plot

# ggsave() is a function call, not a plot component: adding it with `+`
# fails on current ggplot2 versions. Save the plot object explicitly instead.
ggsave(
  "miejsca_per_capita_populacja.png",
  plot = seats_by_population_plot,
  width = 11,
  height = 7
)
# Attach region information taken from World Bank country metadata, e.g. the
# metadata file that ships with https://data.worldbank.org/indicator/SP.POP.TOTL
# METADATA_FILE must be set to the path of that CSV before this point.
metadata <- read.csv(METADATA_FILE, stringsAsFactors = FALSE)

# Country names that differ between the scraped table (names of this vector)
# and the metadata's `TableName` column (values of this vector).
country_renames <- c(
  "Korea, South" = "Korea, Rep.",
  "Korea, North" = "Korea, Dem. People's Rep.",
  "Iran"         = "Iran, Islamic Rep.",
  "Russia"       = "Russian Federation",
  "Laos"         = "Lao PDR",
  "Venezuela"    = "Venezuela, RB",
  "Slovakia"     = "Slovak Republic"
)

# Apply each rename in turn.
for (old_name in names(country_renames)) {
  parliament_seats$Country[parliament_seats$Country == old_name] <-
    country_renames[[old_name]]
}

# Keep only the countries present in both sources.
model_data <- inner_join(
  parliament_seats,
  metadata,
  by = c("Country" = "TableName")
)
# New model with an additional explanatory variable: Region.
#
# na.action = na.exclude keeps residuals() and fitted() aligned with the rows
# of `model_data` even if some rows are dropped from the fit because of
# missing values.
seats_pop_lm2 <- lm(
  I(log(`Total`)) ~ log(`Population[2]`) + I(log(`Population[2]`)^2) +
    `Type` + Region,
  data = model_data,
  na.action = na.exclude
)

# Simplify the model a bit with stepwise selection.
seats_pop_lm2 <- step(seats_pop_lm2, trace = FALSE)

# Diagnostics and summary for the new model (previously the diagnostics were
# drawn for the old model, `seats_pop_lm`, by mistake).
plot(seats_pop_lm2)
summary(seats_pop_lm2)

# which() drops NA comparisons so a missing country name cannot produce an
# NA index.
poland_row_in_model_data <- which(model_data$Country == "Poland")

# Log-scale residual and predicted number of seats for Poland.
residuals(seats_pop_lm2)[poland_row_in_model_data]
exp(fitted(seats_pop_lm2))[poland_row_in_model_data]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment