Skip to content

Instantly share code, notes, and snippets.

@psobczyk
Created October 10, 2018 19:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psobczyk/48efeaf4214b419524ccf6cda2e0eeb1 to your computer and use it in GitHub Desktop.
Save psobczyk/48efeaf4214b419524ccf6cda2e0eeb1 to your computer and use it in GitHub Desktop.
Analysing the number of parliament members worldwide
require(dplyr)
require(rvest)
require(ggplot2)
require(ggthemes)
# Download the Wikipedia page once and parse every HTML table found on it.
url <- "https://en.wikipedia.org/wiki/List_of_legislatures_by_number_of_members"
html_downloaded <- read_html(url)
# Reuse the document fetched above instead of requesting the URL a second time.
tables <- html_downloaded %>%
  html_table(fill = TRUE)
# The first table on the page is the one analysed below.
parliament_seats <- tables[[1]]
# Normalise a scraped table cell so it can later be converted to a number.
#
# text: character vector of raw cell values.
# Returns: `text` with everything up to and including the last "♠" removed
#   (presumably the hidden sort key Wikipedia puts in front of the visible
#   value) and with every comma stripped.
clean_text <- function(text) {
  without_sort_key <- gsub(".*♠(.*)", "\\1", text)
  gsub(",", "", without_sort_key)
}
# Convert the scraped text columns to numbers and recompute derived columns.
parliament_seats <- parliament_seats %>%
  # Strip sort-key prefixes and commas from every numeric-looking column.
  # Functions are passed directly: funs() is deprecated in dplyr.
  mutate_at(vars(`Lowerhouse[1]`:`Population/seats`), clean_text) %>%
  # Keep only the leading run of digits in the seat counts, dropping any
  # trailing text left in the cell.
  mutate_at(
    vars(`Lowerhouse[1]`:`Upperhouse[1]`, Total),
    function(x) gsub("([0-9]*).*", "\\1", x)
  ) %>%
  mutate_at(vars(`Lowerhouse[1]`:`Population/seats`), as.numeric) %>%
  mutate(
    # A missing chamber counts as zero seats. coalesce() ships with dplyr,
    # whereas replace_na() lives in tidyr, which this script never loads.
    `Lowerhouse[1]` = coalesce(`Lowerhouse[1]`, 0),
    `Upperhouse[1]` = coalesce(`Upperhouse[1]`, 0),
    # mutate() evaluates sequentially, so Total sees the filled-in values.
    Total = `Lowerhouse[1]` + `Upperhouse[1]`,
    `Population/seats` = `Population[2]` / Total
  )
# Scatter plot of total seats against population, both on log10 axes.
# Points are coloured "Dwuizbowy" when Type is "bicameral" and "Jednoizbowy"
# otherwise.
ggplot(
  parliament_seats,
  aes(
    x = `Population[2]`,
    y = `Total`,
    color = ifelse(Type == "bicameral", "Dwuizbowy", "Jednoizbowy")
  )
) +
  geom_point() +
  scale_x_log10(labels = scales::number_format()) +
  scale_y_log10(labels = scales::number_format()) +
  scale_color_viridis_d() +
  theme_fivethirtyeight(base_family = "Helvetica Neue Light") +
  # Override the theme's axis-title element so the y label set below is used.
  theme(axis.title = element_text(inherit.blank = FALSE)) +
  labs(
    x = NULL,
    y = "Miejsc w parlamencie",
    color = NULL,
    title = "Ludność kraju a liczba parlamentarzystów"
  )
# Fit a simple model: log(total seats) as a quadratic in log(population),
# plus the parliament type.
# na.exclude makes residuals() and fitted() return one value per row of
# parliament_seats (NA for rows dropped from the fit), so they can safely be
# indexed by row position. The raw $residuals / $fitted.values components
# only cover the rows actually used and would misalign if any row had an NA.
seats_pop_lm <- lm(
  I(log(`Total`)) ~ log(`Population[2]`) + I(log(`Population[2]`)^2) + `Type`,
  data = parliament_seats,
  na.action = na.exclude
)
# Quick diagnostics and summary
plot(seats_pop_lm, 1:2)
summary(seats_pop_lm)
# Residual for Poland on the log scale (author's note: Poland is
# over-represented). which() drops rows whose Country is NA.
poland_row <- which(parliament_seats$Country == "Poland")
residuals(seats_pop_lm)[poland_row]
# Number of seats the model predicts for Poland; in reality it is 560
exp(fitted(seats_pop_lm))[poland_row]
# Same scatter plot as before, with Poland highlighted and one facet per
# population bracket; the result is shown and written to a PNG file.
seats_by_population_plot <- parliament_seats %>%
  mutate(
    type = ifelse(
      Country == "Poland",
      "Polska",
      ifelse(
        Type == "bicameral",
        "Dwuizbowy parlament",
        "Jednoizbowy parlament"
      )
    )
  ) %>%
  ggplot(aes(x = `Population[2]`, y = `Total`, color = type)) +
  geom_point() +
  scale_x_log10(labels = scales::number_format()) +
  scale_y_log10(labels = scales::number_format()) +
  scale_color_viridis_d() +
  facet_wrap(
    # Labels now describe the actual breaks (1 million and 30 million).
    ~ cut(
      `Population[2]`,
      c(0, 1e6, 3e7, 1e10),
      labels = c(
        "poniżej miliona",
        "od 1 do 30 milionów",
        "powyżej 30 milionów"
      )
    ),
    scales = "free"
  ) +
  theme_fivethirtyeight(base_family = "Helvetica Neue Light") +
  theme(axis.title = element_text(inherit.blank = FALSE)) +
  labs(
    x = NULL,
    color = NULL,
    # The y aesthetic is the total number of seats, so label it as such.
    y = "Miejsc w parlamencie",
    title = "Ludność kraju a liczba parlamentarzystów"
  )
# Display the plot explicitly because it is now stored in a variable.
print(seats_by_population_plot)
# ggsave() is a function call, not a plot layer: adding it with `+` fails in
# current ggplot2, so pass the plot object to it instead.
ggsave(
  "miejsca_per_capita_populacja.png",
  plot = seats_by_population_plot,
  width = 11,
  height = 7
)
# Add region information from World Bank metadata,
# for example for https://data.worldbank.org/indicator/SP.POP.TOTL
# NOTE: METADATA_FILE is not defined in this script; it must be set
# beforehand to the path of the downloaded metadata CSV.
metadata <- read.csv(METADATA_FILE, stringsAsFactors = FALSE)
# Adjust a couple of country names that do not match the metadata's TableName
# (names on the left are replaced by the values on the right).
country_renames <- c(
  "Korea, South" = "Korea, Rep.",
  "Korea, North" = "Korea, Dem. People's Rep.",
  "Iran" = "Iran, Islamic Rep.",
  "Russia" = "Russian Federation",
  "Laos" = "Lao PDR",
  "Venezuela" = "Venezuela, RB",
  "Slovakia" = "Slovak Republic"
)
rename_idx <- which(parliament_seats$Country %in% names(country_renames))
parliament_seats$Country[rename_idx] <- unname(
  country_renames[as.character(parliament_seats$Country[rename_idx])]
)
# Keep only the countries present in both tables.
model_data <- parliament_seats %>%
  inner_join(metadata, by = c("Country" = "TableName"))
# New model with an additional variable: Region.
# na.exclude keeps residuals() and fitted() aligned with the rows of
# model_data (NA for rows dropped from the fit); step() refits through the
# stored call, so the setting carries over to the simplified model.
seats_pop_lm2 <- lm(
  I(log(`Total`)) ~ log(`Population[2]`) + I(log(`Population[2]`)^2) +
    `Type` + Region,
  data = model_data,
  na.action = na.exclude
)
# Simplify the model a bit with stepwise selection
seats_pop_lm2 <- step(seats_pop_lm2, trace = FALSE)
# Diagnostics for the model fitted here (the original plotted the first
# model, seats_pop_lm, at this point).
plot(seats_pop_lm2)
summary(seats_pop_lm2)
# Residual (log scale) and predicted number of seats for Poland.
poland_row_joined <- which(model_data$Country == "Poland")
residuals(seats_pop_lm2)[poland_row_joined]
exp(fitted(seats_pop_lm2))[poland_row_joined]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment