Created
November 28, 2016 13:05
-
-
Save bayesball/b75689692c32c50ae39cd16048bd3dca to your computer and use it in GitHub Desktop.
Graphing the birthplaces of the 2016 MLB players
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Interesting table that presents the number of 2016 MLB players
# from each of 50 US States.
# http://www.baseball-almanac.com/players/birthplace.php?y=2016
# Interested in mapping the data.

# Use the XML package to read in the data. The result is indexed as a list
# below (d[[1]] is the table that gets used).
library(XML)
d <- readHTMLTable("http://www.baseball-almanac.com/players/birthplace.php?y=2016")
# Put these data in a data frame.
# Take rows 14-30 and columns 2-4 of the first table (17 x 3 = 51 cells,
# i.e. one more than the 50 states; the extra DC entry is dropped later),
# convert every column to character type, and flatten the cells into one
# long vector.
d1 <- d[[1]][14:30, 2:4]
# d1[] <- ... keeps the data-frame shape while replacing each column.
d1[] <- lapply(d1, as.character)
d1 <- unlist(d1)
# Extract the numeric part of each string (the player count).
library(stringr)
values <- as.numeric(str_extract(d1, "[0-9]+"))
# Extract the state names. The class uses A-Z rather than A-z: the A-z range
# also covers the ASCII punctuation characters that sit between "Z" and "a".
# The class includes the blank (needed for names such as "New York"), so
# trim any leading/trailing blanks picked up next to the name; otherwise the
# names cannot match the keys used in the later joins.
state_names <- str_trim(str_extract(d1, "[a-zA-Z ]+"))
# Remove DC from each vector.
# NOTE(review): DC is assumed to be element 48 of the flattened vector;
# the position is hard-coded -- confirm against the current page layout.
values <- values[-48]
state_names <- state_names[-48]
# Create the data frame. `values` is already numeric, so no second
# conversion is needed; State is kept as character so it joins cleanly
# with the character State column of the population table.
final <- data.frame(
  State = state_names,
  N_Player = values,
  stringsAsFactors = FALSE
)
# Also need the populations of all states.
# Here the htmltab package is used to extract the table from Wikipedia.
library(htmltab)
p <- htmltab("https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population")
# Choose the rows that just correspond to states (rows 30 and 50 are
# skipped) and keep columns 3-4, which are renamed State and Population.
# NOTE(review): row and column positions are hard-coded against the page
# layout at the time of writing -- verify before reuse.
p <- p[c(1:29, 31:49, 51:52), 3:4]
names(p) <- c("State", "Population")
# Remove commas from the population numbers so they convert to numeric.
p$Population <- as.numeric(str_replace_all(p$Population, ",", ""))
# Merge the population data with the MLB data. State is the only column the
# two tables share, so name it explicitly instead of relying on the
# implicit natural join.
library(dplyr)
final_data <- inner_join(final, p, by = "State")
# Compute percent MLB (players per 100 residents) and add a lower-case copy
# of the state name, which is the key used to join against the map regions.
final_data <- mutate(
  final_data,
  Pct_MLB = 100 * N_Player / Population,
  state = tolower(State)
)
# Do the mapping.
# map_data() is a ggplot2 function, so ggplot2 must be attached before it
# is called; mapdata stays attached as before.
library(mapdata)
library(ggplot2)
states <- map_data("state")
# Attach the per-state values to the map rows: the map's lower-case
# `region` is matched against the lower-case `state` column.
states <- inner_join(states, final_data, by = c("region" = "state"))
library(ggplot2)
# Title styling: red, size 18, horizontally centred, not rotated.
TH <- theme(
  plot.title = element_text(colour = "red", size = 18, angle = 0,
                            hjust = 0.5, vjust = 0.8)
)
# Draw the state polygons filled by Pct_MLB on a yellow-to-red gradient,
# with white borders and a fixed 1.3 aspect ratio.
ggplot(states, aes(x = long, y = lat, group = group, fill = Pct_MLB)) +
  geom_polygon(colour = "white") +
  coord_fixed(ratio = 1.3) +
  scale_fill_gradient(low = "yellow", high = "red") +
  ggtitle("Density of 2016 MLB Players by State") +
  TH
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment