Skip to content

Instantly share code, notes, and snippets.

@bdilday
Last active May 17, 2018 01:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bdilday/c378fe61d93068daa3b312f44f13d331 to your computer and use it in GitHub Desktop.
Save bdilday/c378fe61d93068daa3b312f44f13d331 to your computer and use it in GitHub Desktop.
scrape statcast expected stats
library(dplyr)
library(ggplot2)
library(rvest)
library(jsonlite)
#' Scrape the Statcast expected-statistics leaderboard for batters
#'
#' Downloads the Baseball Savant expected-statistics page, extracts the JSON
#' array assigned to `data` inside one of the page's `<script>` tags, parses it
#' with `jsonlite::fromJSON()`, and coerces the known stat columns to
#' numeric / integer.
#'
#' @param year Season to request; interpolated into the URL as a string.
#' @param min_pa Value for the URL's `min` query parameter. Must be a whole
#'   number because it is formatted with `%d`.
#' @return The object returned by `jsonlite::fromJSON()` for the embedded
#'   payload, with the columns listed in `numeric_cols` / `integer_cols`
#'   converted when they are present.
scrape_statcast_expected_stats <- function(year = 2018, min_pa = 25) {
  url <- sprintf(
    "https://baseballsavant.mlb.com/expected_statistics?type=batter&year=%s&position=&team=&min=%d",
    year, min_pa
  )
  page <- xml2::read_html(url)

  # Search every <script> tag for the payload instead of relying on a fixed
  # position in the page, which breaks as soon as a tag is added or removed.
  script_text <- rvest::html_text(rvest::html_nodes(page, "script"))
  data_pattern <- "data\\s+=\\s+(\\[.+\\])"
  # Base-R regex keeps the function independent of stringr, which the file
  # never attaches.
  hits <- regmatches(
    script_text,
    regexec(data_pattern, script_text, perl = TRUE)
  )
  # A successful match yields c(full_match, captured_json); misses are empty.
  hits <- Filter(function(m) length(m) == 2L, hits)
  if (length(hits) == 0L) {
    stop(
      "No <script> tag containing a `data = [...]` array was found at ", url,
      call. = FALSE
    )
  }
  sdf <- jsonlite::fromJSON(hits[[1L]][[2L]])

  numeric_cols <- c(
    "ba", "est_ba",
    "slg", "est_slg", "woba", "est_woba", "wobacon", "est_wobacon",
    "wobacon_minus_est_wobacon_diff", "slg_minus_est_slg_diff",
    "woba_minus_est_woba_diff", "ba_minus_est_ba_diff",
    "est_wobacon_minus_wobacon_diff", "est_ba_minus_ba_diff",
    "est_woba_minus_woba_diff", "est_slg_minus_slg_diff"
  )
  # NOTE(review): "position_name" is coerced to integer here; if the payload
  # carries text labels in that column they become NA with a warning -- confirm
  # against the live data before relying on it.
  integer_cols <- c(
    "year", "team_id", "pa", "bip", "age", "position_name", "pos", "player_id"
  )

  # Only touch columns that actually exist: assigning a zero-length vector
  # into a data frame column would otherwise raise an error.
  for (col in intersect(numeric_cols, names(sdf))) {
    sdf[[col]] <- as.numeric(as.character(sdf[[col]]))
  }
  for (col in intersect(integer_cols, names(sdf))) {
    sdf[[col]] <- as.integer(as.character(sdf[[col]]))
  }

  sdf
}
# Fetch the leaderboard with a minimum of 1 (the `min` URL filter) and plot
# observed minus expected wOBA against PA.
# The scraper defined in this file is `scrape_statcast_expected_stats`;
# `scrape_expected` does not exist.
x_df <- scrape_statcast_expected_stats(min_pa = 1)

x_df %>%
  ggplot(aes(x = pa, y = woba - est_woba)) +
  geom_point() +
  xlim(1, 200) +
  ylim(-0.2, 0.15) +
  geom_smooth() +
  theme_minimal(base_size = 16) +
  geom_hline(yintercept = 0) +
  labs(
    x = "PA",
    y = "wOBA - est_woba",
    title = "Expected vs Observed wOBA - 2018"
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment