Last active
April 3, 2020 16:07
-
-
Save pontsuyu/53e934b81e8dfee55858d4201e1c2708 to your computer and use it in GitHub Desktop.
statcastデータにおけるlaunch_speed, launch_angleの欠測値について(前編)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# devtools::install_github("pontsuyu/statcastr") | |
# install.packages("patchwork") | |
library(statcastr) | |
library(patchwork) | |
# statcastデータの取得(30分くらいかかるかも) | |
dat17 <- scrape_statcast("2017-04-02", "2017-10-01") | |
dat18 <- scrape_statcast("2018-03-29", "2018-09-30") | |
dat19 <- scrape_statcast("2019-03-28", "2019-09-29") | |
dat <- rbind(dat17, dat18, dat19) | |
saveRDS(dat, "dat17to19.rds") # 保存 | |
# dat <- readRDS("dat17to19.rds") | |
glimpse(dat) | |
# 球場情報の取得 | |
gd <- dat$game_date %>% unique | |
game_pks <- purrr::map_dfr(gd, get_game_pk_info) | |
saveRDS(game_pks, "game_pks.rds") | |
# game_pks <- readRDS("game_pks.rds") | |
# データを投球情報,打球位置がわかる打球に絞り、球場情報を追加 | |
datx <- dat %>% | |
filter(type == "X", !is.na(hc_x), !is.na(zone)) %>% | |
mutate( | |
hc_x = as.numeric(hc_x), | |
hc_y = as.numeric(hc_y) | |
) %>% | |
left_join(game_pks %>% | |
select(game_pk, park=venue.name) %>% | |
distinct()) | |
# 可視化用関数 | |
graph <- function(datx){ | |
datx_graph <- datx %>% | |
mutate( | |
launch_speed = round(launch_speed), | |
launch_angle = round(launch_angle) | |
) %>% | |
count(launch_speed, launch_angle) | |
ue <- ggplot(datx, aes(launch_speed)) + | |
geom_histogram(binwidth = 1) + | |
xlab(NULL) | |
migi <- ggplot(datx, aes(launch_angle)) + | |
geom_histogram(binwidth = 1) + | |
coord_flip() + | |
xlab(NULL) | |
g <- ggplot(datx_graph, aes(launch_speed, launch_angle, fill=n)) + | |
geom_tile() + | |
scale_fill_gradient2(low = "white", mid = "pink", high = "red", midpoint = 7500) + | |
theme(legend.position = "bottom") | |
ue + plot_spacer() + g + migi + plot_layout(2, 2, widths = c(6,4), heights = c(3,7)) | |
} | |
p <- graph(datx) | |
p | |
ggsave("plot.png", p, width=8, height=6) | |
# 頻度の多い組み合わせ | |
candidate <- datx %>% | |
count(launch_speed, launch_angle) %>% | |
ungroup() %>% | |
arrange(desc(n)) %>% | |
mutate(cumsum = cumsum(n), | |
cumsumper = round(cumsum/NROW(.)*100,2)) %>% | |
head(30) | |
candidate %>% | |
as.data.frame() | |
# TOP15とNA | |
combi <- candidate %>% | |
slice(1:15,29) | |
missingdata <- datx %>% | |
left_join(combi %>% | |
mutate(flg=1, flg2=row_number())) %>% | |
mutate(launch_speed = ifelse(is.na(flg), launch_speed, NA), | |
launch_angle = ifelse(is.na(flg), launch_angle, NA)) | |
# チェック | |
table(missingdata$flg);table(missingdata$flg2) | |
p2 <- graph(missingdata) | |
p2 | |
ggsave("plot2.png", p2, width=8, height=6) | |
# データの保存 | |
saveRDS(missingdata, "missingdata.rds") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 共変量探索をやっていき | |
# devtools::install_github("pontsuyu/pitchRx2") | |
# install.packages(c("tidyverse", "rpart", "partykit")) | |
library(tidyverse) | |
library(rpart) | |
library(partykit) | |
missingdata <- readRDS("missingdata.rds") %>% | |
mutate(hc_x = hc_x - 125.42, | |
hc_y = 198.27 - hc_y, # 原点をホームベースにする | |
flg = ifelse(is.na(flg), 0, flg), | |
flg2 = ifelse(is.na(flg2), 0, flg2)) | |
# 補助変数探索 | |
func <- function(var){ | |
missingdata %>% | |
group_by_(.dots = var) %>% | |
summarise(na = sum(flg), | |
n = n(), | |
per = round(na/n*100,2)) %>% | |
arrange(desc(per)) | |
} | |
## 1. 打球結果 | |
func("bb_type") | |
## 犠牲バントとゴロ系の打球が多い | |
## 2. 打球結果 | |
func("events") | |
## 3. 球場 | |
func("park") | |
## 4. 打球位置 | |
temp <- func(list(hc_x = "round(hc_x)", hc_y = "round(hc_y)")) | |
p <- ggplot(temp, aes(hc_x, hc_y, fill=per)) + | |
geom_tile() | |
p | |
ggsave("batted_ball_location.png", p, width=8, height=6) | |
## 投手前とファールフライは欠損率高い | |
## 5. 球種 | |
func("pitch_name") | |
## 関係なさそう | |
## 6. 球速 | |
func(list(release_speed = "round(release_speed)")) %>% | |
filter(n>=1000) | |
## 関係なさそう | |
## 7. 試合日 | |
func("game_year") | |
func("game_date") | |
## 関係なさそう | |
## 8. 利き腕 | |
func("p_throws") | |
## 関係なさそう | |
## 9.投手 | |
func("pitcher") %>% | |
filter(n>=1000) | |
## 関係ありそう | |
## 10. 打者左右 | |
func("stand") | |
## 関係なさそう | |
## 11.打者 | |
func("batter") %>% | |
filter(n>=1000) | |
## 関係ありそう | |
## 12.ゾーン | |
func(list("pitch_name","zone")) | |
func("zone") | |
## 関係ありそう | |
# 決定木 | |
missingdata <- missingdata %>% | |
mutate(r = sqrt(hc_x^2 + hc_y^2), # 極座標変換 | |
theta = atan(hc_y / hc_x)) | |
as <- function(x) as.character(as.numeric(as.factor(x))) | |
rp <- rpart(as.factor(flg) ~ as(park) + as(events) + | |
bb_type + zone + r + abs(theta), missingdata, | |
method="class", control = rpart.control(cp = 0.005)) | |
plotcp(rp) | |
rp2 <- prune(rp, cp = 0.0064) | |
rp2 | |
plot(as.party(rp2)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment