Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
library( package = "data.table" )
library( package = "plyr" )
library( package = "dplyr" )
library( package = "ggplot2" )
#' Summarise home and away run totals per team from one game-log file.
#'
#' Reads a comma-separated game log that has a header row, derives YEAR from
#' characters 4-7 of GAME_ID, and aggregates runs scored/allowed and games
#' played separately for home games (per YEAR, PARK_ID, team) and away games
#' (per YEAR, team). The two summaries are inner-joined on YEAR and TEAM_ID.
#'
#' @param p_file Path to a CSV file whose header contains at least GAME_ID,
#'   AWAY_TEAM_ID, HOME_TEAM_ID, PARK_ID, AWAY_SCORE_CT and HOME_SCORE_CT.
#' @return An ungrouped data frame with columns YEAR, PARK_ID, TEAM_ID,
#'   H_RS, H_RA, H_G, A_RA, A_RS and A_G.
f_runs <- function(p_file) {
  id_cols <- c("GAME_ID", "AWAY_TEAM_ID", "HOME_TEAM_ID", "PARK_ID")
  score_cols <- c("AWAY_SCORE_CT", "HOME_SCORE_CT")

  # Pick the needed columns by header name instead of by position, so the
  # read does not depend on the file having an exact number of columns.
  g_file <- fread(
    input = p_file,
    header = TRUE,
    sep = ",",
    na.strings = "",
    stringsAsFactors = FALSE,
    select = c(id_cols, score_cols),
    colClasses = list(character = id_cols, integer = score_cols)
  )

  mlb_data <- g_file %>%
    mutate(YEAR = as.integer(x = substr(x = GAME_ID, start = 4, stop = 7))) %>%
    select(YEAR, AWAY_TEAM_ID, HOME_TEAM_ID, PARK_ID, AWAY_SCORE_CT, HOME_SCORE_CT)

  # Home view: the home team's runs scored are HOME_SCORE_CT.
  r_home <- mlb_data %>%
    group_by(YEAR, PARK_ID, HOME_TEAM_ID) %>%
    summarise(
      H_RS = sum(x = HOME_SCORE_CT),
      H_RA = sum(x = AWAY_SCORE_CT),
      H_G = n()
    ) %>%
    ungroup() %>%
    rename(TEAM_ID = HOME_TEAM_ID)

  # Away view: the away team's runs scored are AWAY_SCORE_CT.
  r_away <- mlb_data %>%
    group_by(YEAR, AWAY_TEAM_ID) %>%
    summarise(
      A_RA = sum(x = HOME_SCORE_CT),
      A_RS = sum(x = AWAY_SCORE_CT),
      A_G = n()
    ) %>%
    ungroup() %>%
    rename(TEAM_ID = AWAY_TEAM_ID)

  # Both inputs are ungrouped, so the joined result carries no residual
  # grouping into downstream code.
  inner_join(
    x = r_home,
    y = r_away,
    by = c("YEAR", "TEAM_ID")
  )
}
#' Compute park factors for one season from a trailing window of seasons.
#'
#' Keeps the rows of `p_data` whose YEAR lies in
#' [p_year - p_hist + 1, p_year], sums the home and away run/game totals per
#' (TEAM_ID, PARK_ID), and derives
#' PK_FACTOR = ((H_RS + H_RA) / H_G) / ((A_RS + A_RA) / A_G).
#'
#' @param p_year Last season of the window (numeric scalar); also stored in
#'   the YEAR column of the result.
#' @param p_data Data frame with columns YEAR, TEAM_ID, PARK_ID, H_RS, H_RA,
#'   H_G, A_RS, A_RA and A_G.
#' @param p_hist Number of seasons in the window (numeric scalar).
#' @return An ungrouped data frame with one row per (TEAM_ID, PARK_ID)
#'   holding the summed totals, YEARS (number of input rows aggregated),
#'   YEAR and PK_FACTOR.
f_park_factors <- function(p_year, p_data, p_hist) {
  stopifnot(
    is.numeric(p_year), length(p_year) == 1L,
    is.numeric(p_hist), length(p_hist) == 1L
  )

  # Hoisted so the lower bound is computed once rather than inside filter().
  first_year <- p_year - p_hist + 1

  p_data %>%
    filter(YEAR >= first_year & YEAR <= p_year) %>%
    group_by(TEAM_ID, PARK_ID) %>%
    summarise(
      H_RS = sum(x = H_RS),
      H_RA = sum(x = H_RA),
      H_G = sum(x = H_G),
      A_RS = sum(x = A_RS),
      A_RA = sum(x = A_RA),
      A_G = sum(x = A_G),
      YEARS = n()
    ) %>%
    # summarise() only peels off the last grouping level; drop the rest so
    # callers receive a plain, ungrouped table.
    ungroup() %>%
    mutate(
      YEAR = p_year,
      PK_FACTOR = ((H_RS + H_RA) / H_G) / ((A_RS + A_RA) / A_G)
    )
}
#' Attach park names to a table of park factors.
#'
#' Reads the first two columns of the parks lookup file as PARK_ID and
#' PARK_NAME, inner-joins them to `p_pk_factors` on PARK_ID, and returns the
#' columns in a fixed presentation order.
#'
#' @param p_pk_factors Data frame with columns YEAR, PARK_ID, TEAM_ID, YEARS,
#'   H_RS, H_RA, H_G, A_RS, A_RA, A_G and PK_FACTOR.
#' @param p_parks_file Path to the comma-separated parks lookup file with a
#'   header row; its first two columns are used as park id and park name.
#'   Defaults to "./parks.csv", the location the function always used.
#' @return The joined table; parks missing from either input are dropped.
f_names <- function(p_pk_factors, p_parks_file = "./parks.csv") {
  # Only the first two columns are needed; selecting them by position keeps
  # the read independent of how many further columns the file carries.
  parks <- fread(
    input = p_parks_file,
    header = TRUE,
    sep = ",",
    na.strings = "",
    stringsAsFactors = FALSE,
    select = 1:2,
    colClasses = "character",
    col.names = c("PARK_ID", "PARK_NAME")
  )

  inner_join(
    x = parks,
    y = p_pk_factors,
    by = "PARK_ID"
  ) %>%
    select(
      YEAR, PARK_ID, PARK_NAME, TEAM_ID, YEARS,
      H_RS, H_RA, H_G, A_RS, A_RA, A_G, PK_FACTOR
    )
}
# list.files() treats `pattern` as a regular expression, not a shell glob,
# so anchor on a literal ".csv" extension at the end of the file name.
file_patt <- "\\.csv$"
# Considering the park_factors directory is your current working dir.
mlb_files <- list.files(
  path = "./season",
  pattern = file_patt,
  full.names = TRUE
)
# Fail early with a clear message instead of an obscure join error later on.
if (length(mlb_files) == 0L) {
  stop("No season files matching '", file_patt, "' found in ./season",
       call. = FALSE)
}

# One run summary per season file, row-bound into a single data frame.
mlb_runs <- ldply(
  .data = mlb_files,
  .fun = f_runs
)
mlb_years <- unique(x = mlb_runs$YEAR)

# Single-season park factors (p_hist = 1) for every year present in the data.
pk_factors <- ldply(
  .data = mlb_years,
  .fun = f_park_factors,
  p_data = mlb_runs,
  p_hist = 1
)
pk_factors <- f_names(p_pk_factors = pk_factors)

# Park factor per park, coloured by year, with a reference line at 1.
ggplot() +
  geom_point(
    data = pk_factors,
    aes(
      x = PARK_NAME,
      y = PK_FACTOR,
      col = YEAR
    )
  ) +
  # The intercept is a constant, so it is set directly rather than mapped
  # through aes().
  geom_hline(
    yintercept = 1,
    col = "goldenrod",
    size = 1
  ) +
  labs(
    x = "Park Name",
    y = "Park Factor",
    col = "Year"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.