Skip to content

Instantly share code, notes, and snippets.

@andresAlvarado
Last active April 3, 2017 01:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save andresAlvarado/b63f861dac5b2e12bcd059fa08ec410f to your computer and use it in GitHub Desktop.
Save andresAlvarado/b63f861dac5b2e12bcd059fa08ec410f to your computer and use it in GitHub Desktop.
library( package = 'dplyr' )
library( package = 'data.table' )
# Column classes passed to fread() via colClasses: one entry per column of the
# CSV, in file order. A real class keeps the column; "NULL" leaves it out.

# Event file: 158 entries, keeping columns 1, 15, 41 and 100.
l_e_cols <- c(
  "character",       # GAME_ID
  rep("NULL", 13),
  "character",       # PIT_ID
  rep("NULL", 25),
  "numeric",         # EVENT_OUTS_CT
  rep("NULL", 58),
  "character",       # FLD_TEAM_ID
  rep("NULL", 58)
)

# Game file: 179 entries, keeping columns 9 and 86.
l_g_cols <- c(
  rep("NULL", 8),
  "character",       # HOME_TEAM_ID
  rep("NULL", 76),
  "character",       # HOME_TEAM_LEAGUE_ID
  rep("NULL", 93)
)

# Names given to the kept columns, in the same order as they appear above.
# Both files expose their team column as TEAM_ID.
l_e_names <- c("GAME_ID", "PIT_ID", "EVENT_OUTS", "TEAM_ID")
l_g_names <- c("TEAM_ID", "LEAGUE_ID")
# Load the 1990 season event file into the environment.
# Only the columns marked in l_e_cols are read; they are renamed to l_e_names.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
d_e_1990 <- fread(
  input = "all1990.csv",
  sep = ",",
  header = TRUE,
  colClasses = l_e_cols,
  col.names = l_e_names
)

# Load the 1990 season game file into the environment.
# Only the columns marked in l_g_cols are read; they are renamed to l_g_names.
d_g_1990 <- fread(
  input = "games1990.csv",
  sep = ",",
  header = TRUE,
  colClasses = l_g_cols,
  col.names = l_g_names
)
# The game dataset repeats the same team/league pair many times, so reduce it
# to one row per distinct (TEAM_ID, LEAGUE_ID) combination.
d_g_1990 <- d_g_1990 %>%
  distinct(TEAM_ID, LEAGUE_ID)

# Attach the league to every event row by matching on TEAM_ID; event rows with
# no matching team in the game dataset are dropped by the inner join.
d_1990 <- d_e_1990 %>%
  inner_join(d_g_1990, by = "TEAM_ID")
# Convert a count of outs into innings pitched (IP) in box-score notation:
# completed innings before the decimal point and the leftover outs (0, 1 or 2)
# as tenths, e.g. 4 outs -> 1.1 and 27 outs -> 9.
# The remainder is taken against the integer 3 and then scaled. Taking `%%`
# against 0.3 directly depends on the binary representation of 0.3 and is only
# approximately right.
outs_to_ip <- function(outs) {
  outs %/% 3 + (outs %% 3) / 10
}

# Total the outs (O) within each group of `data` and add the matching IP column.
# `data` is expected to carry an EVENT_OUTS column; missing values are ignored.
summarise_ip <- function(data) {
  data %>%
    summarise(O = sum(EVENT_OUTS, na.rm = TRUE)) %>%
    mutate(IP = outs_to_ip(O))
}

# Get IP by every pitcher for every game played in the 1990 season.
# NOTE: summarise() drops only the last grouping level by default, so this
# result is still grouped by GAME_ID (unchanged from the previous behaviour).
d_g_ip <- d_1990 %>%
  group_by(GAME_ID, PIT_ID) %>%
  summarise_ip()

# Get IP by every pitcher in the 1990 season.
d_s_ip <- d_1990 %>%
  group_by(PIT_ID) %>%
  summarise_ip()

# Get IP by every team in the 1990 season.
# NOTE: as above, this result is still grouped by TEAM_ID.
d_t_ip <- d_1990 %>%
  group_by(TEAM_ID, LEAGUE_ID) %>%
  summarise_ip()

# Get IP by the AL and NL leagues in the 1990 season.
d_l_ip <- d_1990 %>%
  group_by(LEAGUE_ID) %>%
  summarise_ip()

# Get the total of IP in the 1990 season.
# A constant YEAR column gives a single group covering every row.
d_mlb_ip <- d_1990 %>%
  mutate(YEAR = 1990) %>%
  group_by(YEAR) %>%
  summarise_ip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment