Skip to content

Instantly share code, notes, and snippets.

@MonkmanMH
Last active August 29, 2015 14:04
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MonkmanMH/3c0da6afd58eb61e2c51 to your computer and use it in GitHub Desktop.
Save MonkmanMH/3c0da6afd58eb61e2c51 to your computer and use it in GitHub Desktop.
dplyr testing and goofing
#
# setwd("D:/R_the software/datatrials/Lahman")
#
# Attach the packages up front. library() stops with an error when a package
# is missing, whereas require() only returns FALSE and lets later lines fail
# with a less helpful message.
library(Lahman)
library(dplyr)
#
# throwing by position
#
# Join the player table (Master) to the fielding records on playerID, once
# with base merge() and once with dplyr, timing each approach.
# NOTE(review): recent Lahman releases appear to ship the player table as
# `People` rather than `Master` -- confirm against the installed version.
#
# version 1 - base "merge". The assignment inside system.time() is still
# performed, so the join only needs to be run once to be both timed and stored.
system.time(MasterFielding <- merge(Master, Fielding, by = "playerID"))
# version 2 - the dplyr join (noted as the faster of the two). This overwrites
# MasterFielding, so the dplyr result is what the rest of the script uses.
system.time(MasterFielding <- inner_join(Fielding, Master, by = "playerID"))
#
#
# a count of games played, by position
#
# Keep rows whose position is not "OF" and whose season is after 1944.
# yearID is compared with the number 1944: comparing against the string
# "1944" coerces the years to text and compares them lexicographically, which
# only gives the right answer while every year has exactly four digits.
MasterFielding <- subset(MasterFielding, POS != "OF" & yearID > 1944)
#
# Sum games (G) per player / position / throwing hand and show the five
# largest totals. %>% replaces the deprecated %.% chain operator, and the
# result is ungrouped before sorting so the ranking is over all rows.
MasterFielding %>%
  group_by(playerID, POS, throws) %>%
  summarise(gamecount = sum(G)) %>%
  ungroup() %>%
  arrange(desc(gamecount)) %>%
  head(5)
#
# Rebuild the joined table from scratch so this section does not depend on
# the filtering done above.
MasterFielding <- inner_join(Fielding, Master, by = "playerID")
# select only those seasons since 1945 and
# omit the records that are OF summary (i.e. leave the RF, CF, and LF)
# yearID is compared numerically; the string "1944" would force a
# lexicographic comparison of the years.
MasterFielding <- subset(MasterFielding, POS != "OF" & yearID > 1944)
#
# Total games (G) for every player / position / throwing-hand combination.
# %>% replaces the deprecated %.% chain operator.
Player_games <- MasterFielding %>%
  group_by(playerID, POS, throws) %>%
  summarise(gamecount = sum(G))
# Number of rows of Player_games (one per player within a position / throwing
# hand) for each position and throwing hand. n() is the dplyr idiom for the
# group size and gives the same value as length(gamecount).
Player_POS <- Player_games %>%
  group_by(POS, throws) %>%
  summarise(playercount = n())
head(Player_POS)
#
#
#
# ################################
#
# from dplyr reference manual
# http://cran.r-project.org/web/packages/dplyr/index.html
require(dplyr)
#
# tally() examples, run only when the Lahman data package can be attached.
# Inside the braces only the value of the last expression is auto-printed when
# the script is run at top level.
if (require("Lahman")) {
  # as_tibble() replaces the deprecated tbl_df() wrapper.
  batting_tbl <- as_tibble(Batting)
  tally(group_by(batting_tbl, yearID))
  tally(group_by(batting_tbl, yearID), sort = TRUE)
  # Multiple tallys progressively roll up the groups
  plays_by_year <- tally(group_by(batting_tbl, playerID, stint), sort = TRUE)
  tally(plays_by_year, sort = TRUE)
  tally(tally(plays_by_year))
  # This looks a little nicer if you use the pipe (%>% replaces the
  # deprecated infix %.% operator)
  batting_tbl %>%
    group_by(playerID) %>%
    tally(sort = TRUE)
}
#
#
# ########################################
# DPLYR experimentation
#
require(Lahman)
require(vcd)
require(dplyr)
#
# Build a per-player summary in two steps, keeping the grouped intermediate
# (PlayerFielding) available for further summaries.
PlayerFielding <- group_by(Fielding, playerID)
# n() is the dplyr idiom for the number of rows in each group; it gives the
# same value as length(POS).
# NOTE(review): this counts Fielding rows per player, not distinct yearID
# values -- confirm that is what "seasons" is meant to be here.
PlayerSeasons <- summarise(PlayerFielding, total = n())
head(PlayerSeasons)
#
#
# Five players with the largest sum of games (G) over all their Batting rows.
# %>% replaces the deprecated %.% chain operator.
Batting %>%
  group_by(playerID) %>%
  summarise(total = sum(G)) %>%
  arrange(desc(total)) %>%
  head(5)
#
# Five largest player / position totals of games (G) in the Fielding table.
# %>% replaces the deprecated %.% chain operator; the summary is ungrouped
# before sorting so the ranking is over all rows rather than within a player.
Fielding %>%
  group_by(playerID, POS) %>%
  summarise(total = sum(G)) %>%
  ungroup() %>%
  arrange(desc(total)) %>%
  head(5)
#
# Same player / position totals, stored as a table sorted from most to fewest
# games. %>% replaces the deprecated %.% chain operator; ungroup() makes the
# sort a single ranking over all rows rather than one within each player.
PlayerGames <- Fielding %>%
  group_by(playerID, POS) %>%
  summarise(total = sum(G)) %>%
  ungroup() %>%
  arrange(desc(total))
head(PlayerGames)
#
# create MasterFielding using dplyr: every Fielding row joined to the matching
# Master row on playerID
MasterFielding <- inner_join(Fielding, Master, by = "playerID")
#
# summarize MasterFielding by (a) filter only years > 1945 and
# (b) select specific variables
# %>% replaces the deprecated %.% chain operator.
# NOTE(review): this cutoff drops the 1945 season, while the earlier sections
# of this script keep everything after 1944 -- confirm which one is intended.
MasterFielding45 <- MasterFielding %>%
  filter(yearID > 1945) %>%
  select(playerID, yearID, POS, G, throws)
head(MasterFielding45)
#
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment