Skip to content

Instantly share code, notes, and snippets.

Created August 4, 2015 14:46
What would you like to do?
Compute streaks of consecutive batters retired from Retrosheet play-by-play files
import pandas
import glob
if __name__ == '__main__':
events = pandas.concat([ pandas.read_csv(fn)
for fn in glob.glob("event*.csv") ],
events = events[events.BAT_EVENT_FL=="T"]
events["BAT_TEAM_ID"] = events["BAT_TEAM_ID"].replace("MON", "WAS")
events["BAT_TEAM_ID"] = events["BAT_TEAM_ID"].replace("CAL", "ANA")
events["BAT_TEAM_ID"] = events["BAT_TEAM_ID"].replace("FLO", "MIA")
games = pandas.concat([ pandas.read_csv(fn)
for fn in glob.glob("game*.csv") ],
events = pandas.merge(events, games, how='left', on='GAME_ID')
events.sort([ "RESP_PIT_ID", "BAT_TEAM_ID", "GAME_DT", "GAME_CT" ],
events["IS_OUT"] = ((events.EVENT_CD==2) | (events.EVENT_CD==3) |
events["NEW_STREAK"] = ((events.RESP_PIT_ID!=events.RESP_PIT_ID.shift()) |
(events.BAT_TEAM_ID!=events.BAT_TEAM_ID.shift()) |
events["STREAK_NO"] = events.NEW_STREAK.cumsum()
events["STREAK_LEN"] = events.groupby([ "STREAK_NO" ]).cumcount()+1
streaks = events[[ "RESP_PIT_ID", "BAT_TEAM_ID", "GAME_DT", "IS_OUT", "STREAK_NO", "STREAK_LEN" ]]
streaks = streaks.groupby([ "RESP_PIT_ID", "BAT_TEAM_ID", "STREAK_NO", "IS_OUT" ]).agg({ "GAME_DT": (min, max), "STREAK_LEN": max }).reset_index()
streaks.columns = [ "PITCHER", "OPPONENT", "STREAK_ID", "IS_OUT",
streaks.sort("STREAK_LEN", ascending=False, inplace=True)
people = pandas.read_csv("people.csv")
streaks = pandas.merge(streaks, people, how='left',
left_on=[ 'PITCHER' ],
right_on= [ 'KEY_RETRO' ])
streaks = streaks[[ "PITCHER", "NAME_LAST", "NAME_FIRST",
streaks[streaks.STREAK_LEN>=27].to_csv("streaks.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment