Skip to content

Instantly share code, notes, and snippets.

@rlankenau
Created July 29, 2013 16:35
Show Gist options
  • Save rlankenau/6105632 to your computer and use it in GitHub Desktop.
Save rlankenau/6105632 to your computer and use it in GitHub Desktop.
Pig LoadFunc for RetroSheet data.
/**
 * Returns the input format used to read the data for this loader.
 *
 * @return a freshly constructed {@code RetrosheetInputFormat}
 * @throws IOException declared by the overridden signature; not thrown by this body
 */
@Override
public InputFormat getInputFormat() throws IOException {
    final InputFormat retrosheetFormat = new RetrosheetInputFormat();
    return retrosheetFormat;
}
/**
 * Reads the next record from {@code reader} and converts it into one game tuple.
 *
 * <p>The record text is split into lines; each line is dispatched on its first
 * comma-separated field ("id", "start", "sub", "play", "info", ...). Game-level
 * values are written into a 41-field tuple, and every "play" line contributes a
 * 39-field tuple to the bag stored under {@code GAME_EVENTS}. A line that fails
 * to parse is reported on stderr and skipped; it does not abort the game.
 *
 * @return the populated game tuple, or {@code null} when the reader has no more records
 * @throws IOException if reading the record or finalising the tuple fails; the
 *         original exception is attached as the cause
 */
@Override
public Tuple getNext() throws IOException {
    RetrosheetPlayer[] home_players = new RetrosheetPlayer[11];
    RetrosheetPlayer[] away_players = new RetrosheetPlayer[11];
    RetrosheetPlayer[] defense = null;
    Hashtable<String, RetrosheetPlayer> players = new Hashtable<String, RetrosheetPlayer>();
    /* Counted while parsing runner outs, but not emitted anywhere yet. */
    int current_outs = 0;
    int home_score = 0;
    int away_score = 0;
    int atbat_of_game = 0;
    int event_of_game = 0;
    String last_batter = "";
    String current_batter = "";
    /* Runner ids indexed by base number (1..3); slot 0 is unused, "" means empty base. */
    String[] runner_on_base = new String[] {"", "", "", ""};
    System.out.println("getNext()");
    Tuple game = tupleFactory.newTuple(41);
    DataBag events = bagFactory.newDefaultBag();
    try {
        if (reader.nextKeyValue()) {
            Text v = (Text) reader.getCurrentValue();
            String record = v.toString();
            /* Tokenize on newlines; accept CRLF so a trailing '\r' never ends up in the last field. */
            for (String line : record.split("\r?\n")) {
                try {
                    String[] elems = line.split(",");
                    String linetype = elems[0].trim();
                    if (linetype.equals("com")) {
                        /* Comment. Skip for now */
                    } else if (linetype.equals("id")) {
                        /* Raw game id */
                        game.set(RetrosheetLoader.GAME_ID, elems[1]);
                        /* We can get the home team from the ID */
                        game.set(RetrosheetLoader.GAME_HOME_TEAM, elems[1].substring(0, 3));
                        /* Retrosheet ids are TTTYYYYMMDDN: team, 4-digit year, month, day, game number. */
                        game.set(RetrosheetLoader.GAME_DATE_YEAR, Integer.parseInt(elems[1].substring(3, 7)));
                        game.set(RetrosheetLoader.GAME_DATE_MONTH, Integer.parseInt(elems[1].substring(7, 9)));
                        game.set(RetrosheetLoader.GAME_DATE_DAY, Integer.parseInt(elems[1].substring(9, 11)));
                        int game_of_day = Integer.parseInt(elems[1].substring(11, 12));
                        switch (game_of_day) {
                            case 0:
                                /* First game of the day, not a double header */
                                game.set(RetrosheetLoader.GAME_OF_DAY, 1);
                                game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "no");
                                break;
                            case 1:
                                game.set(RetrosheetLoader.GAME_OF_DAY, 1);
                                game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "yes");
                                break;
                            case 2:
                                game.set(RetrosheetLoader.GAME_OF_DAY, 2);
                                game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "yes");
                                break;
                        }
                    } else if (linetype.equals("start")) {
                        /* Player start record. Add to the current players list */
                        registerLineupPlayer(elems, line, players, home_players, away_players);
                    } else if (linetype.equals("sub")) {
                        /* Player substitution. Replace the player in the list now */
                        registerLineupPlayer(elems, line, players, home_players, away_players);
                    } else if (linetype.equals("play")) {
                        try {
                            /* Play. Emit an event into the events list, update players on base, update score. */
                            Tuple currentPlay = tupleFactory.newTuple(39);
                            int possible_rbis = 0;
                            /* We can set event of game now. */
                            event_of_game++;
                            currentPlay.set(RetrosheetLoader.PLAY_EVENT_OF_GAME, event_of_game);
                            /* Check if the batter has changed (compare contents, not references). */
                            current_batter = elems[3].trim();
                            RetrosheetPlayer current_player = players.get(current_batter);
                            if (!current_batter.equals(last_batter)) {
                                atbat_of_game++;
                            }
                            last_batter = current_batter;
                            currentPlay.set(RetrosheetLoader.PLAY_ATBAT_OF_GAME, atbat_of_game);
                            /* Set inning and whether it is top or bottom */
                            currentPlay.set(RetrosheetLoader.PLAY_INNING, Integer.parseInt(elems[1]));
                            if (elems[2].trim().equals("0")) {
                                /* 0 = visiting team batting, so the home team is in the field. */
                                currentPlay.set(RetrosheetLoader.PLAY_INNING_HALF, "top");
                                defense = home_players;
                            } else {
                                currentPlay.set(RetrosheetLoader.PLAY_INNING_HALF, "bottom");
                                defense = away_players;
                            }
                            /* Set the fielders; lineup slots are indexed by fielding position. */
                            RetrosheetPlayer pitcher = defense[1];
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER, pitcher.player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_CATCHER, defense[2].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_FIRST_BASEMAN, defense[3].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_SECOND_BASEMAN, defense[4].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_THIRD_BASEMAN, defense[5].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_SHORTSTOP, defense[6].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_LEFTFIELDER, defense[7].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_CENTERFIELDER, defense[8].player_id);
                            currentPlay.set(RetrosheetLoader.PLAY_RIGHTFIELDER, defense[9].player_id);
                            if (defense.length >= 11 && defense[10] != null && defense[10].player_id != null) {
                                currentPlay.set(RetrosheetLoader.PLAY_DESIGNATED_HITTER, defense[10].player_id);
                            } else {
                                currentPlay.set(RetrosheetLoader.PLAY_DESIGNATED_HITTER, "");
                            }
                            /* Set the runners on base as they stood before this play */
                            currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_FIRST, runner_on_base[1]);
                            currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_SECOND, runner_on_base[2]);
                            currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_THIRD, runner_on_base[3]);
                            int number_on_base = 0;
                            for (int base = 1; base <= 3; base++) {
                                if (!runner_on_base[base].equals("")) {
                                    number_on_base++;
                                }
                            }
                            currentPlay.set(RetrosheetLoader.PLAY_RUNNERS_ON_BASE, number_on_base);
                            currentPlay.set(RetrosheetLoader.PLAY_CURRENT_BATTER, current_batter);
                            current_player.at_bat_number++;
                            currentPlay.set(RetrosheetLoader.PLAY_CURRENT_BATTER_AT_BAT, current_player.at_bat_number);
                            currentPlay.set(RetrosheetLoader.PLAY_BATTER_POSITION, current_player.position);
                            try {
                                /* Two-digit count field: tens digit and units digit are reported as "t-u". */
                                int count = Integer.parseInt(elems[4]);
                                currentPlay.set(RetrosheetLoader.PLAY_COUNT, count / 10 + "-" + count % 10);
                            } catch (Exception e) {
                                currentPlay.set(RetrosheetLoader.PLAY_COUNT, "Unknown");
                            }
                            currentPlay.set(RetrosheetLoader.PLAY_BATTER_HITS_SO_FAR, current_player.hits_so_far);
                            currentPlay.set(RetrosheetLoader.PLAY_BATTER_HBP_SO_FAR, current_player.hbp_so_far);
                            currentPlay.set(RetrosheetLoader.PLAY_BATTER_WALKS_SO_FAR, current_player.walks_so_far);
                            currentPlay.set(RetrosheetLoader.PLAY_BATTER_OUTS_SO_FAR, current_player.outs_so_far);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_BATTERS_PITCHED_TO, pitcher.batters_pitched_to);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_HITS_ALLOWED, pitcher.pitcher_hits_allowed);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_WALKS_ALLOWED, pitcher.pitcher_walks_allowed);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_WILD_PITCHES, pitcher.pitcher_wild_pitches);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_BATTERS_BEANED, pitcher.pitcher_beans);
                            currentPlay.set(RetrosheetLoader.PLAY_PITCHER_STRIKEOUTS, pitcher.pitcher_strikeouts);
                            currentPlay.set(RetrosheetLoader.PLAY_HOME_SCORE, home_score);
                            currentPlay.set(RetrosheetLoader.PLAY_AWAY_SCORE, away_score);
                            /* Parse the event itself */
                            Matcher m = event_pattern.matcher(elems[6]);
                            if (!m.matches()) {
                                System.err.println("Couldn't parse event data: " + elems[6]);
                            } else {
                                /* Figure out player movement so we can update everything in order */
                                if (m.groupCount() >= 6 && m.group(5) != null && !m.group(5).equals("")) {
                                    /* We have some player movement */
                                    String[] runner_mvmt = m.group(5).split(";");
                                    /*
                                     * Handle the lead runner first (3, then 2, then 1), scanning the whole
                                     * list each time in case the movement entries are out of order.
                                     */
                                    for (int base = 3; base > 0; base--) {
                                        for (int j = 0; j < runner_mvmt.length; j++) {
                                            String move = runner_mvmt[j];
                                            if (!move.startsWith("" + base)) {
                                                continue;
                                            }
                                            /* Second character: "X" = runner put out, "-" = runner advanced. */
                                            String kind = move.substring(1, 2);
                                            if (kind.equals("X")) {
                                                current_outs++;
                                                runner_on_base[base] = "";
                                            } else if (kind.equals("-")) {
                                                String destination = move.substring(2, 3);
                                                if (destination.equals("H")) {
                                                    /* Can't credit an RBI yet. Save as conditional RBI */
                                                    possible_rbis++;
                                                    if (current_player.home_team) {
                                                        home_score++;
                                                    } else {
                                                        away_score++;
                                                    }
                                                    runner_on_base[base] = "";
                                                } else {
                                                    /* Parse before touching state so a bad entry leaves the bases intact. */
                                                    int newbase = Integer.parseInt(destination);
                                                    String moving_runner = runner_on_base[base];
                                                    runner_on_base[base] = "";
                                                    if (newbase >= 1 && newbase <= 3) {
                                                        runner_on_base[newbase] = moving_runner;
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                                /* Recorded before this play's RBIs are credited below. */
                                currentPlay.set(RetrosheetLoader.PLAY_BATTER_RBIS, current_player.rbis);
                                String result = m.group(1);
                                if (result != null) {
                                    if (result.equals("S")) {
                                        runner_on_base[1] = current_batter;
                                        pitcher.pitcher_hits_allowed++;
                                        current_player.rbis += possible_rbis;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Single");
                                    } else if (result.equals("D") || result.equals("DGR")) {
                                        runner_on_base[2] = current_batter;
                                        pitcher.pitcher_hits_allowed++;
                                        current_player.rbis += possible_rbis;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Double");
                                    } else if (result.equals("T")) {
                                        runner_on_base[3] = current_batter;
                                        pitcher.pitcher_hits_allowed++;
                                        current_player.rbis += possible_rbis;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Triple");
                                    } else if (result.equals("HR")) {
                                        /* The batter's own run; runners already scored via the movement list. */
                                        if (current_player.home_team) {
                                            home_score++;
                                        } else {
                                            away_score++;
                                        }
                                        pitcher.pitcher_hits_allowed++;
                                        current_player.rbis += possible_rbis;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Home run");
                                    } else if (result.equals("HP")) {
                                        runner_on_base[1] = current_batter;
                                        pitcher.pitcher_beans++;
                                        current_player.hbp_so_far++;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Hit by pitch");
                                    } else if (result.equals("WP")) {
                                        pitcher.pitcher_wild_pitches++;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Wild pitch");
                                    } else if (result.equals("W") || result.equals("IW")) {
                                        runner_on_base[1] = current_batter;
                                        pitcher.pitcher_walks_allowed++;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Walk");
                                    } else if (result.equals("K")) {
                                        pitcher.pitcher_strikeouts++;
                                        current_player.strikeouts_so_far++;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Strikeout");
                                    } else if (result.equals("NP")) {
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "No Play");
                                    } else if (result.equals("")) {
                                        /* Out */
                                        current_player.outs_so_far++;
                                        currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Out");
                                    }
                                    if (m.group(2) != null) {
                                        /* This is fielder - 1-9 */
                                        currentPlay.set(RetrosheetLoader.PLAY_FIELDER, m.group(2));
                                    } else {
                                        currentPlay.set(RetrosheetLoader.PLAY_FIELDER, "0");
                                    }
                                    if (m.group(3) != null) {
                                        /* Type of ball hit: "L" line drive, "G" grounder, etc. */
                                        switch (m.group(3).charAt(0)) {
                                            case 'L': currentPlay.set(PLAY_TRAJECTORY, "Line drive"); break;
                                            case 'F': currentPlay.set(PLAY_TRAJECTORY, "Fly ball"); break;
                                            case 'G': currentPlay.set(PLAY_TRAJECTORY, "Grounder"); break;
                                            case 'P': currentPlay.set(PLAY_TRAJECTORY, "Pop fly"); break;
                                            case 'B': currentPlay.set(PLAY_TRAJECTORY, "Bunt"); break;
                                            default: currentPlay.set(PLAY_TRAJECTORY, m.group(3));
                                        }
                                    } else {
                                        currentPlay.set(PLAY_TRAJECTORY, "0");
                                    }
                                } else {
                                    /* Out */
                                    current_player.outs_so_far++;
                                    currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Out");
                                }
                                /* Write out rbis on this play. */
                                currentPlay.set(RetrosheetLoader.PLAY_RBIS_ON_PLAY, possible_rbis);
                            }
                            events.add(currentPlay);
                        } catch (Exception e) {
                            System.err.println("Error with play: " + e);
                            e.printStackTrace();
                        }
                    } else if (linetype.equals("version")) {
                        /* File version info. Skip for now */
                    } else if (linetype.equals("info")) {
                        /* Game info. Add to the output tuple */
                        String infotype = elems[1].trim();
                        if (infotype.equals("hometeam")) {
                            /* Ignore, this is already set by ID */
                        } else if (infotype.equals("site")) {
                            game.set(RetrosheetLoader.GAME_SITE, elems[2]);
                        } else if (infotype.equals("date")) {
                            /* Already set in the ID */
                        } else if (infotype.equals("number")) {
                            /* Already set in the ID */
                        } else if (infotype.equals("daynight")) {
                            game.set(RetrosheetLoader.GAME_DAY_NIGHT, elems[2]);
                        } else if (infotype.equals("starttime")) {
                            String[] time_elems = elems[2].split(":");
                            int hour = 0, minutes = 0;
                            if (time_elems.length == 2) {
                                hour = Integer.parseInt(time_elems[0]);
                                minutes = Integer.parseInt(time_elems[1].substring(0, 2));
                                /* Whatever follows the minutes is the AM/PM marker, if any. */
                                String meridiem = time_elems[1].substring(2).trim();
                                if (meridiem.equals("AM")) {
                                    /* 12:xxAM is just after midnight. */
                                    if (hour == 12) {
                                        hour = 0;
                                    }
                                } else if (meridiem.length() > 0 && hour < 12) {
                                    /* Any other marker is treated as PM; 12:xxPM is already correct. */
                                    hour += 12;
                                }
                                /* No marker at all: the value is taken as already being 24-hour time. */
                            } else {
                                /* No colon: treat the last two digits as minutes, the rest as the hour. */
                                if (elems[2].length() > 2) {
                                    int length = elems[2].length();
                                    minutes = Integer.parseInt(elems[2].substring(length - 2, length));
                                    hour = Integer.parseInt(elems[2].substring(0, length - 2));
                                }
                            }
                            game.set(RetrosheetLoader.GAME_START_HOUR, hour);
                            game.set(RetrosheetLoader.GAME_START_MINUTES, minutes);
                        } else if (infotype.equals("visteam")) {
                            game.set(RetrosheetLoader.GAME_AWAY_TEAM, elems[2]);
                        } else if (infotype.equals("usedh")) {
                            game.set(RetrosheetLoader.GAME_USE_DESIGNATED_HITTER, elems[2]);
                        } else if (infotype.equals("umphome")) {
                            game.set(RetrosheetLoader.GAME_HOME_UMPIRE, elems[2]);
                        } else if (infotype.equals("ump1b")) {
                            game.set(RetrosheetLoader.GAME_1ST_BASE_UMPIRE, elems[2]);
                        } else if (infotype.equals("ump2b")) {
                            game.set(RetrosheetLoader.GAME_2ND_BASE_UMPIRE, elems[2]);
                        } else if (infotype.equals("ump3b")) {
                            game.set(RetrosheetLoader.GAME_3RD_BASE_UMPIRE, elems[2]);
                        } else if (infotype.equals("umplf")) {
                            game.set(RetrosheetLoader.GAME_LEFT_FIELD_UMPIRE, elems[2]);
                        } else if (infotype.equals("umprf")) {
                            game.set(RetrosheetLoader.GAME_RIGHT_FIELD_UMPIRE, elems[2]);
                        } else if (infotype.equals("wp")) {
                            game.set(RetrosheetLoader.GAME_WINNING_PITCHER, elems[2]);
                        } else if (infotype.equals("lp")) {
                            game.set(RetrosheetLoader.GAME_LOSING_PITCHER, elems[2]);
                        } else if (infotype.equals("howscored")) {
                            game.set(RetrosheetLoader.GAME_HOW_SCORED, elems[2]);
                        } else if (infotype.equals("scorer")) {
                            game.set(RetrosheetLoader.GAME_SCORER, elems[2]);
                        } else if (infotype.equals("inputter")) {
                            game.set(RetrosheetLoader.GAME_INPUTTER, elems[2]);
                        } else if (infotype.equals("translator")) {
                            game.set(RetrosheetLoader.GAME_TRANSLATOR, elems[2]);
                        } else if (infotype.equals("pitches")) {
                            game.set(RetrosheetLoader.GAME_HAS_PITCHES, elems[2]);
                        } else if (infotype.equals("winddir")) {
                            game.set(RetrosheetLoader.GAME_WIND_DIRECTION, elems[2]);
                        } else if (infotype.equals("windspeed")) {
                            game.set(RetrosheetLoader.GAME_WIND_SPEED, Integer.parseInt(elems[2]));
                        } else if (infotype.equals("temp")) {
                            game.set(RetrosheetLoader.GAME_TEMPERATURE, Integer.parseInt(elems[2]));
                        } else if (infotype.equals("sky")) {
                            game.set(RetrosheetLoader.GAME_SKY_CONDITION, elems[2]);
                        } else if (infotype.equals("fieldcond")) {
                            game.set(RetrosheetLoader.GAME_FIELD_CONDITION, elems[2]);
                        } else if (infotype.equals("precip")) {
                            game.set(RetrosheetLoader.GAME_PRECIPITATION, elems[2]);
                        } else if (infotype.equals("attendance")) {
                            game.set(RetrosheetLoader.GAME_ATTENDANCE, Integer.parseInt(elems[2]));
                        } else if (infotype.equals("timeofgame")) {
                            game.set(RetrosheetLoader.GAME_DURATION, Integer.parseInt(elems[2]));
                        } else if (infotype.equals("save")) {
                            game.set(RetrosheetLoader.GAME_COUNTED_AS_SAVE, elems[2]);
                        }
                    } else if (linetype.equals("data")) {
                        /* Other game data. Generally earned runs for the pitchers */
                    }
                } catch (Exception e) {
                    /* One bad line must not discard the rest of the game. */
                    System.err.println("Malformed data: '" + line + "' exception: " + e);
                }
            }
            game.set(RetrosheetLoader.GAME_EVENTS, events);
            game.set(RetrosheetLoader.GAME_FINAL_HOME_SCORE, home_score);
            game.set(RetrosheetLoader.GAME_FINAL_AWAY_SCORE, away_score);
            game.set(RetrosheetLoader.GAME_EVENTS_IN_GAME, event_of_game);
            game.set(RetrosheetLoader.GAME_BATTERS_IN_GAME, atbat_of_game);
            /* A tie leaves GAME_WINNER unset. */
            if (home_score > away_score) {
                game.set(RetrosheetLoader.GAME_WINNER, game.get(RetrosheetLoader.GAME_HOME_TEAM));
            } else if (away_score > home_score) {
                game.set(RetrosheetLoader.GAME_WINNER, game.get(RetrosheetLoader.GAME_AWAY_TEAM));
            }
            return game;
        }
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            /* Keep the interrupt visible to the caller's thread after wrapping it. */
            Thread.currentThread().interrupt();
        }
        throw new IOException("Error parsing", e);
    }
    return null;
}

/**
 * Registers the player described by a "start" or "sub" record in the lookup
 * table and, when the record's position fits the lineup arrays, installs the
 * player in that slot of his team's lineup (replacing whoever was there).
 *
 * <p>Positions outside the array are skipped on purpose: the player stays
 * reachable through {@code players} but occupies no lineup slot. Any other
 * failure is reported on stderr and the record is ignored.
 *
 * @param elems        the comma-separated fields of the record
 * @param line         the raw record text, used only for the error message
 * @param players      lookup table keyed by player id
 * @param home_players home lineup, indexed by position
 * @param away_players visiting lineup, indexed by position
 */
private void registerLineupPlayer(String[] elems, String line,
        Hashtable<String, RetrosheetPlayer> players,
        RetrosheetPlayer[] home_players, RetrosheetPlayer[] away_players) {
    try {
        RetrosheetPlayer p = new RetrosheetPlayer(elems);
        players.put(p.player_id, p);
        RetrosheetPlayer[] lineup = p.home_team ? home_players : away_players;
        if (p.position >= 0 && p.position < lineup.length) {
            lineup[p.position] = p;
        }
    } catch (Exception e) {
        System.err.println("Couldn't parse player record: '" + line + "' exception: " + e);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment