# Parse the Play by Play file
#
# Source: GitHub gist tangotiger/ad3fcd370dcda1d39496 (last active February 7, 2016 21:28).
# NOTE: GitHub flagged the original file as containing hidden/bidirectional
# Unicode characters; open it in an editor that reveals hidden characters
# before editing any string literal.
import re

from bs4 import BeautifulSoup
print("Parse start")

# Run configuration. The game ids are turned into an inclusive range of
# zero-padded 4-digit game numbers by the driver loop at the bottom of the file.
first_game_id = 1  # +665
last_game_id = 668  # -2
# Written verbatim into the SEASON_ID column of every output row.
season_id = '20152016'
# Written into SUBSEASON_ID and used in the data file names; process_play()
# only treats period 5 as a shootout when this is '02'.
subseason_id = '02'
# Two-letter prefix used in every source/target file name.
datafile_id = 'PL'
# Pipe-delimited header for the leading columns that the driver loop writes
# for every event (before process_play() appends the event-specific columns).
header_row_main = '|'.join((
    'SEASON_ID',
    'SUBSEASON_ID',
    'GAME_ID',
    'EVENT_ID',
    'PERIOD_CD',
    'STRENGTH_CD',
    'GAME_TM',
    'PLAY_TM',
    'REMAIN_TM',
    'EVENT_CD',
))
# Pipe-delimited header for the event-specific columns written by
# process_play(). Every column name is preceded by '|' so this string can be
# appended directly to header_row_main.
header_row_rest = ''.join('|' + column for column in (
    'FULL_EVENT_CD',
    'PLAY_ELEMENT_CT',
    'TEAM_ID',
    'JERSEY_ID',
    'PLAYER_TX',
    'ASSIST_JERSEY_ID',
    'ASSIST_PLAYER_TX',
    'ASSIST2_JERSEY_ID',
    'ASSIST2_PLAYER_TX',
    'MATE_JERSEY_ID',
    'MATE_PLAYER_TX',
    'OPP_TEAM_ID',
    'OPP_JERSEY_ID',
    'OPP_PLAYER_TX',
    'AWAY_TEAM_ID',
    'AWAY_JERSEY_ID',
    'AWAY_PLAYER_TX',
    'HOME_TEAM_ID',
    'HOME_JERSEY_ID',
    'HOME_PLAYER_TX',
    'WINNER_TEAM_ID',
    'ZONE_CD',
    'SHOT_CD',
    'OUTCOME_SHOT_CD',
    'DISTANCE_CT',
    'PENALTY_CD',
    'MINUTES_PENALTY_CT',
    'MINUTES2_PENALTY_CT',
    'REASON_CD',
    'REASON2_CD',
    'REASON3_CD',
    'RESULT_CD',
    'HOURS_TM',
    'MINUTES_TM',
    'TIMEZONE_CD',
    'TIMEZONE_TX',
    'SEASON_GOAL_CT',
    'SEASON_ASSIST_CT',
    'SEASON_ASSIST2_CT',
))
# Ordered rewrite rules applied by parse_play(). Each entry is
# (regex pattern, replacement); all patterns are matched case-insensitively.
# ORDER MATTERS: longer/more specific phrases must come before their prefixes
# (e.g. 'CROSS CHECKING' before 'CROSS CHECK'), and the delimiter rules must
# run last, after multi-word phrases have been glued together with '_'.
_PLAY_TX_RULES = tuple(
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        # players with multi names collapsed into one name, temporarily
        (r'DI GIUSEPPE', 'DI_GIUSEPPE'),
        (r'DE HAAN', 'DE_HAAN'),
        (r'DE LA ROSE', 'DE_LA_ROSE'),
        (r'VAN RIEMSDYK', 'VAN_RIEMSDYK'),
        (r'DEL ZOTTO', 'DEL_ZOTTO'),
        # special keywords to merge with prior words
        # (the '.' is escaped: it must be a literal period, not a wildcard)
        (r'\. Zone', '_Zone'),
        (r' Start-', '_Start'),
        (r' End-', '_End'),
        (r' time:', '_Time'),
        # create a new keyword
        (r' #', ' Jersey_Id '),
        (r' min\)', ' minutes '),
        # literal "ft." only; an unescaped '.' would also rewrite e.g. "CRAFT "
        (r'ft\.', 'feet'),
        # redundant keywords
        (r'- double minor', ''),
        # literal "(maj)"; unescaped parentheses would strip "maj" from any word
        (r'\(maj\)', ''),
        # keywords with multiple words will be collapsed into one word
        (r'BLOCKED BY', 'BLOCKED_BY'),
        (r'CLOCK PROBLEM', 'CLOCK_PROBLEM'),
        (r'Early Intermission', 'Early_Intermission'),
        (r'HAND PASS', 'HAND_PASS'),
        (r'HIT CROSSBAR', 'HIT_CROSSBAR'),
        (r'GOAL INTERFERENCE', 'GOAL_INTERFERENCE'),
        (r'GOAL OVERTURNED', 'GOAL_OVERTURNED'),
        (r'GOALIE STOPPED', 'GOALIE_STOPPED'),
        (r'HOME TIMEOUT', 'HOME_TIMEOUT'),
        (r'ICE PROBLEM', 'ICE_PROBLEM'),
        (r'NET OFF', 'NET_OFF'),
        (r'OBJECTS ON ICE', 'OBJECTS_ONICE'),
        (r'OFFICIAL INJURY', 'OFFICIAL_INJURY'),
        (r'OFF-SIDE', 'OFFSIDE'),
        (r'OVER NET', 'OVER_NET'),
        (r'PENALTY SHOT', 'PENALTY_SHOT'),
        (r'PLAYER EQUIPMENT', 'PLAYER_EQUIPMENT'),
        (r'PLAYER INJURY', 'PLAYER_INJURY'),
        (r'PUCK FROZEN', 'PUCK_FROZEN'),
        (r'PUCK IN BENCHES', 'PUCK_INBENCHES'),
        (r'PUCK IN CROWD', 'PUCK_INCROWD'),
        (r'PUCK IN NETTING', 'PUCK_INNETTING'),
        (r'REFEREE OR LINESMAN', 'REFEREE_LINESMAN'),
        (r'RINK REPAIR', 'RINK_REPAIR'),
        (r'SHOOTOUT COMPLETED', 'SHOOTOUT_COMPLETED'),
        (r'TV TIMEOUT', 'TV_TIMEOUT'),
        (r'VIDEO REVIEW', 'VIDEO_REVIEW'),
        (r'VISITOR TIMEOUT', 'VISITOR_TIMEOUT'),
        (r'Wide Of Net', 'WIDE_OFNET'),
        # penalties
        (r'ABUSE OF OFFICIALS - BENCH', 'ABUSE_OFFICIALS'),
        (r'ABUSE OF OFFICIALS', 'ABUSE_OFFICIALS'),
        (r'ABUSIVE LANGUAGE - BENCH', 'ABUSIVE_LANGUAGE'),
        (r'ABUSIVE LANGUAGE', 'ABUSIVE_LANGUAGE'),
        (r'BROKEN STICK', 'BROKEN_STICK'),
        (r'Checking from behind', 'CHECKING_FROMBEHIND'),
        (r'Concealing Puck', 'CONCEALING_PUCK'),
        (r'Closing Hand On Puck', 'CLOSINGHAND_ONPUCK'),
        (r'CROSS CHECKING', 'CROSS_CHECK'),
        (r'CROSS CHECK', 'CROSS_CHECK'),
        (r'Delay Gm - Face-off Violation', 'DELAY_OFGAME'),
        (r'Delay of game - bench', 'DELAY_OFGAME'),
        (r'Delay of game-bench', 'DELAY_OFGAME'),
        (r'DELAY OF GAME', 'DELAY_OFGAME'),
        # The '.' in the next two patterns is left as a wildcard exactly as in
        # the original; the surrounding phrase is specific enough to be safe.
        (r'DELAYING GAME-ILL.PLAY GOALIE', 'DELAY_OFGAME'),
        (r'Delaying Game-Ill. play goalie', 'DELAY_OFGAME'),
        (r'Delaying Game-Puck over glass', 'DELAY_OFGAME'),
        (r'Delaying Game-Smothering puck', 'DELAY_OFGAME'),
        (r'Delaying Game', 'DELAY_OFGAME'),
        (r'DELAYING THE GAME', 'DELAY_OFGAME'),
        (r'Face-off violation-bench', 'DELAY_OFGAME'),
        (r'Face-off violation', 'DELAY_OFGAME'),
        (r'DRAWN BY', 'DRAWN_BY'),
        (r'GAME MISCONDUCT', 'GAME_MISCONDUCT'),
        (r'Goalie leave crease', 'GOALIE_CREASE'),
        (r'HI-STICKING', 'HIGH_STICK'),
        (r'HI STICK', 'HIGH_STICK'),
        (r'HIGH STICK', 'HIGH_STICK'),
        (r'HOLDING THE STICK', 'HOLDING_STICK'),
        (r'ILLEGAL STICK', 'ILLEGAL_STICK'),
        (r'Illegal check to head', 'CHECK_HEAD'),
        (r'Instigator - face shield', 'INSTIGATOR_FACESHIELD'),
        (r'Instigator - Misconduct', 'INSTIGATOR_MISCONDUCT'),
        (r'Interference - Goalkeeper', 'INTERFERENCE_GOALIE'),
        (r'Interference on goalkeeper', 'INTERFERENCE_GOALIE'),
        (r'Interference on goalie', 'INTERFERENCE_GOALIE'),
        (r'Leaving penalty box - bench', 'TOOMANY_MEN'),
        (r'Match Penalty', 'MATCH_PENALTY'),
        (r'PS-Covering puck in crease', 'PENALTYSHOT_COVERPUCK'),
        (r'PS-Hooking on breakaway', 'PENALTYSHOT_HOOKING'),
        (r'PS-Slash on breakaway', 'PENALTYSHOT_SLASH'),
        (r'PS-Thow object at puck', 'PENALTYSHOT_THROWOBJECT'),  # misspelling matched on purpose
        (r'PS-Throw object at puck', 'PENALTYSHOT_THROWOBJECT'),
        (r'PS-Tripping on breakaway', 'PENALTYSHOT_TRIPPING'),
        (r'Served By: ', 'SERVED_BY_'),
        (r'PREMATURE SUBSTITUTION', 'PREMATURE_SUBSTITUTION'),
        (r'Too many men/ice - bench', 'TOOMANY_MEN'),
        (r'UNSPORTSMANLIKE CONDUCT', 'UNSPORTSMANLIKE_CONDUCT'),
        # challenges
        (r'CHLG HM', 'CHALLENGE_HOME'),
        (r'CHLG LEAGUE', 'CHALLENGE_LEAGUE'),
        (r'CHLG VIS', 'CHALLENGE_AWAY'),
        # Prepare delimiter of space
        (r' - ', ' '),
        (r'- ', ' '),
        (r', ', ' '),
        (r',', ' '),
        (r': ', ' '),
        (r'; ', ' '),
        (r':', ' '),  # used for HH:MM
        (r'\(', ' '),
        (r'\)', ''),
        # "hidden character" in the original source. The literal was invisible;
        # NOTE(review): assumed to be a non-breaking space (from &nbsp; in the
        # HTML) -- confirm against the original file's bytes.
        ('\xa0', ' '),
        (r' +', ' '),  # turn multiple spaces into one
    )
)


def parse_play(play_tx):
    """Tokenise one play-by-play description into a list of words.

    The text is normalised by applying ``_PLAY_TX_RULES`` in order:
    multi-word player names, keywords and penalty names are collapsed into
    single ``_``-joined tokens, ``#`` becomes a separate ``Jersey_Id`` token,
    and punctuation delimiters are turned into single spaces. The result is
    then split on single spaces.

    Args:
        play_tx: description text of one event (may be the empty string).

    Returns:
        list of str tokens. Leading/trailing delimiters are NOT stripped, so
        the list may start or end with ``''`` (and ``parse_play('')`` returns
        ``['']``); process_play() relies on these exact token positions.
    """
    for pattern, replacement in _PLAY_TX_RULES:
        play_tx = pattern.sub(replacement, play_tx)
    return play_tx.split(' ')
# Output columns written by process_play(), in file order. This order must
# match the '|'-separated column names in header_row_rest.
_PLAY_ROW_FIELDS = (
    'full_event_cd',
    'play_element_ct',
    'team_id',
    'jersey_id',
    'player_tx',
    'assist_jersey_id',
    'assist_player_tx',
    'assist2_jersey_id',
    'assist2_player_tx',
    'mate_jersey_id',
    'mate_player_tx',
    'opp_team_id',
    'opp_jersey_id',
    'opp_player_tx',
    'away_team_id',
    'away_jersey_id',
    'away_player_tx',
    'home_team_id',
    'home_jersey_id',
    'home_player_tx',
    'winner_team_id',
    'zone_cd',
    'shot_cd',
    'outcome_shot_cd',
    'distance_ct',
    'penalty_cd',
    'minutes_penalty_ct',
    'minutes2_penalty_ct',
    'reason_cd',
    'reason2_cd',
    'reason3_cd',
    'result_cd',
    'hours_tm',
    'minutes_tm',
    'timezone_cd',
    'timezone_tx',
    'season_goal_ct',
    'season_assist_ct',
    'season_assist2_ct',
)

# Events whose text is only a wall-clock timestamp; they all share the same
# token layout: [1]=timezone text, [2]=hours, [3]=minutes, [4]=timezone code.
_CLOCK_EVENT_NAMES = {
    'EIEND': 'EARLY INTERMISSION ENDED',
    'EISTR': 'EARLY INTERMISSION STARTED',
    'GEND': 'GAME ENDED',
    'PEND': 'PERIOD ENDED',
    'PSTR': 'PERIOD STARTED',
    'SOC': 'SHOOTOUT COMPLETED',
}

# Columns holding player names, where parse_play()'s temporary '_' joiner
# is turned back into a space before output.
_PLAYER_NAME_FIELDS = (
    'player_tx',
    'assist_player_tx',
    'assist2_player_tx',
    'mate_player_tx',
    'opp_player_tx',
    'away_player_tx',
    'home_player_tx',
)


def process_play(event_cd, parsed_play_tx, period_cd, parsed_assist_tx):
    """Extract the event-specific columns of one play and write them out.

    Fields are read from fixed token positions in ``parsed_play_tx`` that
    depend on ``event_cd``; any field not relevant to the event stays ``''``.
    The resulting ``|``-prefixed row fragment plus a newline is written to
    the module-level file handles ``outfile`` and ``mergedfile`` (the caller
    must already have written the leading columns of the row to both).

    Args:
        event_cd: event code of the play (e.g. 'GOAL', 'SHOT', 'PENL').
            Unrecognised codes are written with FULL_EVENT_CD 'UNKNOWN'.
        parsed_play_tx: token list produced by parse_play() for the play text.
        period_cd: period number as an int; period 5 is treated as a shootout
            when the module-level ``subseason_id`` is '02'.
        parsed_assist_tx: token list produced by parse_play() for the assist
            text; only inspected for 'GOAL' events.

    Raises:
        IndexError: if the token list is shorter than the layout expected for
            the event (nothing is written in that case). 'CHL' and 'STOP'
            tolerate short token lists.
    """
    tok = parsed_play_tx
    # all these are optional fields
    row = dict.fromkeys(_PLAY_ROW_FIELDS, '')
    row['play_element_ct'] = len(tok)

    if event_cd == 'BLOCK':
        row.update(
            full_event_cd='SHOT_BLOCKED',
            team_id=tok[0],
            jersey_id=tok[2],
            player_tx=tok[3],
            opp_team_id=tok[5],
            opp_jersey_id=tok[7],
            opp_player_tx=tok[8],
        )
        if '_Zone' in tok[9]:  # shot type missing: zone comes right after the player
            row['zone_cd'] = tok[9]
        else:
            row.update(shot_cd=tok[9], zone_cd=tok[10])

    elif event_cd == 'CHL':
        row['full_event_cd'] = 'CHALLENGE'
        # A single token means there is no usable description. Each field is
        # guarded separately so a 2-4 token description no longer raises
        # IndexError (the original checked len >= 2 but read indices 2 and 4).
        if len(tok) >= 2:
            row['team_id'] = tok[0]
        if len(tok) > 2:
            row['reason_cd'] = tok[2]
        if len(tok) > 4:
            row['result_cd'] = tok[4]

    elif event_cd in _CLOCK_EVENT_NAMES:
        row.update(
            full_event_cd=_CLOCK_EVENT_NAMES[event_cd],
            timezone_tx=tok[1],
            hours_tm=tok[2],
            minutes_tm=tok[3],
            timezone_cd=tok[4],
        )

    elif event_cd == 'FAC':
        row.update(
            full_event_cd='FACEOFF',
            winner_team_id=tok[0],
            zone_cd=tok[2],
            away_team_id=tok[3],
            away_jersey_id=tok[5],
            away_player_tx=tok[6],
            home_team_id=tok[8],
            home_jersey_id=tok[10],
            home_player_tx=tok[11],
        )

    elif event_cd in ('GIVE', 'TAKE'):
        row.update(
            full_event_cd='GIVEAWAY' if event_cd == 'GIVE' else 'TAKEAWAY',
            team_id=tok[0],
            jersey_id=tok[3],
            player_tx=tok[4],
            zone_cd=tok[5],
        )

    elif event_cd == 'GOAL':
        row.update(
            full_event_cd='SHOT_GOAL',
            team_id=tok[0],
            jersey_id=tok[2],
            player_tx=tok[3],
        )
        if subseason_id == '02' and period_cd == 5:  # shootout
            if '_Zone' in tok[4]:  # missing shot
                row.update(zone_cd=tok[4], distance_ct=tok[5])
            else:
                row.update(shot_cd=tok[4], zone_cd=tok[5], distance_ct=tok[6])
        else:
            row['season_goal_ct'] = tok[4]
            if 'PENALTY_SHOT' in tok[5]:  # penalty shot
                # instead of element 7, PENALTY_SHOT is forced in as the "zone"
                row.update(shot_cd=tok[6], zone_cd=tok[5], distance_ct=tok[8])
            elif '_Zone' in tok[5]:  # missing shot
                row.update(zone_cd=tok[5], distance_ct=tok[6])
            else:
                row.update(shot_cd=tok[5], zone_cd=tok[6], distance_ct=tok[7])
        # assists: each 'Jersey_Id' marker is followed by jersey, name, season count
        for i, token in enumerate(parsed_assist_tx):
            if 'Jersey_Id' in token:
                if row['assist_jersey_id'] == '':
                    row.update(
                        assist_jersey_id=parsed_assist_tx[i + 1],
                        assist_player_tx=parsed_assist_tx[i + 2],
                        season_assist_ct=parsed_assist_tx[i + 3],
                    )
                else:  # 2nd assist
                    row.update(
                        assist2_jersey_id=parsed_assist_tx[i + 1],
                        assist2_player_tx=parsed_assist_tx[i + 2],
                        season_assist2_ct=parsed_assist_tx[i + 3],
                    )

    elif event_cd == 'GOFF':
        row['full_event_cd'] = 'GAME OFF'

    elif event_cd == 'HIT':
        row.update(
            full_event_cd='HIT',
            team_id=tok[0],
            jersey_id=tok[2],
            player_tx=tok[3],
            opp_team_id=tok[5],
            opp_jersey_id=tok[7],
            opp_player_tx=tok[8],
            zone_cd=tok[9],
        )

    elif event_cd == 'MISS':
        row.update(
            full_event_cd='SHOT_MISSEDNET',
            team_id=tok[0],
            jersey_id=tok[2],
            player_tx=tok[3],
        )
        is_shootout = subseason_id == '02' and period_cd == 5
        # The penalty-shot layout is only considered outside the shootout.
        if not is_shootout and 'PENALTY_SHOT' in tok[4]:
            # instead of element 7, PENALTY_SHOT is forced in as the "zone"
            row.update(shot_cd=tok[5], outcome_shot_cd=tok[6],
                       zone_cd=tok[4], distance_ct=tok[8])
        elif '_Zone' in tok[5]:  # missing shot type
            row.update(outcome_shot_cd=tok[4], zone_cd=tok[5], distance_ct=tok[6])
        else:
            row.update(shot_cd=tok[4], outcome_shot_cd=tok[5],
                       zone_cd=tok[6], distance_ct=tok[7])

    elif event_cd == 'PENL':
        row.update(full_event_cd='PENALTY', team_id=tok[0])
        # there are two main sections (1) team penalty, (2) regular penalty
        if 'TEAM' in tok[1]:  # team penalty
            row.update(jersey_id='0', player_tx=tok[1], penalty_cd=tok[2])
        else:
            row.update(jersey_id=tok[2], player_tx=tok[3], penalty_cd=tok[4])
        # there are four subsections (1) minutes, (2) zone, (3) served by, (4) drawn by
        for i, token in enumerate(tok):
            if '_Zone' in token:
                row['zone_cd'] = token
            if 'minutes' in token:
                if row['minutes_penalty_ct'] == '':
                    row['minutes_penalty_ct'] = tok[i - 1]
                else:  # 2nd penalty for misconduct
                    row['minutes2_penalty_ct'] = tok[i - 1]
            if 'SERVED_BY' in token:
                row.update(mate_jersey_id=tok[i + 1], mate_player_tx=tok[i + 2])
            if 'DRAWN_BY' in token:
                row.update(opp_team_id=tok[i + 1],
                           opp_jersey_id=tok[i + 3],
                           opp_player_tx=tok[i + 4])

    elif event_cd == 'SHOT':
        row.update(
            full_event_cd='SHOT_ONGOAL',
            team_id=tok[0],
            outcome_shot_cd=tok[1],
            jersey_id=tok[3],
            player_tx=tok[4],
        )
        if 'PENALTY_SHOT' in tok[5]:
            # instead of element 7, PENALTY_SHOT is forced in as the "zone"
            row.update(shot_cd=tok[6], zone_cd=tok[5], distance_ct=tok[8])
        else:
            row.update(shot_cd=tok[5], zone_cd=tok[6], distance_ct=tok[7])

    elif event_cd == 'STOP':
        row['full_event_cd'] = 'STOPPAGE IN PLAY'
        # up to three reasons, taken from however many tokens are present
        for field, token in zip(('reason_cd', 'reason2_cd', 'reason3_cd'), tok):
            row[field] = token

    else:
        row['full_event_cd'] = 'UNKNOWN'

    # cleanup player names: undo the temporary '_' joiner used by parse_play()
    for field in _PLAYER_NAME_FIELDS:
        row[field] = row[field].replace('_', ' ')

    # Build the row fragment once and send the same bytes to both files.
    line = ''.join('|' + str(row[field]) for field in _PLAY_ROW_FIELDS) + '\n'
    outfile.write(line)
    mergedfile.write(line)
# ---------------------------------------------------------------------------
# Driver: for every game id in [first_game_id, last_game_id] read the stripped
# play-by-play HTML table and write
#   * parsed_*.csv   - one fully parsed row per event (per game)
#   * merged_parsed_*.csv - the same rows for all games in one file
#   * expanded_*.csv - debugging aid: leading columns + raw tokens
#   * penalty_*.csv  - debugging aid: same as expanded, penalties only
# ---------------------------------------------------------------------------
_DATAFILE_DIR = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles"

# Leading columns shared by all four output files (matches header_row_main).
_MAIN_ROW_TEMPLATE = (
    '{season}|{subseason}|{game}|{eventid}|{periodcd}'
    '|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}'
)

merged_targetfile = "{folder}/merged_parsed_{d}.csv".format(folder=_DATAFILE_DIR, d=datafile_id)

# Start the merged file afresh with only the header row; every game below
# re-opens it in append mode.
with open(merged_targetfile, 'w') as mergedfile:
    mergedfile.write(header_row_main + header_row_rest + '\n')  # write out the header row

for int_game_id in range(first_game_id, last_game_id + 1):
    game_id = str(int_game_id).zfill(4)
    print(game_id)

    file_key = "{d}{ss}{g}".format(d=datafile_id, ss=subseason_id, g=game_id)
    sourcefile = "{folder}/stripped_{key}.HTM".format(folder=_DATAFILE_DIR, key=file_key)
    intermediatefile = "{folder}/expanded_{key}.csv".format(folder=_DATAFILE_DIR, key=file_key)
    # The path gets its own name so it is not shadowed by the `penaltyfile` handle below.
    penalty_targetfile = "{folder}/penalty_{key}.csv".format(folder=_DATAFILE_DIR, key=file_key)
    targetfile = "{folder}/parsed_{key}.csv".format(folder=_DATAFILE_DIR, key=file_key)

    # NOTE: process_play() writes to the module-level names `outfile` and
    # `mergedfile`, so these two handle names must not be changed.
    with open(sourcefile, 'r') as infile \
            , open(intermediatefile, 'w') as intermfile \
            , open(penalty_targetfile, 'w') as penaltyfile \
            , open(targetfile, 'w') as outfile \
            , open(merged_targetfile, 'a') as mergedfile:  # append to file
        soup = BeautifulSoup(infile, "lxml")
        table_rows = soup.find_all('tr')[1:]  # infile: skip the first row, which is a header row
        outfile.write(header_row_main + header_row_rest + '\n')  # outfile: write out the header row

        # game_tm is only recomputed for periods 1-4; shootout rows (period 5)
        # keep the last computed value. Resetting it per game guarantees it is
        # always bound and never carries over from the previous game.
        game_tm = 0

        for table_row in table_rows:
            cells = table_row.find_all('td')
            event_id = cells[0].get_text()
            period_cd = int(cells[1].get_text())
            strength_cd = cells[2].get_text()

            # "MM:SS" clock values converted to seconds
            play_minutes, play_seconds = cells[3].get_text().split(':')[:2]
            play_tm = int(play_minutes) * 60 + int(play_seconds)
            remain_minutes, remain_seconds = cells[5].get_text().split(':')[:2]
            remain_tm = int(remain_minutes) * 60 + int(remain_seconds)
            if period_cd < 5:  # not a shootout
                game_tm = (period_cd - 1) * 1200 + play_tm

            event_cd = cells[6].get_text()
            play_tx = cells[7].get_text()
            parsed_play = parse_play(play_tx)

            # only goal rows with at least 10 cells carry assist text
            assist_tx = ''
            if event_cd == 'GOAL' and len(cells) >= 10:
                assist_tx = cells[9].get_text()
            parsed_assist = parse_play(assist_tx)

            main_columns = _MAIN_ROW_TEMPLATE.format(
                season=season_id,
                subseason=subseason_id,
                game=game_id,
                eventid=event_id,
                periodcd=period_cd,
                strengthcd=strength_cd,
                gametm=game_tm,
                playtm=play_tm,
                remaintm=remain_tm,
                eventcd=event_cd,
            )
            is_penalty = event_cd == 'PENL'

            # No newline yet: process_play() completes the row in these two files.
            outfile.write(main_columns)
            mergedfile.write(main_columns)

            # ========== start: helpful for debugging =============
            debug_line = main_columns + ''.join('|' + str(p) for p in parsed_play) + '\n'
            intermfile.write(debug_line)
            if is_penalty:
                penaltyfile.write(debug_line)
            # ========== end: helpful for debugging =============

            process_play(event_cd, parsed_play, period_cd, parsed_assist)

print("Parse end")
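# (End of script. The gist page's "Sign up for free to join this conversation
# on GitHub. Already have an account? Sign in to comment" prompt followed here;
# it is page chrome, not part of the program.)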