Skip to content

Instantly share code, notes, and snippets.

@tangotiger
Last active February 7, 2016 21:28
Show Gist options
  • Save tangotiger/ad3fcd370dcda1d39496 to your computer and use it in GitHub Desktop.
Save tangotiger/ad3fcd370dcda1d39496 to your computer and use it in GitHub Desktop.
Parse the Play by Play file
from bs4 import BeautifulSoup
import re
print("Parse start")

# Inclusive range of game numbers to process.  The trailing "+665" / "-2"
# notes are carried over from the original; presumably offsets used to
# narrow the range while debugging -- TODO confirm.
first_game_id = 1  # +665
last_game_id = 668  # -2

# Identifiers used in the file names and written into every output row.
season_id = '20152016'
subseason_id = '02'
datafile_id = 'PL'

# Pipe-delimited header for the leading columns shared by every event row.
header_row_main = '|'.join([
    'SEASON_ID',
    'SUBSEASON_ID',
    'GAME_ID',
    'EVENT_ID',
    'PERIOD_CD',
    'STRENGTH_CD',
    'GAME_TM',
    'PLAY_TM',
    'REMAIN_TM',
    'EVENT_CD',
])

# Header for the event-specific columns.  Every name is prefixed with the
# delimiter so this string can be appended directly to header_row_main.
header_row_rest = ''.join('|' + column for column in (
    'FULL_EVENT_CD',
    'PLAY_ELEMENT_CT',
    'TEAM_ID',
    'JERSEY_ID',
    'PLAYER_TX',
    'ASSIST_JERSEY_ID',
    'ASSIST_PLAYER_TX',
    'ASSIST2_JERSEY_ID',
    'ASSIST2_PLAYER_TX',
    'MATE_JERSEY_ID',
    'MATE_PLAYER_TX',
    'OPP_TEAM_ID',
    'OPP_JERSEY_ID',
    'OPP_PLAYER_TX',
    'AWAY_TEAM_ID',
    'AWAY_JERSEY_ID',
    'AWAY_PLAYER_TX',
    'HOME_TEAM_ID',
    'HOME_JERSEY_ID',
    'HOME_PLAYER_TX',
    'WINNER_TEAM_ID',
    'ZONE_CD',
    'SHOT_CD',
    'OUTCOME_SHOT_CD',
    'DISTANCE_CT',
    'PENALTY_CD',
    'MINUTES_PENALTY_CT',
    'MINUTES2_PENALTY_CT',
    'REASON_CD',
    'REASON2_CD',
    'REASON3_CD',
    'RESULT_CD',
    'HOURS_TM',
    'MINUTES_TM',
    'TIMEZONE_CD',
    'TIMEZONE_TX',
    'SEASON_GOAL_CT',
    'SEASON_ASSIST_CT',
    'SEASON_ASSIST2_CT',
))
# Ordered (literal text, replacement) pairs applied by parse_play.
# Every left-hand side is matched as LITERAL text, case-insensitively.
# Order matters: longer / more specific phrases must be rewritten before the
# shorter phrases they contain (e.g. 'CROSS CHECKING' before 'CROSS CHECK').
_PLAY_LITERAL_REWRITES = (
    # players with multi-word names collapsed into one token, temporarily
    ('DI GIUSEPPE', 'DI_GIUSEPPE'),
    ('DE HAAN', 'DE_HAAN'),
    ('DE LA ROSE', 'DE_LA_ROSE'),
    ('VAN RIEMSDYK', 'VAN_RIEMSDYK'),
    ('DEL ZOTTO', 'DEL_ZOTTO'),
    # special keywords merged with the word in front of them
    ('. Zone', '_Zone'),
    (' Start-', '_Start'),
    (' End-', '_End'),
    (' time:', '_Time'),
    # markers turned into keywords of their own
    (' #', ' Jersey_Id '),
    (' min)', ' minutes '),
    ('ft.', 'feet'),
    # redundant keywords
    ('- double minor', ''),
    ('(maj)', ''),
    # keywords made of several words collapsed into one word
    # NOTE(review): the original kept commented-out duplicates of 'Over Net',
    # 'Wide of Net', 'Delay of game' and 'Drawn By' marked "contains special
    # character" -- presumably variants holding a non-breaking space.  Those
    # variants are still NOT collapsed here; confirm whether they should be.
    ('BLOCKED BY', 'BLOCKED_BY'),
    ('CLOCK PROBLEM', 'CLOCK_PROBLEM'),
    ('Early Intermission', 'Early_Intermission'),
    ('HAND PASS', 'HAND_PASS'),
    ('HIT CROSSBAR', 'HIT_CROSSBAR'),
    ('GOAL INTERFERENCE', 'GOAL_INTERFERENCE'),
    ('GOAL OVERTURNED', 'GOAL_OVERTURNED'),
    ('GOALIE STOPPED', 'GOALIE_STOPPED'),
    ('HOME TIMEOUT', 'HOME_TIMEOUT'),
    ('ICE PROBLEM', 'ICE_PROBLEM'),
    ('NET OFF', 'NET_OFF'),
    ('OBJECTS ON ICE', 'OBJECTS_ONICE'),
    ('OFFICIAL INJURY', 'OFFICIAL_INJURY'),
    ('OFF-SIDE', 'OFFSIDE'),
    ('OVER NET', 'OVER_NET'),
    ('PENALTY SHOT', 'PENALTY_SHOT'),
    ('PLAYER EQUIPMENT', 'PLAYER_EQUIPMENT'),
    ('PLAYER INJURY', 'PLAYER_INJURY'),
    ('PUCK FROZEN', 'PUCK_FROZEN'),
    ('PUCK IN BENCHES', 'PUCK_INBENCHES'),
    ('PUCK IN CROWD', 'PUCK_INCROWD'),
    ('PUCK IN NETTING', 'PUCK_INNETTING'),
    ('REFEREE OR LINESMAN', 'REFEREE_LINESMAN'),
    ('RINK REPAIR', 'RINK_REPAIR'),
    ('SHOOTOUT COMPLETED', 'SHOOTOUT_COMPLETED'),
    ('TV TIMEOUT', 'TV_TIMEOUT'),
    ('VIDEO REVIEW', 'VIDEO_REVIEW'),
    ('VISITOR TIMEOUT', 'VISITOR_TIMEOUT'),
    ('Wide Of Net', 'WIDE_OFNET'),
    # penalties
    ('ABUSE OF OFFICIALS - BENCH', 'ABUSE_OFFICIALS'),
    ('ABUSE OF OFFICIALS', 'ABUSE_OFFICIALS'),
    ('ABUSIVE LANGUAGE - BENCH', 'ABUSIVE_LANGUAGE'),
    ('ABUSIVE LANGUAGE', 'ABUSIVE_LANGUAGE'),
    ('BROKEN STICK', 'BROKEN_STICK'),
    ('Checking from behind', 'CHECKING_FROMBEHIND'),
    ('Concealing Puck', 'CONCEALING_PUCK'),
    ('Closing Hand On Puck', 'CLOSINGHAND_ONPUCK'),
    ('CROSS CHECKING', 'CROSS_CHECK'),
    ('CROSS CHECK', 'CROSS_CHECK'),
    ('Delay Gm - Face-off Violation', 'DELAY_OFGAME'),
    ('Delay of game - bench', 'DELAY_OFGAME'),
    ('Delay of game-bench', 'DELAY_OFGAME'),
    ('DELAY OF GAME', 'DELAY_OFGAME'),
    ('DELAYING GAME-ILL.PLAY GOALIE', 'DELAY_OFGAME'),
    ('Delaying Game-Ill. play goalie', 'DELAY_OFGAME'),
    ('Delaying Game-Puck over glass', 'DELAY_OFGAME'),
    ('Delaying Game-Smothering puck', 'DELAY_OFGAME'),
    ('Delaying Game', 'DELAY_OFGAME'),
    ('DELAYING THE GAME', 'DELAY_OFGAME'),
    ('Face-off violation-bench', 'DELAY_OFGAME'),
    ('Face-off violation', 'DELAY_OFGAME'),
    ('DRAWN BY', 'DRAWN_BY'),
    ('GAME MISCONDUCT', 'GAME_MISCONDUCT'),
    ('Goalie leave crease', 'GOALIE_CREASE'),
    ('HI-STICKING', 'HIGH_STICK'),
    ('HI STICK', 'HIGH_STICK'),
    ('HIGH STICK', 'HIGH_STICK'),
    ('HOLDING THE STICK', 'HOLDING_STICK'),
    ('ILLEGAL STICK', 'ILLEGAL_STICK'),
    ('Illegal check to head', 'CHECK_HEAD'),
    ('Instigator - face shield', 'INSTIGATOR_FACESHIELD'),
    ('Instigator - Misconduct', 'INSTIGATOR_MISCONDUCT'),
    ('Interference - Goalkeeper', 'INTERFERENCE_GOALIE'),
    ('Interference on goalkeeper', 'INTERFERENCE_GOALIE'),
    ('Interference on goalie', 'INTERFERENCE_GOALIE'),
    ('Leaving penalty box - bench', 'TOOMANY_MEN'),
    ('Match Penalty', 'MATCH_PENALTY'),
    ('PS-Covering puck in crease', 'PENALTYSHOT_COVERPUCK'),
    ('PS-Hooking on breakaway', 'PENALTYSHOT_HOOKING'),
    ('PS-Slash on breakaway', 'PENALTYSHOT_SLASH'),
    ('PS-Thow object at puck', 'PENALTYSHOT_THROWOBJECT'),  # source typo
    ('PS-Throw object at puck', 'PENALTYSHOT_THROWOBJECT'),
    ('PS-Tripping on breakaway', 'PENALTYSHOT_TRIPPING'),
    ('Served By: ', 'SERVED_BY_'),
    ('PREMATURE SUBSTITUTION', 'PREMATURE_SUBSTITUTION'),
    ('Too many men/ice - bench', 'TOOMANY_MEN'),
    ('UNSPORTSMANLIKE CONDUCT', 'UNSPORTSMANLIKE_CONDUCT'),
    # challenges
    ('CHLG HM', 'CHALLENGE_HOME'),
    ('CHLG LEAGUE', 'CHALLENGE_LEAGUE'),
    ('CHLG VIS', 'CHALLENGE_AWAY'),
    # turn every remaining delimiter into a single space
    (' - ', ' '),
    ('- ', ' '),
    (', ', ' '),
    (',', ' '),
    (': ', ' '),
    ('; ', ' '),
    (':', ' '),  # used for HH:MM
    ('(', ' '),
    (')', ''),
    ('\xa0', ' '),  # hidden character: non-breaking space (HTML &nbsp;)
)

# Compiled once at import time.  re.escape guarantees that '.', '(' and ')'
# in the phrases above are matched literally instead of acting as regex
# metacharacters (the source of the 'ft.' / '(maj)' mis-rewrites).
_PLAY_REWRITE_RULES = tuple(
    (re.compile(re.escape(literal), re.IGNORECASE), replacement)
    for literal, replacement in _PLAY_LITERAL_REWRITES
)
_PLAY_SPACE_RUN = re.compile(' +')


def parse_play(play_tx):
    """Split a play description into a list of space-delimited tokens.

    The text is first normalised by applying, in order, every rewrite in
    ``_PLAY_LITERAL_REWRITES`` (case-insensitive literal replacement):
    multi-word names and keywords are joined with underscores, ``#`` becomes
    the ``Jersey_Id`` keyword, and punctuation delimiters become spaces.
    Runs of spaces are then collapsed and the text is split on single spaces.

    Args:
        play_tx: description text of one event (or of its assist cell).

    Returns:
        list of str tokens.  Leading/trailing spaces in the normalised text
        are not stripped, so the list may start or end with an empty string;
        an empty input yields ``['']``.
    """
    for pattern, replacement in _PLAY_REWRITE_RULES:
        play_tx = pattern.sub(replacement, play_tx)
    # turn multiple spaces into one so that splitting yields no inner blanks
    play_tx = _PLAY_SPACE_RUN.sub(' ', play_tx)
    return play_tx.split(' ')
# Order in which the event-specific fields are written; it mirrors the
# FULL_EVENT_CD ... SEASON_ASSIST2_CT header columns.
_PLAY_FIELD_ORDER = (
    'full_event_cd', 'play_element_ct',
    'team_id', 'jersey_id', 'player_tx',
    'assist_jersey_id', 'assist_player_tx',
    'assist2_jersey_id', 'assist2_player_tx',
    'mate_jersey_id', 'mate_player_tx',
    'opp_team_id', 'opp_jersey_id', 'opp_player_tx',
    'away_team_id', 'away_jersey_id', 'away_player_tx',
    'home_team_id', 'home_jersey_id', 'home_player_tx',
    'winner_team_id', 'zone_cd', 'shot_cd', 'outcome_shot_cd', 'distance_ct',
    'penalty_cd', 'minutes_penalty_ct', 'minutes2_penalty_ct',
    'reason_cd', 'reason2_cd', 'reason3_cd', 'result_cd',
    'hours_tm', 'minutes_tm', 'timezone_cd', 'timezone_tx',
    'season_goal_ct', 'season_assist_ct', 'season_assist2_ct',
)

# Fields whose underscore-joined names are turned back into spaced names.
_PLAYER_NAME_FIELDS = (
    'player_tx', 'assist_player_tx', 'assist2_player_tx', 'mate_player_tx',
    'opp_player_tx', 'away_player_tx', 'home_player_tx',
)

# Events whose tokens 1-4 hold a timestamp (label, hours, minutes, zone code).
_TIMESTAMP_EVENTS = {
    'EIEND': 'EARLY INTERMISSION ENDED',
    'EISTR': 'EARLY INTERMISSION STARTED',
    'GEND': 'GAME ENDED',
    'PEND': 'PERIOD ENDED',
    'PSTR': 'PERIOD STARTED',
    'SOC': 'SHOOTOUT COMPLETED',
}


def process_play(event_cd, parsed_play_tx, period_cd, parsed_assist_tx):
    """Map the tokens of one event onto named fields and write the row tail.

    The fields are picked from ``parsed_play_tx`` by position, with a layout
    that depends on ``event_cd``.  The resulting ``|``-prefixed fields plus a
    newline are written to the module-level ``outfile`` and ``mergedfile``
    handles, which must be open when this is called.  The module-level
    ``subseason_id`` is read to recognise a regular-season shootout
    (``subseason_id == '02'`` and ``period_cd == 5``).

    Args:
        event_cd: event code of the row (e.g. ``'SHOT'``, ``'PENL'``).
            Unrecognised codes are written as ``UNKNOWN`` with blank fields.
        parsed_play_tx: list of str tokens of the play description.
        period_cd: period number as an int.
        parsed_assist_tx: list of str tokens of the assist text; only read
            for ``'GOAL'`` events.

    Raises:
        IndexError: if the token list is shorter than the layout of the
            event requires (``'CHL'`` and ``'STOP'`` tolerate short lists).
            Nothing is written in that case.
    """
    tok = parsed_play_tx
    # all fields are optional and default to blank
    rec = dict.fromkeys(_PLAY_FIELD_ORDER, '')
    rec['play_element_ct'] = len(tok)

    if event_cd == 'BLOCK':
        rec['full_event_cd'] = 'SHOT_BLOCKED'
        rec.update(team_id=tok[0], jersey_id=tok[2], player_tx=tok[3],
                   opp_team_id=tok[5], opp_jersey_id=tok[7],
                   opp_player_tx=tok[8])
        if '_Zone' in tok[9]:  # shot type missing
            rec['zone_cd'] = tok[9]
        else:
            rec.update(shot_cd=tok[9], zone_cd=tok[10])
    elif event_cd == 'CHL':
        rec['full_event_cd'] = 'CHALLENGE'
        if len(tok) >= 2:
            rec['team_id'] = tok[0]
            # Guard each position: the original only checked for two tokens
            # and raised IndexError when the reason or result was absent.
            if len(tok) > 2:
                rec['reason_cd'] = tok[2]
            if len(tok) > 4:
                rec['result_cd'] = tok[4]
    elif event_cd in _TIMESTAMP_EVENTS:
        rec['full_event_cd'] = _TIMESTAMP_EVENTS[event_cd]
        rec.update(timezone_tx=tok[1], hours_tm=tok[2], minutes_tm=tok[3],
                   timezone_cd=tok[4])
    elif event_cd == 'FAC':
        rec['full_event_cd'] = 'FACEOFF'
        rec.update(winner_team_id=tok[0], zone_cd=tok[2],
                   away_team_id=tok[3], away_jersey_id=tok[5],
                   away_player_tx=tok[6],
                   home_team_id=tok[8], home_jersey_id=tok[10],
                   home_player_tx=tok[11])
    elif event_cd in ('GIVE', 'TAKE'):
        rec['full_event_cd'] = 'GIVEAWAY' if event_cd == 'GIVE' else 'TAKEAWAY'
        rec.update(team_id=tok[0], jersey_id=tok[3], player_tx=tok[4],
                   zone_cd=tok[5])
    elif event_cd == 'GOAL':
        rec['full_event_cd'] = 'SHOT_GOAL'
        rec.update(team_id=tok[0], jersey_id=tok[2], player_tx=tok[3])
        if subseason_id == '02' and period_cd == 5:  # shootout
            first = 4  # no season goal total precedes the shot details
            penalty_shot = False
        else:
            rec['season_goal_ct'] = tok[4]
            first = 5
            penalty_shot = 'PENALTY_SHOT' in tok[5]
        if penalty_shot:
            # instead of element 7, PENALTY_SHOT is forced in as the "zone"
            rec.update(shot_cd=tok[6], zone_cd=tok[5], distance_ct=tok[8])
        elif '_Zone' in tok[first]:  # shot type missing
            rec.update(zone_cd=tok[first], distance_ct=tok[first + 1])
        else:
            rec.update(shot_cd=tok[first], zone_cd=tok[first + 1],
                       distance_ct=tok[first + 2])
        # assists: each Jersey_Id keyword is followed by number, name, total
        for i, word in enumerate(parsed_assist_tx):
            if 'Jersey_Id' not in word:
                continue
            if rec['assist_jersey_id'] == '':
                rec.update(assist_jersey_id=parsed_assist_tx[i + 1],
                           assist_player_tx=parsed_assist_tx[i + 2],
                           season_assist_ct=parsed_assist_tx[i + 3])
            else:  # 2nd assist
                rec.update(assist2_jersey_id=parsed_assist_tx[i + 1],
                           assist2_player_tx=parsed_assist_tx[i + 2],
                           season_assist2_ct=parsed_assist_tx[i + 3])
    elif event_cd == 'GOFF':
        rec['full_event_cd'] = 'GAME OFF'
    elif event_cd == 'HIT':
        rec['full_event_cd'] = 'HIT'
        rec.update(team_id=tok[0], jersey_id=tok[2], player_tx=tok[3],
                   opp_team_id=tok[5], opp_jersey_id=tok[7],
                   opp_player_tx=tok[8], zone_cd=tok[9])
    elif event_cd == 'MISS':
        rec['full_event_cd'] = 'SHOT_MISSEDNET'
        rec.update(team_id=tok[0], jersey_id=tok[2], player_tx=tok[3])
        shootout = subseason_id == '02' and period_cd == 5
        if not shootout and 'PENALTY_SHOT' in tok[4]:
            # instead of element 7, PENALTY_SHOT is forced in as the "zone"
            rec.update(shot_cd=tok[5], outcome_shot_cd=tok[6],
                       zone_cd=tok[4], distance_ct=tok[8])
        elif '_Zone' in tok[5]:  # shot type missing
            rec.update(outcome_shot_cd=tok[4], zone_cd=tok[5],
                       distance_ct=tok[6])
        else:
            rec.update(shot_cd=tok[4], outcome_shot_cd=tok[5],
                       zone_cd=tok[6], distance_ct=tok[7])
    elif event_cd == 'PENL':
        rec['full_event_cd'] = 'PENALTY'
        rec['team_id'] = tok[0]
        # there are two main sections (1) team penalty, (2) regular penalty
        if 'TEAM' in tok[1]:
            rec.update(jersey_id='0', player_tx=tok[1], penalty_cd=tok[2])
        else:
            rec.update(jersey_id=tok[2], player_tx=tok[3], penalty_cd=tok[4])
        # four optional subsections: minutes, zone, served by, drawn by
        for i, word in enumerate(tok):
            if '_Zone' in word:
                rec['zone_cd'] = word  # the last zone token wins
            if 'minutes' in word:
                # the count is the token just before the 'minutes' keyword
                if rec['minutes_penalty_ct'] == '':
                    rec['minutes_penalty_ct'] = tok[i - 1]
                else:  # 2nd penalty for misconduct
                    rec['minutes2_penalty_ct'] = tok[i - 1]
            if 'SERVED_BY' in word:
                rec.update(mate_jersey_id=tok[i + 1],
                           mate_player_tx=tok[i + 2])
            if 'DRAWN_BY' in word:
                rec.update(opp_team_id=tok[i + 1], opp_jersey_id=tok[i + 3],
                           opp_player_tx=tok[i + 4])
    elif event_cd == 'SHOT':
        rec['full_event_cd'] = 'SHOT_ONGOAL'
        rec.update(team_id=tok[0], outcome_shot_cd=tok[1], jersey_id=tok[3],
                   player_tx=tok[4])
        if 'PENALTY_SHOT' in tok[5]:
            # instead of element 7, PENALTY_SHOT is forced in as the "zone"
            rec.update(shot_cd=tok[6], zone_cd=tok[5], distance_ct=tok[8])
        else:
            rec.update(shot_cd=tok[5], zone_cd=tok[6], distance_ct=tok[7])
    elif event_cd == 'STOP':
        rec['full_event_cd'] = 'STOPPAGE IN PLAY'
        # up to three reasons, taken from the first three tokens if present
        for name, value in zip(('reason_cd', 'reason2_cd', 'reason3_cd'), tok):
            rec[name] = value
    else:
        rec['full_event_cd'] = 'UNKNOWN'

    # cleanup player names: undo the temporary underscore joining
    for name in _PLAYER_NAME_FIELDS:
        rec[name] = rec[name].replace('_', ' ')

    # Build the row once and send the identical text to both files.
    row = ''.join('|{}'.format(rec[name]) for name in _PLAY_FIELD_ORDER) + '\n'
    outfile.write(row)
    mergedfile.write(row)
# ---------------------------------------------------------------------------
# Driver: for every game in [first_game_id, last_game_id] read the stripped
# play-by-play HTML and write
#   * parsed_*.csv         - one parsed row per event for that game
#   * merged_parsed_*.csv  - the same rows for all games, appended
#   * expanded_*.csv       - row prefix + raw tokens (debugging aid)
#   * penalty_*.csv        - like expanded_*, penalty events only
# NOTE: this must stay at module level.  process_play writes through the
# module-level names `outfile` and `mergedfile` bound by the `with` below.
# ---------------------------------------------------------------------------
data_dir = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles"
merged_targetfile = "{base}/merged_parsed_{d}.csv".format(base=data_dir, d=datafile_id)

# Leading columns shared by every output file.
row_prefix_format = ('{season}|{subseason}|{game}|{eventid}|{periodcd}'
                     '|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}')

# (Re)create the merged file with just the header row; each game appends.
with open(merged_targetfile, 'w') as mergedfile:
    mergedfile.write(header_row_main + header_row_rest + '\n')

for int_game_id in range(first_game_id, last_game_id + 1):
    game_id = str(int_game_id).zfill(4)
    print(game_id)
    file_tag = "{d}{ss}{g}".format(d=datafile_id, ss=subseason_id, g=game_id)
    sourcefile = "{base}/stripped_{tag}.HTM".format(base=data_dir, tag=file_tag)
    intermediatefile = "{base}/expanded_{tag}.csv".format(base=data_dir, tag=file_tag)
    # The path gets its own name: the original rebound `penaltyfile` from the
    # path string to the open handle.
    penaltypath = "{base}/penalty_{tag}.csv".format(base=data_dir, tag=file_tag)
    targetfile = "{base}/parsed_{tag}.csv".format(base=data_dir, tag=file_tag)

    with open(sourcefile, 'r') as infile, \
            open(intermediatefile, 'w') as intermfile, \
            open(penaltypath, 'w') as penaltyfile, \
            open(targetfile, 'w') as outfile, \
            open(merged_targetfile, 'a') as mergedfile:  # append to file
        soup = BeautifulSoup(infile, "lxml")
        event_rows = soup.find_all('tr')[1:]  # skip the first row (header)
        outfile.write(header_row_main + header_row_rest + '\n')

        # Reset per game so a shootout row can never raise NameError or
        # inherit the game time of the previous game's last event.
        game_tm = 0
        for r in event_rows:
            cells = r.find_all('td')
            event_id = cells[0].get_text()
            period_cd = int(cells[1].get_text())
            strength_cd = cells[2].get_text()
            # clock values arrive as MM:SS and are stored as seconds
            play_min, play_sec = cells[3].get_text().split(':')[:2]
            play_tm = int(play_min) * 60 + int(play_sec)
            remain_min, remain_sec = cells[5].get_text().split(':')[:2]
            remain_tm = int(remain_min) * 60 + int(remain_sec)
            # Same shootout test as process_play.  Shootout rows deliberately
            # keep the game time of the last timed event.
            if not (subseason_id == '02' and period_cd == 5):
                game_tm = (period_cd - 1) * 1200 + play_tm
            event_cd = cells[6].get_text()
            parsed_play = parse_play(cells[7].get_text())
            assist_tx = ''
            if event_cd == 'GOAL' and len(cells) >= 10:
                assist_tx = cells[9].get_text()
            parsed_assist = parse_play(assist_tx)

            row_prefix = row_prefix_format.format(
                season=season_id,
                subseason=subseason_id,
                game=game_id,
                eventid=event_id,
                periodcd=period_cd,
                strengthcd=strength_cd,
                gametm=game_tm,
                playtm=play_tm,
                remaintm=remain_tm,
                eventcd=event_cd,
            )
            outfile.write(row_prefix)
            mergedfile.write(row_prefix)

            # ========== start: helpful for debugging =============
            raw_row = row_prefix + ''.join('|' + str(p) for p in parsed_play) + '\n'
            intermfile.write(raw_row)
            if event_cd == 'PENL':
                penaltyfile.write(raw_row)
            # ========== end: helpful for debugging =============

            # completes the row started above in outfile and mergedfile
            process_play(event_cd, parsed_play, period_cd, parsed_assist)

print("Parse end")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment