Skip to content

Instantly share code, notes, and snippets.

@tangotiger
Last active February 7, 2016 21:15
Show Gist options
  • Save tangotiger/c564479b1fb5cca860cd to your computer and use it in GitHub Desktop.
Save tangotiger/c564479b1fb5cca860cd to your computer and use it in GitHub Desktop.
Strip the Play by Play file so we are only left with the play records. Players On Ice not handled.
# takes 2 seconds to run per infile
# source for infile: http://www.nhl.com/scores/htmlreports/20152016/PL020666.HTM
from bs4 import BeautifulSoup
# Strip NHL play-by-play HTML reports down to their play records.
#
# For every game number in [first_game_id, last_game_id] the report
# <datafile_id><subseason_id><4-digit game number>.HTM is read and a
# "stripped_" copy is written next to it, containing a single HTML table:
# one header row followed by one <tr> per play row. Everything from the first
# cell element containing the text "table" onwards (the players-on-ice
# section) is dropped.
print("Parse start")

# Inclusive range of game numbers to process.
first_game_id = 1
last_game_id = 668
subseason_id = '02'
datafile_id = 'PL'
# Line-break markup removed from every cell element before it is written.
replace_str1 = "<br/>"
# Header row of the output table. Every child element of a source cell becomes
# its own output cell; presumably the "dummy" columns line up with the emptied
# <br/> elements -- TODO confirm against a real report.
header_row = ('<table border="1" cellpadding="4" cellspacing="0">'
              '<tr bgcolor="#CCCCCC">'
              '<td>EVENT_ID</td>'
              '<td>PERIOD_CD</td>'
              '<td>STRENGTH_CD</td>'
              '<td>PLAY_TM</td>'
              '<td>dummy1</td>'
              '<td>REMAIN_TM</td>'
              '<td>EVENT_CD</td>'
              '<td>PLAY_TX</td>'
              '<td>dummy2</td>'
              '<td>ASSISTS_TX</td>'
              '<td>dummy3</td>'
              '</tr>')

for int_game_id in range(first_game_id, last_game_id + 1):
    game_id = str(int_game_id).zfill(4)
    print(game_id)
    sourcefile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
    targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
    with open(sourcefile, 'r') as infile, open(targetfile, 'w') as outfile:
        soup = BeautifulSoup(infile, "lxml")
        # Play records are the <tr> rows carrying either alternating-colour class.
        play_rows = soup.find_all('tr', {'class': ('evenColor', 'oddColor')})
        outfile.write(header_row + '\n')
        for row in play_rows:
            outfile.write("<tr>\n")
            reached_on_ice = False
            for cell in row.find_all('td'):
                # A <td> may hold several child elements; each one is written
                # as a separate output cell.
                for element in cell.contents:
                    markup = str(element)
                    if "table" in markup:
                        # Start of the players-on-ice section: nothing from
                        # here to the end of the row is written.
                        reached_on_ice = True
                        break
                    printline = markup.strip().replace(replace_str1, '')
                    outfile.write('<td>' + printline + '</td>\n')
                if reached_on_ice:
                    break
            outfile.write('</tr>\n')
        outfile.write('</table>\n')

print("Parse end")
@bdilday
Copy link

bdilday commented Jan 25, 2016

a suggestion, partially tested,

import re

ifile = somepath + 'PL020668.HTM'
# Collapse the report into one string of whitespace-stripped lines.
# ''.join replaces the reduce() call, which is not a builtin on Python 3 and
# concatenated quadratically; the with-block closes the file handle.
with open(ifile) as report:
    s = ''.join(l.strip() for l in report)
# Split the markup at every occurrence of the 'evenColor' class name.
all_paragraphs = s.split('evenColor')
for a in all_paragraphs:
    # Everything between each 'title=' and the next '>' in this chunk.
    onice_list = re.findall('title=(.+?)>', a)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment