Skip to content

Instantly share code, notes, and snippets.

@tangotiger
Last active February 7, 2016 21:22
Show Gist options
  • Save tangotiger/91a1bc38284d60e2904f to your computer and use it in GitHub Desktop.
Save tangotiger/91a1bc38284d60e2904f to your computer and use it in GitHub Desktop.
Strip the Roster HTML file so we are only left with the TABLE rows we need
"""Strip roster HTML files so that only the roster TABLE rows remain.

For every game id in ``first_game_id..last_game_id`` the script reads
``<datafile_id><subseason_id><game_id>.HTM`` from ``datafile_dir`` and writes
``stripped_<same name>`` next to it, containing a single HTML table with the
rows of the first two rosters found in the source file.
"""

first_game_id = 1 #+665
last_game_id = 668 #-2
subseason_id = '02'
datafile_id = 'RO'

# Cells that, seen in this order on successive (not necessarily adjacent)
# lines, mark the header of a roster table.
searchstr1 = '#</td>'
searchstr2 = 'Pos</td>'
searchstr3 = 'Name</td>'
# Marks the end of the roster table that is currently being copied.
searchstr4 = '</table>'

# Directory holding both the downloaded and the stripped files.
datafile_dir = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\"

table_open = '<table border="1" cellpadding="4" cellspacing="0">\n'
table_close = '</table>'
# Replacement header row; {t} is the zero-based index of the roster.
roster_header = '<tr bgcolor="#CCCCCC">\n<td>Jersey</td>\n<td>Pos</td>\n<td>ROSTER{t}</td>\n</tr>\n'


def strip_rosters(lines, max_rosters=2):
    """Yield the text chunks of the stripped roster table.

    Args:
        lines: iterable of text lines of the source HTML (an open file works).
        max_rosters: number of roster tables to copy before stopping.

    Yields:
        The opening ``<table ...>`` tag, then for each roster a replacement
        header row followed by every non-blank roster line (whitespace
        stripped, newline terminated), and finally ``</table>``.
        Concatenating the chunks gives the full content of the stripped file.
    """
    header_markers = (searchstr1, searchstr2, searchstr3)
    markers_seen = 0      # how many header markers matched so far, in order
    in_roster = False     # True while copying the rows of a roster
    rosters_done = 0

    yield table_open
    for line in lines:
        if rosters_done >= max_rosters:
            # Everything needed has been captured; the rest is irrelevant.
            break
        if in_roster:
            if searchstr4 in line:
                # End of this roster: start hunting for the next header.
                in_roster = False
                markers_seen = 0
                rosters_done += 1
            else:
                row = line.strip()
                if row:
                    yield row + '\n'
        elif markers_seen < len(header_markers):
            # Only the next pending marker is tested, one marker per line.
            if header_markers[markers_seen] in line:
                markers_seen += 1
        else:
            # The first line after the third marker is consumed, not copied;
            # our own header row is emitted in place of the original header.
            in_roster = True
            yield roster_header.format(t=rosters_done)
    yield table_close


def main():
    """Strip every roster file in the configured game id range."""
    print("Parse start")
    for int_game_id in range(first_game_id, last_game_id + 1):
        game_id = str(int_game_id).zfill(4)
        print(game_id)
        basename = "{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
        sourcefile = datafile_dir + basename
        targetfile = datafile_dir + "stripped_" + basename
        # NOTE(review): files are opened with the platform default encoding,
        # as before; confirm it matches the downloaded reports.
        try:
            infile = open(sourcefile, 'r')
        except FileNotFoundError:
            # One missing download should not abort the rest of the batch.
            print("  source file not found, skipped: " + sourcefile)
            continue
        with infile, open(targetfile, 'w') as outfile:
            outfile.writelines(strip_rosters(infile))
    print("Parse end")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment