Last active
February 7, 2016 21:22
-
-
Save tangotiger/91a1bc38284d60e2904f to your computer and use it in GitHub Desktop.
Strip the Roster HTML file so we are only left with the TABLE rows we need
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip each roster HTML report down to a single table that holds only the
# rows of its (up to) two roster sections.

first_game_id = 1  # +665
last_game_id = 668  # -2
subseason_id = '02'
datafile_id = 'RO'

# Header cells that, when seen in this order (each on a later line than the
# previous one), announce the start of a roster table.
searchstr1 = '#</td>'
searchstr2 = 'Pos</td>'
searchstr3 = 'Name</td>'
# Text that marks the end of a roster table.
searchstr4 = '</table>'


def strip_roster_lines(lines):
    """Yield the stripped-down table text for one roster report.

    The output always starts with an opening ``<table ...>`` line and ends
    with ``</table>``.  In between, for each of the first two roster sections
    found, a replacement header row (``ROSTER0`` / ``ROSTER1``) is emitted,
    followed by every non-blank line of that section with surrounding
    whitespace removed.

    A roster section starts after ``searchstr1``, ``searchstr2`` and
    ``searchstr3`` have each been seen, in that order, on separate lines.
    It ends at the first line containing ``searchstr4``.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Yields:
        Strings ready to be written to the output file; every one except the
        final ``</table>`` ends with a newline.
    """
    header_markers = (searchstr1, searchstr2, searchstr3)
    markers_seen = 0  # how many header markers have been matched so far
    in_roster = False
    roster_count = 0

    yield '<table border="1" cellpadding="4" cellspacing="0">\n'
    for line in lines:
        if in_roster:
            if searchstr4 in line:
                # End of this roster: reset and look for the next one.
                in_roster = False
                markers_seen = 0
                roster_count += 1
                if roster_count == 2:
                    # Both rosters captured; nothing later in the input is
                    # ever written, so stop reading.
                    break
            else:
                stripped = line.strip()
                if stripped:
                    yield stripped + '\n'
        elif markers_seen < len(header_markers):
            # A line is tested against the next expected marker only, so two
            # markers sharing one line are not both matched.
            if header_markers[markers_seen] in line:
                markers_seen += 1
        else:
            # All three header cells were seen on earlier lines.  This line
            # (the one following the third header cell) is consumed without
            # being copied, and the replacement header row is emitted.
            in_roster = True
            yield '<tr bgcolor="#CCCCCC">\n<td>Jersey</td>\n<td>Pos</td>\n<td>ROSTER{t}</td>\n</tr>\n'.format(t=roster_count)
    yield '</table>'


def main():
    """Strip every game file from first_game_id through last_game_id."""
    print("Parse start")
    for int_game_id in range(first_game_id, last_game_id + 1):
        game_id = str(int_game_id).zfill(4)
        print(game_id)
        sourcefile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
        targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
        # NOTE(review): both files use the platform default text encoding,
        # as in the original script -- confirm it matches the source files.
        with open(sourcefile, 'r') as infile, open(targetfile, 'w') as outfile:
            outfile.writelines(strip_roster_lines(infile))
    print("Parse end")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment