Skip to content

Instantly share code, notes, and snippets.

@tangotiger
Last active February 7, 2016 21:19
Show Gist options
  • Save tangotiger/f141cbf1de1285a522a0 to your computer and use it in GitHub Desktop.
Save tangotiger/f141cbf1de1285a522a0 to your computer and use it in GitHub Desktop.
Convert HTML roster files into pipe-delimited CSV files
from bs4 import BeautifulSoup
# Convert the stripped "RO" HTML pages for a range of game ids into
# pipe-delimited text files (one output file per game). Every <tr> with at
# least three <td> cells is either a team marker row (third cell contains
# 'ROSTER') or a player row (jersey, position, player name).
print("Parse start")

# Inclusive range of game numbers to process.
first_game_id = 1  # +665
last_game_id = 668  # -2

# Constants written verbatim into every output row / used to build file names.
season_id = '20152016'
subseason_id = '02'
datafile_id = 'RO'

header_row = 'SEASON_ID' \
    '|SUBSEASON_ID' \
    '|GAME_ID' \
    '|HOME_ID' \
    '|JERSEY_ID' \
    '|POS_CD' \
    '|PLAYER_TX'

searchstr1 = 'ROSTER'
searchstr2 = '('  # used to strip out (C) and (A) from player name
roster = 0

for int_game_id in range(first_game_id, last_game_id + 1):
    game_id = str(int_game_id).zfill(4)  # file names use a 4-digit game number
    print(game_id)
    sourcefile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
    targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\parsed_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)

    # Reset the team marker for each game so that rows appearing before the
    # first ROSTER row never inherit the marker left over from the prior file.
    roster = 0

    with open(sourcefile, 'r') as infile, open(targetfile, 'w') as outfile:
        soup = BeautifulSoup(infile, "lxml")
        tableRow = soup.find_all('tr')
        outfile.write(header_row + '\n')
        for r in tableRow:
            tableRowData = r.find_all('td')

            # Rows with fewer than three cells carry neither a team marker nor
            # a full jersey/position/player triple; skip them instead of
            # failing with IndexError on tableRowData[2].
            if len(tableRowData) < 3:
                continue

            third_cell_text = tableRowData[2].get_text()
            if searchstr1 in third_cell_text:  # check which of the two teams we have
                # The last character of the marker cell is written in the
                # HOME_ID column of every following player row.
                roster = third_cell_text[-1:]
            else:
                outfile.write('{season}|{subseason}|{game}|{home}|{jersey}|{pos}|{player}\n'.format(
                    season=season_id,
                    subseason=subseason_id,
                    game=game_id,
                    home=roster,
                    jersey=tableRowData[0].get_text(),
                    pos=tableRowData[1].get_text(),
                    # Keep only the text before the first '(' so that suffixes
                    # such as (C) / (A) are dropped from the player name.
                    player=third_cell_text.split(searchstr2)[0].strip(),
                ))

print("Parse end")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment