# tangotiger/parseRoster.py

## parseRoster.py
from bs4 import BeautifulSoup

print("Parse start")

# Inclusive range of game numbers to process.
first_game_id = 1  #+665
last_game_id = 668 #-2

# Identifiers copied into every output row and used to build the file names.
season_id = '20152016'
subseason_id = '02'
datafile_id = 'RO'

# Pipe-delimited column names written as the first line of each output file.
header_row = 'SEASON_ID' \
             '|SUBSEASON_ID' \
             '|GAME_ID' \
             '|HOME_ID' \
             '|JERSEY_ID' \
             '|POS_CD' \
             '|PLAYER_TX'

# One output line per player row; the fields mirror header_row.
row_template = '{season}|{subseason}|{game}|{home}|{jersey}|{pos}|{player}\n'

searchstr1 = 'ROSTER'
searchstr2 = '('    # used to strip out (C) and (A) from player name

roster = 0

for int_game_id in range(first_game_id, last_game_id+1):
    game_id = str(int_game_id).zfill(4)
    print(game_id)

    sourcefile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
    targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\parsed_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)

    # Start every game from the default so that a file whose player rows come
    # before any ROSTER marker cannot inherit the previous game's value.
    roster = 0

    with open(sourcefile,'r') as infile, open(targetfile,'w') as outfile:
        soup = BeautifulSoup(infile, "lxml")
        tableRow = soup.find_all('tr')
        outfile.write(header_row + '\n')
        for r in tableRow:
            tableRowData = r.find_all('td')
            if len(tableRowData) < 3:
                # A row without the three expected cells cannot be a marker
                # or a player row; skip it instead of raising IndexError and
                # aborting the remaining games.
                continue

            third_cell_text = tableRowData[2].getText()
            if searchstr1 in third_cell_text:  # check which of the two teams we have
                # The last character of the marker text becomes the HOME_ID
                # value for all following player rows.
                roster = third_cell_text[-1:]
                continue

            outfile.write(row_template.format(
                season=season_id,
                subseason=subseason_id,
                game=game_id,
                home=roster,
                jersey=tableRowData[0].getText(),
                pos=tableRowData[1].getText(),
                # Keep only the text before the first '(' so the (C)/(A)
                # suffix is dropped from the player name.
                player=third_cell_text.split(searchstr2)[0].strip(),
            ))

print("Parse end")
	from bs4 import BeautifulSoup

	print("Parse start")

	# Inclusive range of game numbers to process.
	first_game_id = 1 #+665
	last_game_id = 668 #-2

	# Identifiers copied into every output row and used to build the file names.
	season_id = '20152016'
	subseason_id = '02'
	datafile_id = 'RO'

	# Pipe-delimited column names written as the first line of each output file.
	# The delimiter is a plain '|'; a backslash in front of it would end up in
	# the output verbatim.
	header_row = 'SEASON_ID' \
		'|SUBSEASON_ID' \
		'|GAME_ID' \
		'|HOME_ID' \
		'|JERSEY_ID' \
		'|POS_CD' \
		'|PLAYER_TX'

	# One output line per player row; the fields mirror header_row.
	row_template = '{season}|{subseason}|{game}|{home}|{jersey}|{pos}|{player}\n'

	searchstr1 = 'ROSTER'
	searchstr2 = '(' # used to strip out (C) and (A) from player name

	roster = 0

	for int_game_id in range(first_game_id, last_game_id+1):
		game_id = str(int_game_id).zfill(4)
		print(game_id)

		sourcefile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
		targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\parsed_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)

		# Start every game from the default so that a file whose player rows
		# come before any ROSTER marker cannot inherit the previous game's value.
		roster = 0

		with open(sourcefile,'r') as infile, open(targetfile,'w') as outfile:
			soup = BeautifulSoup(infile, "lxml")
			tableRow = soup.find_all('tr')
			outfile.write(header_row + '\n')
			for r in tableRow:
				tableRowData = r.find_all('td')
				if len(tableRowData) < 3:
					# A row without the three expected cells cannot be a
					# marker or a player row; skip it instead of raising
					# IndexError and aborting the remaining games.
					continue

				third_cell_text = tableRowData[2].getText()
				if searchstr1 in third_cell_text: # check which of the two teams we have
					# The last character of the marker text becomes the
					# HOME_ID value for all following player rows.
					roster = third_cell_text[-1:]
					continue

				outfile.write(row_template.format(
					season=season_id,
					subseason=subseason_id,
					game=game_id,
					home=roster,
					jersey=tableRowData[0].getText(),
					pos=tableRowData[1].getText(),
					# Keep only the text before the first '(' so the (C)/(A)
					# suffix is dropped from the player name.
					player=third_cell_text.split(searchstr2)[0].strip(),
				))

	print("Parse end")