Skip to content

Instantly share code, notes, and snippets.

@jweissbock
Created April 24, 2013 01:53
Show Gist options
  • Save jweissbock/5448995 to your computer and use it in GitHub Desktop.
Save jweissbock/5448995 to your computer and use it in GitHub Desktop.
Gets the data daily for the teams before and after the games. getData.py was set up as a cron job and was able to figure out which information to grab for each team — bit of a hack job. dailyData.py gets the data and stats for each team before the game; normally you would pass a list of teams via the command line. dailyScores.py gets the scores of the…
from bs4 import BeautifulSoup
import urllib2
import csv
import datetime
import sys
"""teams = ['Washington', 'Boston', 'NY Rangers', 'Pittsburgh', 'Ottawa', 'Buffalo', 'Minnesota', 'Colorado', 'Winnipeg', 'Toronto', 'Montreal',
'New Jersey', 'Carolina', 'Tampa Bay', 'Phoenix', 'Columbus', 'NY Islanders', 'Florida', 'Anaheim', 'St Louis', 'Chicago',
'Dallas', 'Detroit', 'Vancouver', 'San Jose', 'Los Angeles']"""
teams = sys.argv[1:]
if len(teams) == 0:
print "No supplied teams"
sys.exit()
# so far doesnt like NY Islanders, and CGY but only in behindthenet sites
finalData = [[None]*11 for _ in range(len(teams))]
teamToAbrv = {'Phoenix': "PHX", 'Philadelphia': 'PHI', 'Ottawa': 'OTT', 'NY Rangers': "NYR", 'Nashville': 'NSH', 'NY Islanders': 'NYI',
'Pittsburgh': 'PIT', 'San Jose': "S.J", 'Washington': 'WSH', 'Vancouver': 'VAN', 'Toronto': 'TOR', 'St Louis': 'STL',
'Tampa Bay': 'T.B', 'New Jersey': 'N.J', 'Montreal': "MTL", "Columbus": 'CBJ', 'Calgary': 'CGY', 'Carolina': "CAR",
'Buffalo': 'BUF', 'Winnipeg': 'WPG', 'Boston': 'BOS', 'Chicago': 'CHI', 'Colorado': 'COL', 'Los Angeles': 'L.A',
'Minnesota': 'MIN', 'Florida': 'FLA', 'Edmonton': 'EDM', 'Dallas': 'DAL', 'Detroit': 'DET', 'Anaheim': "ANA",
'Winnipeg': 'ATL'}
todaysTeams = [teamToAbrv[k] for k in teams if k in teamToAbrv]
teams = [t.lower() for t in teams]
# For each requested team, pull its stats into the matching finalData row.
# Row layout: [Team, FenwickClose, GF, GA, PP%, PK%, sh%, sv%, winstreak, standings, 5-5 F/A]

# --- Fenwick Close (behindthenet.ca) ---
fenwickURL = 'http://behindthenet.ca/fenwick_2012.php'
request = urllib2.Request(fenwickURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
# Skip the two header rows, then scan every data row for our teams.
for dataRow in soup.findAll('tr')[2:]:
    cells = dataRow.findAll('td')
    abbrev = cells[0].text
    # Column 6 of a matching row holds the Fenwick Close value.
    if abbrev in todaysTeams:
        finalData[todaysTeams.index(abbrev)][1] = cells[6].text
"""# goals for
gfURL = 'http://www.nhl.com/ice/teamstats.htm?fetchKey=20132ALLSAAAll&sort=avgGoalsPerGame&viewName=goalsFor'
request = urllib2.Request(gfURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
rows = soup.findAll('table', 'data stats')[0].findAll('tr')[2:]
# loop through each row
for r in rows:
allTDs = r.findAll('td')
t = allTDs[1].text.lower()
# if a team in a row is a team we are looking for
# then store its appropriate value into the final data
if t in teams:
tKey = teams.index(t)
finalData[tKey][2] = allTDs[14].text
# goals against
gaURL = 'http://www.nhl.com/ice/teamstats.htm?fetchKey=20132ALLSAAAll&sort=avgGoalsAgainstPerGame&viewName=goalsAgainst'
request = urllib2.Request(gaURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
rows = soup.findAll('table', 'data stats')[0].findAll('tr')[2:]
# loop through each row
for r in rows:
allTDs = r.findAll('td')
t = allTDs[1].text.lower()
# if a team in a row is a team we are looking for
# then store its appropriate value into the final data
if t in teams:
tKey = teams.index(t)
finalData[tKey][3] = allTDs[14].text
"""
# --- Power-play %, penalty-kill %, and 5-on-5 F/A ratio (nhl.com) ---
ppURL = "http://www.nhl.com/ice/teamstats.htm"
request = urllib2.Request(ppURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
statsTable = soup.findAll('table', 'data stats')[0]
# First two rows are headers; the rest are one row per team.
for dataRow in statsTable.findAll('tr')[2:]:
    cells = dataRow.findAll('td')
    name = cells[1].text.lower()
    if name in teams:
        idx = teams.index(name)
        finalData[idx][4] = cells[11].text   # PP%
        finalData[idx][5] = cells[12].text   # PK%
        finalData[idx][10] = cells[10].text  # 5-on-5 F/A ratio
# --- Shooting % and save % (behindthenet.ca PDO page) ---
PDOurl = 'http://www.behindthenet.ca/2012/team_data3.php'
request = urllib2.Request(PDOurl)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
# Two header rows precede the per-team data rows.
for dataRow in soup.findAll('tr')[2:]:
    cells = dataRow.findAll('td')
    abbrev = cells[0].text
    if abbrev in todaysTeams:
        idx = todaysTeams.index(abbrev)
        finalData[idx][6] = cells[16].text.strip()  # sh%
        finalData[idx][7] = cells[19].text.strip()  # sv%
# --- Win streak, standings position, and GF/GA (tsn.ca standings) ---
winURL = "http://www.tsn.ca/nhl/standings/"
request = urllib2.Request(winURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
standingsTable = soup.findAll('table')[0]
bodies = standingsTable.findAll('tbody')
# Row 8 of each conference tbody is a divider row, so stitch together
# the 8 rows above it and the 7 below, first conference then second.
teamRows = []
for body in (bodies[0], bodies[1]):
    trs = body.findAll('tr')
    teamRows = teamRows + trs[0:8] + trs[9:16]
for dataRow in teamRows:
    cells = dataRow.findAll('td')
    headers = dataRow.findAll('th')
    name = cells[0].findAll('a')[0].text.strip().lower()
    if name in teams:
        idx = teams.index(name)
        # Normalise the streak text: "Won 3" -> "3", "Lost 3"/"OT 3" -> "-3"
        streak = cells[6].text
        streak = streak.replace('Lost', '-').replace('OT', '-')
        streak = streak.replace('Won', '').replace(' ', '').strip()
        finalData[idx][8] = streak
        finalData[idx][9] = headers[0].text  # standings position
        finalData[idx][2] = headers[4].text  # GF
        finalData[idx][3] = cells[4].text    # GA
i = 0
for t in teams:
finalData[i][0] = t.title()
i = i + 1
print finalData
now = datetime.datetime.now()
fileName = "/Users/joshuaweissbock/Dropbox/CSI-5388/Project/dailydata/"+str(now.day)+"-"+str(now.month)+"-"+str(now.year)+".csv"
myfile = open(fileName, 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
for line in finalData:
wr.writerow(line)
print "Wrote file"
# great, let's dump into a CSV
from bs4 import BeautifulSoup
import urllib2
import csv
import datetime
import sys
if len(sys.argv[1:]) == 0:
print "No supplied URLs"
sys.exit()
gameIDs = sys.argv[1:]
# [TeamAway, Win/Loss, GF, GA, ShFor, ShA]
# [TeamHome, Loss/Win, GA, GF, ShFor, ShA]
now = datetime.datetime.now()
fileName = "/Users/joshuaweissbock/Dropbox/CSI-5388/Project/dailyscores/"+str(now.day)+"-"+str(now.month)+"-"+str(now.year)+".csv"
myfile = open(fileName, 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# loop through all gameIDs
for g in gameIDs:
fenwickURL = "http://www.tsn.ca"+g
request = urllib2.Request(fenwickURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
box = soup.findAll('table', 'boxScore') # store the tables
col = len(box[0].findAll('tr')[1].findAll('td')) # count if it is OT or not
#print col
TeamAway = box[0].findAll('tr')[1].findAll('td')[0].text # get the home/away team name
TeamHome = box[0].findAll('tr')[2].findAll('td')[0].text
GF = box[0].findAll('tr')[1].findAll('td')[col-1].text # get the teams scores
GA = box[0].findAll('tr')[2].findAll('td')[col-1].text
statusAway = "Win" if GF > GA else "Loss" # determine who won
statusHome = "Loss" if statusAway == "Win" else "Win"
try:
isSO = box[0].findAll('tr')[0].findAll('th')[5].text
sub = 2 if isSO == 'SO' else 1
except:
sub = 1
ShFor = box[1].findAll('tr')[1].findAll('td')[col-sub].text # get shots for
ShA = box[1].findAll('tr')[2].findAll('td')[col-sub].text
Line1 = [TeamAway, statusAway, GF, GA, ShFor, ShA] # store them in the right format
Line2 = [TeamHome, statusHome, GA, GF, ShA, ShFor]
# output the lines
print Line1
print Line2
# write to csv
wr.writerow(Line1)
wr.writerow(Line2)
from bs4 import BeautifulSoup
import urllib2
import csv
import datetime
import sys
if len(sys.argv[1:]) == 0:
print "No supplied URLs"
sys.exit()
gameIDs = sys.argv[1:]
# [TeamAway, Win/Loss, GF, GA, ShFor, ShA]
# [TeamHome, Loss/Win, GA, GF, ShFor, ShA]
now = datetime.datetime.now()
fileName = "/Users/joshuaweissbock/Dropbox/CSI-5388/Project/dailyscores/"+str(now.day)+"-"+str(now.month)+"-"+str(now.year)+".csv"
myfile = open(fileName, 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# loop through all gameIDs
for g in gameIDs:
fenwickURL = "http://www.tsn.ca"+g
request = urllib2.Request(fenwickURL)
response = urllib2.urlopen(request)
the_page = response.read()
soup = BeautifulSoup(the_page)
box = soup.findAll('table', 'boxScore') # store the tables
col = len(box[0].findAll('tr')[1].findAll('td')) # count if it is OT or not
#print col
TeamAway = box[0].findAll('tr')[1].findAll('td')[0].text # get the home/away team name
TeamHome = box[0].findAll('tr')[2].findAll('td')[0].text
GF = box[0].findAll('tr')[1].findAll('td')[col-1].text # get the teams scores
GA = box[0].findAll('tr')[2].findAll('td')[col-1].text
statusAway = "Win" if GF > GA else "Loss" # determine who won
statusHome = "Loss" if statusAway == "Win" else "Win"
try:
isSO = box[0].findAll('tr')[0].findAll('th')[5].text
sub = 2 if isSO == 'SO' else 1
except:
sub = 1
ShFor = box[1].findAll('tr')[1].findAll('td')[col-sub].text # get shots for
ShA = box[1].findAll('tr')[2].findAll('td')[col-sub].text
Line1 = [TeamAway, statusAway, GF, GA, ShFor, ShA] # store them in the right format
Line2 = [TeamHome, statusHome, GA, GF, ShA, ShFor]
# output the lines
print Line1
print Line2
# write to csv
wr.writerow(Line1)
wr.writerow(Line2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment