Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scraper for NCAA tournament games on Sports-Reference.com. Prints a JSON array of all games (excluding play-in rounds)
#!/usr/bin/env python3
#
# MIT License
#
# Copyright 2020 Hayden Schiff
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from bs4 import BeautifulSoup
import requests
import sys
import json
import time
# print to stderr (because stdout is where the data goes)
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
# allows keep-alive for performance
session = requests.Session()
games = []
with session:
for year in range(1939,2020):
payload = {}#{'limit': limit}
url = 'https://www.sports-reference.com/cbb/postseason/'+str(year)+'-ncaa.html'
req = session.get(url, params=payload)
eprint(req.url)
# parse each tourney round from html response
soup = BeautifulSoup(req.text, 'html.parser')
bracketsNode = soup.find(id="brackets")
bracketsChildren = bracketsNode.find_all(True, recursive=False)
for bracketNode in bracketsChildren:
# prelimsNode = bracketNode.find('p')
# if (prelimsNode != None):
# pass
rounds = bracketNode.find_all('div', class_='round')
roundNum = 1
for round in rounds:
roundChildren = round.find_all(True, recursive=False)
for gameNode in roundChildren:
game = {}
game['year'] = year
game['bracket'] = bracketNode.get('id')
game['round'] = roundNum
gameChildren = gameNode.find_all(True, recursive=False)
if (len(gameChildren) != 3):
continue
# parse each team
def parseTeam(teamNode):
team = {}
classes = teamNode.get("class")
team['won'] = (classes != None) and ("winner" in teamNode.get("class"))
teamChildren = teamNode.find_all(True, recursive=False)
team['seed'] = teamChildren[0].get_text()
team['name'] = teamChildren[1].get_text()
team['score'] = teamChildren[2].get_text()
return team
game['teamA'] = parseTeam(gameChildren[0])
game['teamB'] = parseTeam(gameChildren[1])
locationLink = gameChildren[2].contents[0]
boxScore = locationLink.get("href")
boxScore = boxScore[len("/cbb/boxscores/"):len(boxScore)-len(".html")]
game['boxScore'] = boxScore
game['date'] = boxScore[0:len("0000-00-00")]
game['location'] = locationLink.get_text()[len("at "):]
games.append(game)
roundNum += 1
time.sleep(0.5)
print(json.dumps(games))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment