Skip to content

Instantly share code, notes, and snippets.

@s2t2
Created June 19, 2018 03:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save s2t2/3185105752b587ec36d57af5e6c3fded to your computer and use it in GitHub Desktop.
Save s2t2/3185105752b587ec36d57af5e6c3fded to your computer and use it in GitHub Desktop.
Using Python to parse the ESPN website for MLB standings, which is definitely not as straightforward as it could be. There are a lot of nested tables to deal with.
from bs4 import BeautifulSoup
import requests
import pdb
import json
# Download the ESPN MLB standings page and parse it into a BeautifulSoup tree.
request_url = "https://www.espn.com/mlb/standings"
# Without an explicit timeout, requests will wait indefinitely on a stalled server.
response = requests.get(request_url, timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page,
# which would only show up later as empty results.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
#for tr in soup.find_all("tr"):
# print("------------")
# #print(tr.text)
# cells = [element.text for element in tr.find_all("td")]
# print(len(cells))
# for td in cells:
# print(td)
#tables = soup.find_all("table")
#stats_rows = soup.find_all("tr", "Table2__tr")
#for stats_row in stats_rows:
# print("------------")
# print(type(stats_row))
# print(stats_row.text)
#table_cells = soup.find_all("td", "Table2__td")
#for cell in table_cells:
# print("------------")
# print(type(cell))
# print(cell.text)
#
# OBSERVATIONS:
#
# the table of names has the following classes:
# ... Table2__right-aligned Table2__table-fixed Table2__Table--fixed--left Table2__table
#
# the table of stats has the following classes:
# ... Table2__table-scroller Table2__right-aligned Table2__table
### names_table = soup.find_all("table", "Table2__table")
### names_cells = names_table[0].find_all("td") # use [0] workaround for "'ResultSet' object has no attribute 'find_all'", h/t: https://stackoverflow.com/a/24108608/670433
### for name_cell in names_cells:
### print(name_cell.text)
#> East
#> NYYNew York Yankees
#> BOSBoston Red Sox
#> TBTampa Bay Rays
#> TORToronto Blue Jays
#> BALBaltimore Orioles
#> Central
#> CLECleveland Indians
#> DETDetroit Tigers
#> MINMinnesota Twins
#> CHWChicago White Sox
#> KCKansas City Royals
#> West
#> HOUHouston Astros
#> SEASeattle Mariners
#> LAALos Angeles Angels
#> OAKOakland Athletics
#> TEXTexas Rangers
# getting closer....
#
# OBSERVATIONS:
#
# each td looks like this:
#
#<td class="Table2__td">
# <div class="team-link flex items-center clr-gray-03">
# <span class="pr2 TeamLink__Logo">
# <a data-clubhouse-uid="s:1~l:10~t:10" href="/mlb/team/_/name/nyy/new-york-yankees">
# <img data-clubhouse-uid="s:1~l:10~t:10" height="20" src="http://a.espncdn.com/combiner/i?img=/i/teamlogos/mlb/500/scoreboard/nyy.png&amp;h=20&amp;w=20" srcset="http://a.espncdn.com/combiner/i?img=/i/teamlogos/mlb/500/scoreboard/nyy.png&amp;h=40&amp;w=40 2x" width="20"/>
# </a>
# </span>
# <span class="dn show-mobile">
# <a data-clubhouse-uid="s:1~l:10~t:10" href="/mlb/team/_/name/nyy/new-york-yankees">
# <abbr data-clubhouse-uid="s:1~l:10~t:10" style="text-decoration:none" title="New York Yankees">
# NYY
# </abbr>
# </a>
# </span>
# <span class="hide-mobile">
# <a data-clubhouse-uid="s:1~l:10~t:10" href="/mlb/team/_/name/nyy/new-york-yankees">
# New York Yankees
# </a>
# </span>
# </div>
#</td>
# we can try pin-pointing the text of the second and third <span> elements...
#teams = []
#
#for name_cell in names_cells:
# spans = name_cell.find_all("span")
# team_name = spans[1].text
# team_abbrev = spans[2].text
# team = {"name": team_name, "abbrev": team_abbrev}
# teams.append(team)
# sometimes `spans[1].text` throws: IndexError: list index out of range
# ... so we have to handle this...
# teams = []
#
# for name_cell in names_cells:
# spans = name_cell.find_all("span")
# try:
# team_name = spans[1].text
# team_abbrev = spans[2].text
# team = {"name": team_name, "abbrev": team_abbrev}
# teams.append(team)
# except IndexError as e:
# print("--------- \n ERROR...")
# print(len(spans))
# print(name_cell.text)
#
#
#
#
# print(len(teams))
#
#> only shows 15 teams, need to loop through all the tables...
teams = []
#tables = soup.find_all("table", "Table2__table")
#print("FOUND", len(tables), "TABLES")
#for table in tables:
# cells = table.find_all("td")
# for cell in cells:
# spans = cell.find_all("span")
# try:
# team_name = spans[1].text
# team_abbrev = spans[2].text
# team = {"name": team_name, "abbrev": team_abbrev}
# teams.append(team)
# except IndexError as e:
# print("UNRECOGNIZED TEAMS DATA:", cell.text)
#
#for team in teams:
# print(team)
# this is parsing the names tables and the stats tables, but they need to be handled differently...
# Build one {"name": ..., "abbrev": ...} dict per team from the "names" tables
# and append it to the module-level `teams` list.
names_tables = soup.find_all("table", "Table2__right-aligned Table2__table-fixed Table2__Table--fixed--left Table2__table")
for table in names_tables:
    for cell in table.find_all("td"):
        spans = cell.find_all("span")
        # A team cell contains three spans, in order: logo, abbreviation, full name.
        # Cells with fewer spans are not team rows, so report and skip them.
        if len(spans) < 3:
            print("UNRECOGNIZED TEAMS DATA:", cell.text) #> "East" ... "Central" ... "West"
            continue
        # spans[1] is the abbreviation (e.g. "NYY"); spans[2] is the full name
        # (e.g. "New York Yankees"). These were previously assigned the wrong way round.
        team_abbrev = spans[1].text
        team_name = spans[2].text
        team = {"name": team_name, "abbrev": team_abbrev}
        teams.append(team)

for team in teams:
    print(team)
#stats = []
#stats_tables = soup.find_all("table", "Table2__table-scroller Table2__right-aligned Table2__table")
#for table in stats_tables:
# rows = table.find_all("tr", "Table2__tr")
# for row in rows:
# print("------")
# print("ROW")
# print(row.text)
#
#
# almost, but output looks like:
#> ------
#> ROW
#> WLPCTGBHOMEAWAYRSRADIFFSTRKL10
#> ------
#> ROW
#> 4722.681-26-1121-11363265+98W17-3
#> ------
#> ROW
#> 4924.671-23-1126-13371263+108W16-4
#> ------
#> ROW
#> 3338.4651519-1914-19320342-22W37-3
#> ------
#> ROW
#> 3339.45815.515-1618-23284298-14L15-5
#> ------
#> ROW
#> 2050.28627.511-239-27248361-113W11-9
#> ------
#> ROW
#> WLPCTGBHOMEAWAYRSRADIFFSTRKL10
#> ------
#> ROW
#> 3833.535-22-1316-20340308+32W25-5
#> ------
#> ROW
#> 3637.493323-1713-20305328-23W57-3
#> ------
#> ROW
#> 3137.4565.517-1714-20290298-8L15-5
#> ------
#> ROW
#> 2447.3381412-2412-23267356-89L53-7
#> ------
#> ROW
#> 2250.30616.510-2712-23265404-139L71-9
#> ------
#> ROW
#> WLPCTGBHOMEAWAYRSRADIFFSTRKL10
#> ------
#> ROW
#> 4925.662-20-1429-11388230+158W1210-0
#> ------
#> ROW
#> 4626.639225-1421-12315293+22L17-3
#> ------
#> ROW
#> 3834.5281017-1821-16328298+30L24-6
#> ------
#> ROW
#> 3636.5001220-2016-16316322-6W25-5
#> ------
#> ROW
#> 3044.4051915-2415-20321396-75W33-7
#> ------
#> ROW
#> WLPCTGBHOMEAWAYRSRADIFFSTRKL10
#> ------
#> ROW
#> 4229.592-21-1321-16344277+67W26-4
#> ------
#> ROW
#> 3832.5433.516-1722-15297251+46L43-7
#> ------
#> ROW
#> 3832.5433.523-1215-20302294+8W36-4
#> ------
#> ROW
#> 3038.44110.513-2117-17255299-44W23-7
#> ------
#stats = []
#stats_tables = soup.find_all("table", "Table2__table-scroller Table2__right-aligned Table2__table")
#for table in stats_tables:
# rows = table.find_all("tr", "Table2__tr")
# for row in rows:
# pdb.set_trace()
#
# OBSERVATIONS:
# ... each row looks like:
#> <tr class="subgroup-headers Table2__sub-header Table2__tr Table2__tr--sm Table2__even" data-idx="0">
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Wins">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/wins/dir/desc">W</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Losses">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/losses/dir/asc">L</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Winning Percentage">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/winpercent/dir/desc">PCT</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Games Back">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/gamesbehind/dir/desc">GB
#> <span class="dib arrow-icon_cont" style="width:10px;height:10px">
#> <svg class="w-70 icon__svg" viewbox="0 0 24 24"><use xlink:href="#icon__caret__up"></use></svg>
#> </span>
#> </a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content" title="Home Record">HOME</span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content" title="Away Record">AWAY</span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Runs scored">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/pointsfor/dir/desc">RS</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Runs allowed">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/pointsagainst/dir/asc">RA</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Run Differential">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/pointdifferential/dir/desc">DIFF</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content underline" title="Current Streak">
#> <a class="clr-gray-04" href="/mlb/standings/_/sort/streak/dir/desc">STRK</a>
#> </span>
#> </td>
#> <td class="tar subHeader__item--content Table2__td">
#> <span class="fw-medium w-100 dib tar subHeader__item--content" title="Record last 10 games">L10</span>
#> </td>
#> </tr>
# wow...
# Collect every row of every stats table as a plain list of cell strings.
stats_tables = soup.find_all("table", "Table2__table-scroller Table2__right-aligned Table2__table")
stats = []
for stats_table in stats_tables:
    # Rows stay as simple lists of cell text for now; one list per <tr>.
    stats.extend(
        [td.text for td in tr.find_all("td")] #> ['W', 'L', 'PCT', 'GB', 'HOME', 'AWAY', 'RS', 'RA', 'DIFF', 'STRK', 'L10']
        for tr in stats_table.find_all("tr", "Table2__tr")
    )

# Show what was collected, one row per line.
for stat_row in stats:
    print(stat_row)
#> ['W', 'L', 'PCT', 'GB', 'HOME', 'AWAY', 'RS', 'RA', 'DIFF', 'STRK', 'L10']
#> ['40', '28', '.588', '-', '19-13', '21-15', '337', '242', '+95', 'L1', '6-4']
#> ['42', '30', '.583', '-', '21-14', '21-16', '314', '266', '+48', 'L3', '5-5']
#> ['37', '33', '.529', '4', '21-18', '16-15', '299', '288', '+11', 'L1', '4-6']
#> ['36', '36', '.500', '6', '21-16', '15-20', '331', '328', '+3', 'W1', '5-5']
#> ['26', '45', '.366', '15.5', '11-23', '15-22', '301', '368', '-67', 'W1', '5-5']
#> ['W', 'L', 'PCT', 'GB', 'HOME', 'AWAY', 'RS', 'RA', 'DIFF', 'STRK', 'L10']
#> ['39', '32', '.549', '-', '23-16', '16-16', '312', '267', '+45', 'L2', '7-3']
#> ['37', '33', '.529', '1.5', '20-19', '17-14', '334', '278', '+56', 'L1', '7-3']
#> ['35', '37', '.486', '4.5', '19-11', '16-26', '298', '327', '-29', 'W1', '4-6']
#> ['34', '37', '.479', '5', '11-19', '23-18', '325', '369', '-44', 'L2', '2-8']
#> ['34', '40', '.459', '6.5', '18-21', '16-19', '283', '328', '-45', 'L2', '5-5']
from bs4 import BeautifulSoup
import requests
import pdb
import json
# Download the ESPN MLB standings page and parse it into a BeautifulSoup tree.
request_url = "https://www.espn.com/mlb/standings"
# Without an explicit timeout, requests will wait indefinitely on a stalled server.
response = requests.get(request_url, timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page,
# which would only show up later as empty results.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Build one {"name": ..., "abbrev": ...} dict per team from the "names" tables.
teams = []
names_tables = soup.find_all("table", "Table2__right-aligned Table2__table-fixed Table2__Table--fixed--left Table2__table")
for table in names_tables:
    for cell in table.find_all("td"):
        spans = cell.find_all("span")
        # A team cell contains three spans, in order: logo, abbreviation, full name.
        # Cells with fewer spans are not team rows, so report and skip them.
        if len(spans) < 3:
            print("UNRECOGNIZED TEAMS DATA:", cell.text) #> "East" ... "Central" ... "West"
            continue
        # spans[1] is the abbreviation (e.g. "NYY"); spans[2] is the full name
        # (e.g. "New York Yankees"). These were previously assigned the wrong way round.
        team_abbrev = spans[1].text
        team_name = spans[2].text
        team = {"name": team_name, "abbrev": team_abbrev}
        teams.append(team)

for team in teams:
    print(team)
# Collect every row of every stats table as a plain list of cell strings.
stats_tables = soup.find_all("table", "Table2__table-scroller Table2__right-aligned Table2__table")
stats = []
for stats_table in stats_tables:
    # Each row is kept as a simple list of cell text to start with.
    # A possible later refinement is a dictionary per row, with keys like
    # "wins", "losses", "win_pct", etc., assigning each stat individually.
    stats.extend(
        [td.text for td in tr.find_all("td")] #> ['W', 'L', 'PCT', 'GB', 'HOME', 'AWAY', 'RS', 'RA', 'DIFF', 'STRK', 'L10']
        for tr in stats_table.find_all("tr", "Table2__tr")
    )

# Show what was collected, one row per line.
for stat_row in stats:
    print(stat_row)
# todo: combine the teams list with the stats list
# ... assuming they are in the same order (which should absolutely be verified)
# ... then you can finally work with the rankings data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment