Created
December 14, 2018 15:56
-
-
Save invegat/a8b9db107c5e76563012b209803753c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 232 | |
}, | |
"colab_type": "code", | |
"executionInfo": { | |
"elapsed": 9973, | |
"status": "ok", | |
"timestamp": 1543497573910, | |
"user": { | |
"displayName": "Mark Oliver", | |
"photoUrl": "https://lh6.googleusercontent.com/-GKraQDFe4E4/AAAAAAAAAAI/AAAAAAAAAKU/9y_HHyEVntY/s64/photo.jpg", | |
"userId": "10622184328554051131" | |
}, | |
"user_tz": 300 | |
}, | |
"id": "ujHqm-_uxAZU", | |
"outputId": "e81ca817-498d-4197-f502-da60029521d5" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: geopy in /home/mark/anaconda3/lib/python3.6/site-packages (1.17.0)\n", | |
"Requirement already satisfied: geographiclib<2,>=1.49 in /home/mark/anaconda3/lib/python3.6/site-packages (from geopy) (1.49)\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install geopy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already up-to-date: bs4 in /home/mark/anaconda3/lib/python3.6/site-packages (0.0.1)\r\n", | |
"Requirement already satisfied, skipping upgrade: beautifulsoup4 in /home/mark/anaconda3/lib/python3.6/site-packages (from bs4) (4.6.3)\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install bs4 --upgrade" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 84, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: beautifulsoup4 in /home/mark/anaconda3/lib/python3.6/site-packages (4.6.3)\r\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install beautifulsoup4" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 85, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "SP1okPIYxMkq" | |
}, | |
"outputs": [], | |
"source": [ | |
"from requests import get\n", | |
"from contextlib import closing\n", | |
"from bs4 import BeautifulSoup\n", | |
"import bs4\n", | |
"import re\n", | |
"import regex\n", | |
"import pandas as pd\n", | |
"from geopy.distance import distance\n", | |
"import datetime" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'4.6.3'" | |
] | |
}, | |
"execution_count": 86, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bs4.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 87, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "u_gUpTtExfbI" | |
}, | |
"outputs": [], | |
"source": [ | |
"def simple_get(url):\n", | |
" \"\"\"\n", | |
" Attempts to get the content at `url` by making an HTTP GET request.\n", | |
" If the content-type of response is some kind of HTML/XML, return the\n", | |
" text content, otherwise return None.\n", | |
" \"\"\"\n", | |
" try:\n", | |
" with closing(get(url, stream=True)) as resp:\n", | |
" if is_good_response(resp):\n", | |
" return resp.content\n", | |
" else:\n", | |
" return None\n", | |
"\n", | |
" except RequestException as e:\n", | |
" log_error('Error during requests to {0} : {1}'.format(url, str(e)))\n", | |
" return None\n", | |
"\n", | |
"\n", | |
"def is_good_response(resp):\n", | |
" \"\"\"\n", | |
" Returns True if the response seems to be HTML, False otherwise.\n", | |
" \"\"\"\n", | |
" content_type = resp.headers['Content-Type'].lower()\n", | |
" return (resp.status_code == 200 \n", | |
" and content_type is not None \n", | |
" and content_type.find('html') > -1)\n", | |
"\n", | |
"\n", | |
"def log_error(e):\n", | |
" \"\"\"\n", | |
" It is always a good idea to log errors. \n", | |
" This function just prints them, but you can\n", | |
" make it do anything.\n", | |
" \"\"\"\n", | |
" print(e)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 420 | |
}, | |
"colab_type": "code", | |
"executionInfo": { | |
"elapsed": 530, | |
"status": "error", | |
"timestamp": 1543506310317, | |
"user": { | |
"displayName": "Mark Oliver", | |
"photoUrl": "https://lh6.googleusercontent.com/-GKraQDFe4E4/AAAAAAAAAAI/AAAAAAAAAKU/9y_HHyEVntY/s64/photo.jpg", | |
"userId": "10622184328554051131" | |
}, | |
"user_tz": 300 | |
}, | |
"id": "vEFKMCQPxzYG", | |
"outputId": "b4b8d592-905f-452d-815a-70fedd077c0f" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"30\n" | |
] | |
} | |
], | |
"source": [ | |
"#url = \"https://ftw.usatoday.com/2016/08/ranking-best-worst-mlb-stadiums-30-petco-wrigley-pnc\"\n", | |
"url = \"https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums\"\n", | |
"raw_html = simple_get(url)\n", | |
"html = BeautifulSoup(raw_html, 'html.parser')\n", | |
"\n", | |
"table = html.select('table')[1]\n", | |
"teams = []\n", | |
"stadium = None\n", | |
"for tr in table.select('tr'):\n", | |
" if tr.has_attr('style'):\n", | |
" # print('tr style', tr['style'])\n", | |
" tds = [td for td in tr.select('td')]\n", | |
" if len(tds) > 6:\n", | |
" teams.append((tds[5].a.text, tds[1].a.text))\n", | |
" else :\n", | |
" stadium = tr.find_next('th', {'scope': 'row'}).a.text\n", | |
" tds = [td for td in tr.select('td')]\n", | |
" if len(tds) > 5:\n", | |
" team = tds[4].a.text\n", | |
" teams.append((team, stadium))\n", | |
" if team == 'New York Yankees':\n", | |
" break \n", | |
"\n", | |
"\n", | |
"print(len(teams))\n", | |
"#print(teams)\n", | |
"# for team in teams:\n", | |
"# print(team)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "3e-Y8zTzx7jV" | |
}, | |
"outputs": [], | |
"source": [ | |
"gurl = \"https://en.wikipedia.org/wiki/\"\n", | |
"p = regex.compile(r\"(\\-?[\\d\\.]+)\\;\\s+(\\-?[\\d\\.]+)\")\n", | |
"lat = 0\n", | |
"long = 0\n", | |
"rows = []\n", | |
"# teams = [('Arizona Diamondbacks','Chase Field' )]\n", | |
"# teams = sorted(teams, key = lambda team: team[1])\n", | |
"for team, stadium in teams:\n", | |
" # print(team, stadium)\n", | |
" raw_html = simple_get(gurl + stadium.replace(' ', '_')).decode(\"utf-8\") \n", | |
" html = BeautifulSoup(raw_html, 'html.parser')\n", | |
" geo = html.select('span .geo')\n", | |
" # print(f\"geo: {geo}\")\n", | |
" m = regex.match(p, geo[0].text)\n", | |
" rows.append({'team': team, 'stadium': stadium, 'lat': float(m.group(1)), 'long': float(m.group(2))})\n", | |
"teamd = pd.DataFrame(rows) \n", | |
"# teamd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "jCY0OOcmyCQX" | |
}, | |
"outputs": [], | |
"source": [ | |
"def print_distance(winner, loser, rows):\n", | |
" w = teamd[teamd['team'] == winner]\n", | |
" l = teamd[teamd['team'] == loser]\n", | |
" if len(w.lat.values) == 0:\n", | |
" print('unknown winning team', winner)\n", | |
" if len(l.lat.values) == 0:\n", | |
" print('unknown losing team', loser) \n", | |
" d = distance((w.lat.values[0], w.long.values[0]),\n", | |
" (l.lat.values[0], l.long.values[0])).miles\n", | |
" rows.append({'year': year, 'winner': winner, 'loser': loser, 'distance': d})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
" [after 1957 season mlb moves west](https://en.wikipedia.org/wiki/Major_League_Baseball_relocation_of_1950s%E2%80%9360s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"code_folding": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"wsurl = \"https://en.wikipedia.org/wiki/List_of_World_Series_champions\"\n", | |
"raw_html = simple_get(wsurl)\n", | |
"html = BeautifulSoup(raw_html, 'html.parser')\n", | |
"win = True\n", | |
"winner = None\n", | |
"rows = []\n", | |
"year = datetime.datetime.now().year\n", | |
"for tr in html.select('tr'):\n", | |
" th = tr.find_next('th', attrs={'scope':'row'})\n", | |
" a = th.a\n", | |
" series_year = int(a.text)\n", | |
"# if series_year == year:\n", | |
" # necessary iff world series hasn't been decided yet this year (picked my team to win)\n", | |
" # edit in the teams\n", | |
"# print_distance('Boston Red Sox', 'Los Angeles Dodgers', rows)\n", | |
"# break\n", | |
" if series_year > 1958: # before 1958 no west coast teams so no maximum distance \n", | |
" for td in tr.select('td'):\n", | |
" if (td.has_attr('style')):\n", | |
" tda = td.a\n", | |
" if win:\n", | |
" winner = tda.text\n", | |
" if winner == 'Florida Marlins': # convert old name to current name\n", | |
" winner = 'Miami Marlins'\n", | |
" elif winner == 'Anaheim Angels': # convert old name to current name\n", | |
" winner = 'Los Angeles Angels'\n", | |
" else:\n", | |
" print_distance(winner, tda.text, rows)\n", | |
" win = False if win else True\n", | |
" if series_year == year:\n", | |
" # necessary iff world series has been decided this year\n", | |
" break\n", | |
"# print(len(rows))\n", | |
"distances = pd.DataFrame(rows)\n", | |
"# distances" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "li96iKyMyUCl" | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"distance 2593.77\n", | |
"loser Los Angeles Dodgers\n", | |
"winner Boston Red Sox\n", | |
"year 2018\n", | |
"Name: 58, dtype: object" | |
] | |
}, | |
"execution_count": 95, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"distances.loc[distances['distance'].idxmax()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"collapsed_sections": [], | |
"name": "Copy of Copy of mlb_ws_distance.ipynb", | |
"provenance": [ | |
{ | |
"file_id": "1VChAR0prxauP_xqr9h1rggEm2aAgj8x_", | |
"timestamp": 1543512835370 | |
}, | |
{ | |
"file_id": "19PZW-AZFvXQv2bBXWtj1XQuiJoT6iRXX", | |
"timestamp": 1540312807864 | |
} | |
], | |
"version": "0.3.2" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
}, | |
"latex_envs": { | |
"LaTeX_envs_menu_present": true, | |
"autoclose": false, | |
"autocomplete": true, | |
"bibliofile": "biblio.bib", | |
"cite_by": "apalike", | |
"current_citInitial": 1, | |
"eqLabelWithNumbers": true, | |
"eqNumInitial": 1, | |
"hotkeys": { | |
"equation": "Ctrl-E", | |
"itemize": "Ctrl-I" | |
}, | |
"labels_anchors": false, | |
"latex_user_defs": false, | |
"report_style_numbering": false, | |
"user_envs_cfg": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment