Skip to content

Instantly share code, notes, and snippets.

@invegat
Created December 14, 2018 15:56
Show Gist options
  • Save invegat/a8b9db107c5e76563012b209803753c9 to your computer and use it in GitHub Desktop.
Save invegat/a8b9db107c5e76563012b209803753c9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 232
},
"colab_type": "code",
"executionInfo": {
"elapsed": 9973,
"status": "ok",
"timestamp": 1543497573910,
"user": {
"displayName": "Mark Oliver",
"photoUrl": "https://lh6.googleusercontent.com/-GKraQDFe4E4/AAAAAAAAAAI/AAAAAAAAAKU/9y_HHyEVntY/s64/photo.jpg",
"userId": "10622184328554051131"
},
"user_tz": 300
},
"id": "ujHqm-_uxAZU",
"outputId": "e81ca817-498d-4197-f502-da60029521d5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: geopy in /home/mark/anaconda3/lib/python3.6/site-packages (1.17.0)\n",
"Requirement already satisfied: geographiclib<2,>=1.49 in /home/mark/anaconda3/lib/python3.6/site-packages (from geopy) (1.49)\n"
]
}
],
"source": [
"%pip install geopy"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already up-to-date: bs4 in /home/mark/anaconda3/lib/python3.6/site-packages (0.0.1)\r\n",
"Requirement already satisfied, skipping upgrade: beautifulsoup4 in /home/mark/anaconda3/lib/python3.6/site-packages (from bs4) (4.6.3)\r\n"
]
}
],
"source": [
"%pip install --upgrade bs4"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: beautifulsoup4 in /home/mark/anaconda3/lib/python3.6/site-packages (4.6.3)\r\n"
]
}
],
"source": [
"%pip install beautifulsoup4"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SP1okPIYxMkq"
},
"outputs": [],
"source": [
"import datetime\n",
"import re\n",
"from contextlib import closing\n",
"\n",
"import bs4\n",
"import pandas as pd\n",
"import regex\n",
"from bs4 import BeautifulSoup\n",
"from geopy.distance import distance\n",
"from requests import get\n",
"from requests.exceptions import RequestException"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'4.6.3'"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bs4.__version__"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "u_gUpTtExfbI"
},
"outputs": [],
"source": [
"def simple_get(url, timeout=30):\n",
"    \"\"\"\n",
"    Attempts to get the content at `url` by making an HTTP GET request.\n",
"    If the response has status 200 and an HTML-like Content-Type (see\n",
"    `is_good_response`), return the raw body as bytes, otherwise return None.\n",
"\n",
"    `timeout` is the number of seconds handed to `requests.get` so that a\n",
"    stalled server cannot hang the notebook; pass None to wait indefinitely.\n",
"\n",
"    Request failures are reported through `log_error` and also yield None.\n",
"    \"\"\"\n",
"    try:\n",
"        # stream=True postpones the body download until `.content` is read,\n",
"        # so a rejected response is closed without fetching its body.\n",
"        with closing(get(url, stream=True, timeout=timeout)) as resp:\n",
"            if is_good_response(resp):\n",
"                return resp.content\n",
"            return None\n",
"\n",
"    except RequestException as e:\n",
"        log_error('Error during requests to {0} : {1}'.format(url, str(e)))\n",
"        return None\n",
"\n",
"\n",
"def is_good_response(resp):\n",
"    \"\"\"\n",
"    Returns True if the response seems to be HTML, False otherwise.\n",
"\n",
"    A response qualifies when its status code is 200 and its Content-Type\n",
"    header contains 'html' (compared case-insensitively). A response with\n",
"    no Content-Type header is reported as not HTML instead of raising\n",
"    KeyError.\n",
"    \"\"\"\n",
"    content_type = (resp.headers.get('Content-Type') or '').lower()\n",
"    return resp.status_code == 200 and 'html' in content_type\n",
"\n",
"\n",
"def log_error(e):\n",
"    \"\"\"\n",
"    Report the error `e` by printing it.\n",
"\n",
"    Kept as its own function so the reporting mechanism can be replaced\n",
"    later without touching the callers.\n",
"    \"\"\"\n",
"    print(e)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 420
},
"colab_type": "code",
"executionInfo": {
"elapsed": 530,
"status": "error",
"timestamp": 1543506310317,
"user": {
"displayName": "Mark Oliver",
"photoUrl": "https://lh6.googleusercontent.com/-GKraQDFe4E4/AAAAAAAAAAI/AAAAAAAAAKU/9y_HHyEVntY/s64/photo.jpg",
"userId": "10622184328554051131"
},
"user_tz": 300
},
"id": "vEFKMCQPxzYG",
"outputId": "b4b8d592-905f-452d-815a-70fedd077c0f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30\n"
]
}
],
"source": [
"# Collect (team, stadium) pairs from Wikipedia's list of current MLB stadiums.\n",
"url = \"https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums\"\n",
"raw_html = simple_get(url)\n",
"if raw_html is None:\n",
"    # simple_get returns None for failed or non-HTML responses; stop with a\n",
"    # clear message instead of handing None to BeautifulSoup.\n",
"    raise RuntimeError('could not download ' + url)\n",
"html = BeautifulSoup(raw_html, 'html.parser')\n",
"\n",
"# The stadium list is taken from the second <table> on the page.\n",
"table = html.select('table')[1]\n",
"teams = []\n",
"stadium = None\n",
"for tr in table.select('tr'):\n",
"    tds = tr.select('td')\n",
"    if tr.has_attr('style'):\n",
"        # Rows with a style attribute: stadium name is read from the second\n",
"        # <td>, team name from the sixth.\n",
"        if len(tds) > 6:\n",
"            teams.append((tds[5].a.text, tds[1].a.text))\n",
"    else:\n",
"        # Other rows: stadium name comes from the next <th scope=\"row\">,\n",
"        # team name from the fifth <td>.\n",
"        stadium = tr.find_next('th', {'scope': 'row'}).a.text\n",
"        if len(tds) > 5:\n",
"            team = tds[4].a.text\n",
"            teams.append((team, stadium))\n",
"            # The Yankees row is the last one wanted. The test sits right\n",
"            # after `team` is assigned so it can never read an unbound name.\n",
"            if team == 'New York Yankees':\n",
"                break\n",
"\n",
"print(len(teams))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3e-Y8zTzx7jV"
},
"outputs": [],
"source": [
"# Look up each stadium's coordinates on its own Wikipedia page.\n",
"gurl = \"https://en.wikipedia.org/wiki/\"\n",
"# Two signed decimal numbers separated by ';' and whitespace: latitude, longitude.\n",
"p = regex.compile(r\"(\\-?[\\d\\.]+)\\;\\s+(\\-?[\\d\\.]+)\")\n",
"rows = []\n",
"for team, stadium in teams:\n",
"    page_url = gurl + stadium.replace(' ', '_')\n",
"    raw_html = simple_get(page_url)\n",
"    if raw_html is None:\n",
"        # simple_get returns None on failure; skip this stadium rather\n",
"        # than crash on None.decode().\n",
"        log_error('no page for {0} ({1})'.format(stadium, page_url))\n",
"        continue\n",
"    html = BeautifulSoup(raw_html.decode(\"utf-8\"), 'html.parser')\n",
"    geo = html.select('span .geo')\n",
"    # Guard both the missing element and the failed match so one odd page\n",
"    # does not abort the whole loop with an IndexError/AttributeError.\n",
"    m = p.match(geo[0].text) if geo else None\n",
"    if m is None:\n",
"        log_error('no coordinates found for {0} ({1})'.format(stadium, page_url))\n",
"        continue\n",
"    rows.append({'team': team, 'stadium': stadium, 'lat': float(m.group(1)), 'long': float(m.group(2))})\n",
"# Explicit columns keep the frame usable (and its layout fixed) even if no\n",
"# stadium could be resolved.\n",
"teamd = pd.DataFrame(rows, columns=['team', 'stadium', 'lat', 'long'])"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jCY0OOcmyCQX"
},
"outputs": [],
"source": [
"def print_distance(winner, loser, rows, series_year=None):\n",
"    \"\"\"\n",
"    Append the distance between two teams' home stadiums to `rows`.\n",
"\n",
"    `winner` and `loser` are looked up by name in the module-level `teamd`\n",
"    frame, and the distance in miles between their stadium coordinates is\n",
"    computed with geopy's `distance`. On success one dict with the keys\n",
"    'year', 'winner', 'loser' and 'distance' is appended to `rows`.\n",
"    Nothing is returned.\n",
"\n",
"    `series_year` is the year stored in the row. When it is omitted the\n",
"    module-level `year` is used, which is what three-argument callers\n",
"    rely on.\n",
"\n",
"    If either team is missing from `teamd`, a message is printed for each\n",
"    unknown team and no row is appended.\n",
"    \"\"\"\n",
"    winner_rows = teamd[teamd['team'] == winner]\n",
"    loser_rows = teamd[teamd['team'] == loser]\n",
"    known = True\n",
"    if len(winner_rows) == 0:\n",
"        print('unknown winning team', winner)\n",
"        known = False\n",
"    if len(loser_rows) == 0:\n",
"        print('unknown losing team', loser)\n",
"        known = False\n",
"    if not known:\n",
"        # Without coordinates for both teams there is nothing to measure;\n",
"        # indexing the empty selection would raise IndexError.\n",
"        return\n",
"    miles = distance((winner_rows['lat'].values[0], winner_rows['long'].values[0]),\n",
"                     (loser_rows['lat'].values[0], loser_rows['long'].values[0])).miles\n",
"    row_year = year if series_year is None else series_year\n",
"    rows.append({'year': row_year, 'winner': winner, 'loser': loser, 'distance': miles})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[After the 1957 season, MLB moved west](https://en.wikipedia.org/wiki/Major_League_Baseball_relocation_of_1950s%E2%80%9360s)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"code_folding": []
},
"outputs": [],
"source": [
"# Build one row per World Series with the distance between the two home stadiums.\n",
"wsurl = \"https://en.wikipedia.org/wiki/List_of_World_Series_champions\"\n",
"raw_html = simple_get(wsurl)\n",
"if raw_html is None:\n",
"    # simple_get returns None for failed or non-HTML responses.\n",
"    raise RuntimeError('could not download ' + wsurl)\n",
"html = BeautifulSoup(raw_html, 'html.parser')\n",
"\n",
"# Names used in the results table that differ from the names in `teamd`.\n",
"former_names = {\n",
"    'Florida Marlins': 'Miami Marlins',\n",
"    'Anaheim Angels': 'Los Angeles Angels',\n",
"}\n",
"win = True  # True while the next styled <td> holds the winner, False for the loser\n",
"winner = None\n",
"rows = []\n",
"current_year = datetime.datetime.now().year\n",
"for tr in html.select('tr'):\n",
"    th = tr.find_next('th', attrs={'scope': 'row'})\n",
"    if th is None:\n",
"        # No row header follows this row, so there are no more series to read.\n",
"        break\n",
"    if th.a is None:\n",
"        continue\n",
"    try:\n",
"        series_year = int(th.a.text)\n",
"    except ValueError:\n",
"        # Row header that is not a season year; not a series row.\n",
"        continue\n",
"    if series_year > 1958:  # before 1958 no west coast teams so no maximum distance\n",
"        for td in tr.select('td'):\n",
"            if td.has_attr('style'):\n",
"                name = former_names.get(td.a.text, td.a.text)\n",
"                if win:\n",
"                    winner = name\n",
"                else:\n",
"                    # print_distance stores the module-level `year` in the\n",
"                    # row, so point it at the series being processed;\n",
"                    # otherwise every row would carry the current year.\n",
"                    year = series_year\n",
"                    print_distance(winner, name, rows)\n",
"                win = not win\n",
"    if series_year == current_year:\n",
"        # This year's series has been handled; later rows are not needed.\n",
"        break\n",
"distances = pd.DataFrame(rows)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "li96iKyMyUCl"
},
"outputs": [
{
"data": {
"text/plain": [
"distance 2593.77\n",
"loser Los Angeles Dodgers\n",
"winner Boston Red Sox\n",
"year 2018\n",
"Name: 58, dtype: object"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distances.loc[distances['distance'].idxmax()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Copy of Copy of mlb_ws_distance.ipynb",
"provenance": [
{
"file_id": "1VChAR0prxauP_xqr9h1rggEm2aAgj8x_",
"timestamp": 1543512835370
},
{
"file_id": "19PZW-AZFvXQv2bBXWtj1XQuiJoT6iRXX",
"timestamp": 1540312807864
}
],
"version": "0.3.2"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment