-
-
Save pybokeh/fb51cca01f1c8ec66149 to your computer and use it in GitHub Desktop.
For_UsernamesArentClever
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:3a2534242d93241c77c4b42295ee127440c6737576bd9721d6509efc7430d47c" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import urllib.request as request\n", | |
"from bs4 import BeautifulSoup\n", | |
"import re\n", | |
"\n", | |
"base_url = 'http://espn.go.com'\n", | |
"\n", | |
"teams_url = 'http://espn.go.com/nba/teams'\n", | |
"html_teams = request.urlopen(teams_url)\n", | |
"\n", | |
"soup_teams = BeautifulSoup(html_teams)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))\n", | |
"urls" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"[<a href=\"/nba/teams/stats?team=bos\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=bkn\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=nyk\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=phi\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=tor\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=gsw\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=lac\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=lal\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=pho\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=sac\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=chi\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=cle\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=det\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=ind\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=mil\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=dal\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=hou\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=mem\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=nor\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=sas\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=atl\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=cha\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=mia\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=orl\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=was\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=den\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=min\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=okc\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=por\">Stats</a>,\n", | |
" <a href=\"/nba/teams/stats?team=uth\">Stats</a>]" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### `urls` above contains one `<a>` tag per team, each linking to that team's stats page" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### This is the first element of `urls`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"urls[0]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"<a href=\"/nba/teams/stats?team=bos\">Stats</a>" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### But I want ONLY the value of the `href` attribute. You get it with `urls[0]['href']`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"urls[0]['href']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"'/nba/teams/stats?team=bos'" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### To get the href values of all the links, just loop through `urls`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for url in urls:\n", | |
" print(url['href'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"/nba/teams/stats?team=bos\n", | |
"/nba/teams/stats?team=bkn\n", | |
"/nba/teams/stats?team=nyk\n", | |
"/nba/teams/stats?team=phi\n", | |
"/nba/teams/stats?team=tor\n", | |
"/nba/teams/stats?team=gsw\n", | |
"/nba/teams/stats?team=lac\n", | |
"/nba/teams/stats?team=lal\n", | |
"/nba/teams/stats?team=pho\n", | |
"/nba/teams/stats?team=sac\n", | |
"/nba/teams/stats?team=chi\n", | |
"/nba/teams/stats?team=cle\n", | |
"/nba/teams/stats?team=det\n", | |
"/nba/teams/stats?team=ind\n", | |
"/nba/teams/stats?team=mil\n", | |
"/nba/teams/stats?team=dal\n", | |
"/nba/teams/stats?team=hou\n", | |
"/nba/teams/stats?team=mem\n", | |
"/nba/teams/stats?team=nor\n", | |
"/nba/teams/stats?team=sas\n", | |
"/nba/teams/stats?team=atl\n", | |
"/nba/teams/stats?team=cha\n", | |
"/nba/teams/stats?team=mia\n", | |
"/nba/teams/stats?team=orl\n", | |
"/nba/teams/stats?team=was\n", | |
"/nba/teams/stats?team=den\n", | |
"/nba/teams/stats?team=min\n", | |
"/nba/teams/stats?team=okc\n", | |
"/nba/teams/stats?team=por\n", | |
"/nba/teams/stats?team=uth\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment