Created
November 27, 2012 16:59
-
-
Save bdewilde/4155490 to your computer and use it in GitHub Desktop.
Get a dictionary of X-Men names as keys and Wikipedia URLs as corresponding values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
# Search Wikipedia for a subject, or fetch a page directly when its URL is provided; return the parsed HTML.
# Spoof the User-Agent so the request looks like it comes from a desktop browser :)
def wikipedia_search(subject, url=False):
    """Fetch a Wikipedia page and return it as parsed HTML.

    When ``url`` is not given, ``subject`` is sent as the ``search`` query
    parameter to ``http://en.wikipedia.org/w/index.php``. When ``url`` is
    given, that URL is fetched directly and ``subject`` is not used.

    Args:
        subject: Search term passed to Wikipedia's ``index.php``.
        url: Optional URL to fetch instead of searching. Any falsy value
            (the default ``False``, ``None`` or ``''``) selects the search.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response text.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout.
    """
    # Browser-like User-Agent, shared by both request paths.
    headers = {
        'User-agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) '
            'AppleWebKit/537.11 (KHTML, like Gecko)'
        ),
    }
    # Without a timeout a stalled connection would block the caller forever.
    timeout = 30

    if url:
        response = requests.get(url, headers=headers, timeout=timeout)
    else:
        response = requests.get(
            'http://en.wikipedia.org/w/index.php',
            params={'search': subject},
            headers=headers,
            timeout=timeout,
        )

    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    return bs4.BeautifulSoup(response.text, 'html.parser')
# Search Wikipedia for the X-Men.
# Find the list of members in the side panel (infobox); return a dictionary of names and URLs.
def get_xmen():
    """Return a dict mapping X-Men member names to Wikipedia URLs.

    Searches Wikipedia for 'X-men', locates the ``infobox`` table, finds the
    header cell whose text is 'Member(s)', and collects every link in the
    value cell that follows it.

    Returns:
        A dict whose keys are the link texts and whose values are the link
        targets prefixed with ``http://en.wikipedia.org``. An empty dict is
        returned when the infobox, the 'Member(s)' header, or its value cell
        cannot be found on the page.
    """
    soup = wikipedia_search('X-men')

    infobox = soup.find('table', class_='infobox')
    if infobox is None:
        return {}

    header = infobox.find('th', text='Member(s)')
    if header is None:
        return {}

    # Look for the following <td> by tag rather than hopping over a fixed
    # number of siblings, so whitespace nodes between cells do not matter.
    cell = header.find_next_sibling('td')
    if cell is None:
        return {}

    xmen = {}
    for link in cell.find_all('a'):
        href = link.get('href')
        # Only site-relative paths can be validly prefixed with the host;
        # skip anchors without href, fragments, and absolute or
        # protocol-relative URLs.
        if not href or not href.startswith('/') or href.startswith('//'):
            continue
        xmen[link.get_text()] = 'http://en.wikipedia.org' + href
    return xmen
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment