Skip to content

Instantly share code, notes, and snippets.

@bdewilde
Created November 27, 2012 16:59
Show Gist options
  • Save bdewilde/4155490 to your computer and use it in GitHub Desktop.
Save bdewilde/4155490 to your computer and use it in GitHub Desktop.
Get a dictionary of X-Men names as keys and Wikipedia URLs as corresponding values
import requests
import bs4
# search Wikipedia for a subject or provide its URL, return the parsed HTML
# send a browser-like User-Agent (Mozilla/AppleWebKit) instead of the default one from requests
def wikipedia_search(subject, url=False, timeout=10):
    """Fetch a Wikipedia page and return it as parsed HTML.

    Args:
        subject: Search term sent to Wikipedia's ``index.php`` search
            endpoint. Ignored when ``url`` is given.
        url: Full URL of the page to fetch instead of searching. Any falsy
            value (the default ``False``, ``None`` or ``''``) means
            "search for ``subject``".
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot block the caller forever.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Browser-like User-Agent, sent in place of the default one from requests.
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko)'}
    if url:
        response = requests.get(url, headers=headers, timeout=timeout)
    else:
        response = requests.get('http://en.wikipedia.org/w/index.php',
                                params={'search': subject},
                                headers=headers,
                                timeout=timeout)
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    return bs4.BeautifulSoup(response.text, 'html.parser')
# search Wikipedia for the X-Men
# find list of members in side panel, return dictionary of names and URLs
def get_xmen():
    """Return the X-Men listed in the Wikipedia infobox as ``{name: URL}``.

    Searches Wikipedia for 'X-men', finds the ``Member(s)`` header cell in
    the page's ``infobox`` table and collects every link from the data cell
    that follows it.

    Returns:
        dict mapping each link's text to its ``href`` prefixed with
        ``http://en.wikipedia.org``. Empty when the infobox, the
        ``Member(s)`` header or its data cell is not present on the page.
    """
    soup = wikipedia_search('X-men')

    infobox = soup.find('table', class_='infobox')
    if infobox is None:
        return {}

    header = infobox.find('th', text='Member(s)')
    if header is None:
        return {}

    # Look the data cell up by tag instead of hopping a fixed number of
    # siblings, which only works when exactly one whitespace node sits
    # between the <th> and the <td>.
    cell = header.find_next_sibling('td')
    if cell is None:
        return {}

    # href=True skips anchors that carry no link target, which would
    # otherwise make the string concatenation fail on None.
    return {
        link.get_text(): 'http://en.wikipedia.org' + link['href']
        for link in cell.find_all('a', href=True)
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment