Skip to content

Instantly share code, notes, and snippets.

@alexpkeil1
Last active April 20, 2016 15:38
Show Gist options
  • Save alexpkeil1/341fc85e22f1b15591f7d471d5e54c79 to your computer and use it in GitHub Desktop.
Save alexpkeil1/341fc85e22f1b15591f7d471d5e54c79 to your computer and use it in GitHub Desktop.
scrape_newyorker.py
# coding: utf-8
# Requires Python 2.7 (uses raw_input and cookielib).
# Scrapes winning captions from the New Yorker caption contest archive.
import requests
import bs4 # beautifulsoup
import mechanize
import cookielib
import getpass
def ny_url(num=500, pr=False):
    """Build the archive URL for one New Yorker caption contest.

    Args:
        num: Contest id placed in the ``id`` query parameter; it is
            converted with ``str``, so ints and numeric strings both work.
        pr: When true, print the URL before returning it.

    Returns:
        The archive page URL as a string.
    """
    theURL = "http://contest.newyorker.com/CaptionContest.aspx?tab=archive&id=" + str(num)
    if pr:
        print(theURL)
    return theURL
def get_ny_html(theURL):
    """Issue an HTTP GET for theURL and return the response body as text."""
    return requests.get(theURL).text
def get_ny_winner(caplist=None):
    """
    Retrieve a list of the winning captions in the New Yorker caption contest.

    Requires New Yorker login information, which is prompted for
    interactively (email via raw_input, password via getpass).

    Args:
        caplist: Iterable of contest ids to look up. Defaults to
            ``[500, 503]`` when omitted.

    Returns:
        A list with the text of every winning-caption element found, in
        the order the contests were visited. Empty if caplist is empty.
    """
    # A None sentinel replaces the mutable list default; list() also lets
    # callers pass any iterable (tuple, generator, ...).
    contest_ids = [500, 503] if caplist is None else list(caplist)
    ret = []
    if not contest_ids:
        # Nothing to fetch: skip the login prompt instead of failing on
        # contest_ids[0] below.
        print('Done')
        return ret

    # The first contest page is opened only to reach the login form.
    theURL = ny_url(num=contest_ids[0], pr=True)
    cj = cookielib.CookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open(theURL)
    # NOTE(review): form index 2 is presumably the login form on the
    # contest page -- confirm against the live markup.
    br.select_form(nr=2)
    un = raw_input('Email: ')
    pw = getpass.getpass('Password: ')
    br.form['ContestLogin1$RegistrationEmail'] = un
    br.form['ContestLogin1$RegistrationPassword'] = str(pw)
    br.submit()

    # Reuse the logged-in browser session for every archive page.
    for cap in contest_ids:
        theURLc = ny_url(num=cap)
        thePAGE = br.open(theURLc)
        nySoup = bs4.BeautifulSoup(thePAGE, 'html.parser')
        ela = nySoup.find_all('span', attrs={'id': 'ContestArchive1_WinningCaption'})
        for el in ela:
            ret.append(el.text)
            print(el.text)
    print('Done')
    return ret
# Script entry: fetch the winning captions for contests 300 through 304.
winners = get_ny_winner(list(range(300, 305)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment