Last active
April 20, 2016 15:38
-
-
Save alexpkeil1/341fc85e22f1b15591f7d471d5e54c79 to your computer and use it in GitHub Desktop.
scrape_newyorker.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# python-2.7 | |
# New Yorker caption contest: scrape the winning captions from the contest archive
import requests | |
import bs4 # beautifulsoup | |
import mechanize | |
import cookielib | |
import getpass | |
def ny_url(num=500, pr=False):
    '''
    Build the archive URL for one New Yorker caption contest.

    num: contest id; it is converted with str() and appended to the
         ``id`` query parameter, so it is not zero-padded.
    pr:  when true, the URL is also printed.

    Returns the URL as a string.
    '''
    theURL = "http://contest.newyorker.com/CaptionContest.aspx?tab=archive&id=" + str(num)
    if pr:
        print(theURL)
    return theURL
def get_ny_html(theURL, timeout=30):
    '''
    Fetch a page with an HTTP GET request and return its body as text.

    theURL:  address to request.
    timeout: seconds handed to requests.get so a stalled server cannot
             block the caller forever; pass None to wait indefinitely.

    Returns the decoded response body (``response.text``). The HTTP status
    code is not inspected, so the body of an error page is returned as-is.
    Exceptions raised by requests.get (including timeouts) propagate.
    '''
    response = requests.get(theURL, timeout=timeout)
    return response.text
def get_ny_winner(caplist=(500, 503)):
    '''
    Retrieve a list of the winning captions in the New Yorker caption contest
    Requires New Yorker login information

    caplist: iterable of contest ids to look up. It is copied into a list
             first, so one-shot iterables are accepted as well.

    Prompts for an email address and password, logs in through the first
    contest's archive page, then opens each contest page and collects the
    text of every <span id="ContestArchive1_WinningCaption"> element. Each
    caption is printed as it is found, followed by 'Done'.

    Returns the list of caption strings. An empty caplist returns [] without
    prompting for credentials or opening any page.
    '''
    contests = list(caplist)
    ret = []
    if not contests:
        # Nothing to look up: skip the login round-trip entirely.
        print('Done')
        return ret

    # Log in once via the first contest's page; the cookie jar keeps the
    # session for the requests made in the loop below.
    theURL = ny_url(num=contests[0], pr=True)
    cj = cookielib.CookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.open(theURL)
    # Form index 2 is presumably the login form on that page -- TODO confirm
    # if the site layout changes.
    br.select_form(nr=2)
    un = raw_input('Email: ')
    pw = getpass.getpass('Password: ')
    br.form['ContestLogin1$RegistrationEmail'] = un
    br.form['ContestLogin1$RegistrationPassword'] = str(pw)
    br.submit()

    for cap in contests:
        thePAGE = br.open(ny_url(num=cap))
        nySoup = bs4.BeautifulSoup(thePAGE, 'html.parser')
        ela = nySoup.find_all('span', attrs={'id': 'ContestArchive1_WinningCaption'})
        for el in ela:
            ret.append(el.text)
            print(el.text)
    print('Done')
    return ret
# Script entry: collect the winning captions for contests 300 through 304.
winners = get_ny_winner(list(range(300, 305)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment