# alexpkeil1/scrape_newyorker.py

## scrape_newyorker.py
# coding: utf-8
# python-2.7
# new yorker caption contest
import requests
import bs4 # beautifulsoup
import mechanize
import cookielib
import getpass

def ny_url(num=500, pr=False):
	'''
	Build the archive URL for one New Yorker caption contest.

	num: contest id; it is converted with str() and appended to the
	"id=" query parameter.
	pr: when true, print the URL before returning it.

	Returns the URL as a string.
	'''
	# The original also computed an unused zero-padded copy of num
	# ("%02d" % num). It never reached the URL, so it has been dropped.
	theURL = "http://contest.newyorker.com/CaptionContest.aspx?tab=archive&id=" + str(num)
	if pr:
		print(theURL)
	return theURL


def get_ny_html(theURL, timeout=30):
	'''
	Download a page with an HTTP GET and return its body as text.

	theURL: address to fetch.
	timeout: seconds passed to requests.get as its timeout. Without a
	timeout the call can block indefinitely on an unresponsive server.

	Returns response.text for the fetched page.
	'''
	response = requests.get(theURL, timeout=timeout)
	return response.text


def get_ny_winner(caplist=None):
	'''
	Retrieve a list of the winning captions in the New Yorker caption contest
	Requires New Yorker login information

	caplist: iterable of contest ids to look up. Defaults to [500, 503]
	when omitted.

	Prompts on the terminal for the account email and password, submits
	them through a form on the first contest page, then opens each
	contest page and collects the text of every span whose id is
	"ContestArchive1_WinningCaption". Each caption is printed as it is
	found, and 'Done' is printed at the end.

	Returns the captions as a list, in the order found. An empty caplist
	returns [] without prompting or opening any page.
	'''
	# None stands in for the former mutable list default.
	if caplist is None:
		caplist = [500, 503]
	# Copy to a list so any iterable works and caplist[0] is valid below.
	caplist = list(caplist)
	if not caplist:
		# Nothing to fetch; previously this case raised IndexError.
		return []

	ret = []
	theURL = ny_url(num=caplist[0], pr=True)
	cj = cookielib.CookieJar()
	br = mechanize.Browser()
	br.set_cookiejar(cj)
	br.open(theURL)

	# Form index 2 is presumably the login form on the contest page;
	# verify against the live page if the site layout changes.
	br.select_form(nr=2)
	un = raw_input('Email: ')
	pw = getpass.getpass('Password: ')
	br.form['ContestLogin1$RegistrationEmail'] = un
	br.form['ContestLogin1$RegistrationPassword'] = str(pw)
	# NOTE(review): the page is opened over plain http, so the credentials
	# are presumably submitted unencrypted -- confirm whether https works.
	br.submit()

	# The same browser (and its cookie jar) is reused for every contest page.
	for cap in caplist:
		theURLc = ny_url(num=cap)
		thePAGE = br.open(theURLc)
		nySoup = bs4.BeautifulSoup(thePAGE, 'html.parser')
		ela = nySoup.find_all('span', attrs={'id': 'ContestArchive1_WinningCaption'})
		for el in ela:
			ret.append(el.text)
			print(el.text)
	print('Done')
	return ret


# Fetch the winning captions for contest ids 300 through 304.
winners = get_ny_winner(list(range(300, 305)))
	# coding: utf-8
	# python-2.7
	# new yorker caption contest
	import requests
	import bs4 # beautifulsoup
	import mechanize
	import cookielib
	import getpass

	def ny_url(num=500, pr=False):
		'''
		Build the archive URL for one New Yorker caption contest.

		num: contest id; it is converted with str() and appended to the
		"id=" query parameter.
		pr: when true, print the URL before returning it.

		Returns the URL as a string.
		'''
		# The original also computed an unused zero-padded copy of num
		# ("%02d" % num). It never reached the URL, so it has been dropped.
		theURL = "http://contest.newyorker.com/CaptionContest.aspx?tab=archive&id=" + str(num)
		if pr:
			print(theURL)
		return theURL


	def get_ny_html(theURL, timeout=30):
		'''
		Download a page with an HTTP GET and return its body as text.

		theURL: address to fetch.
		timeout: seconds passed to requests.get as its timeout. Without a
		timeout the call can block indefinitely on an unresponsive server.

		Returns response.text for the fetched page.
		'''
		response = requests.get(theURL, timeout=timeout)
		return response.text


	def get_ny_winner(caplist=None):
		'''
		Retrieve a list of the winning captions in the New Yorker caption contest
		Requires New Yorker login information

		caplist: iterable of contest ids to look up. Defaults to [500, 503]
		when omitted.

		Prompts on the terminal for the account email and password, submits
		them through a form on the first contest page, then opens each
		contest page and collects the text of every span whose id is
		"ContestArchive1_WinningCaption". Each caption is printed as it is
		found, and 'Done' is printed at the end.

		Returns the captions as a list, in the order found. An empty caplist
		returns [] without prompting or opening any page.
		'''
		# None stands in for the former mutable list default.
		if caplist is None:
			caplist = [500, 503]
		# Copy to a list so any iterable works and caplist[0] is valid below.
		caplist = list(caplist)
		if not caplist:
			# Nothing to fetch; previously this case raised IndexError.
			return []

		ret = []
		theURL = ny_url(num=caplist[0], pr=True)
		cj = cookielib.CookieJar()
		br = mechanize.Browser()
		br.set_cookiejar(cj)
		br.open(theURL)

		# Form index 2 is presumably the login form on the contest page;
		# verify against the live page if the site layout changes.
		br.select_form(nr=2)
		un = raw_input('Email: ')
		pw = getpass.getpass('Password: ')
		br.form['ContestLogin1$RegistrationEmail'] = un
		br.form['ContestLogin1$RegistrationPassword'] = str(pw)
		# NOTE(review): the page is opened over plain http, so the credentials
		# are presumably submitted unencrypted -- confirm whether https works.
		br.submit()

		# The same browser (and its cookie jar) is reused for every contest page.
		for cap in caplist:
			theURLc = ny_url(num=cap)
			thePAGE = br.open(theURLc)
			nySoup = bs4.BeautifulSoup(thePAGE, 'html.parser')
			ela = nySoup.find_all('span', attrs={'id': 'ContestArchive1_WinningCaption'})
			for el in ela:
				ret.append(el.text)
				print(el.text)
		print('Done')
		return ret


	# Fetch the winning captions for contest ids 300 through 304.
	winners = get_ny_winner(list(range(300, 305)))