@cosmocatalano
Last active December 14, 2015 02:19
Quick-and-dirty web-to-JSON scrape to get info on your most recent beer. Should be readily adaptable.
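The script writes a single JSON object to standard output. Its shape follows the scrape_obj dictionary built below; the values here are illustrative placeholders only, not real output:

{"timestamp": "<check-in time>",
 "image": "<photo URL>",
 "score": 3.5,
 "user": ["http://untappd.com/...", "<link text>"],
 "beer": ["http://untappd.com/...", "<link text>"],
 "brewer": ["http://untappd.com/...", "<link text>"],
 "location": ["http://untappd.com/...", "<link text>"],
 "checkin": ["http://untappd.com/...", "<link text>"]}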
#!/usr/bin/python
from bs4 import BeautifulSoup #you will probably have to install this: http://www.crummy.com/software/BeautifulSoup/
import urllib2
import sys
import cgitb
import string
import json
#This takes a URL and turns it into a BeautifulSoup object
def make_soup(url):
    try:
        source = urllib2.urlopen(url).read()
        soup = BeautifulSoup(source)
        return soup
    except:
        print 'couldn\'t connect to source'
        sys.exit()
cgitb.enable(format='txt') #error reporting on, in text
print 'Content-Type: text/plain\n' #plain-text content type; the trailing \n adds the blank line required after the header
#Starting the connection - install an opener with a browser User-Agent so every urllib2.urlopen() call (including those inside make_soup) sends it
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib2.install_opener(opener)
#Time-saving variables
username = 'cosmocatalano' #you'll probably want to change this
u_url = 'http://untappd.com'
u_user = 'http://untappd.com/user/' + username
#Grabbing the data
search_soup = make_soup(u_user)
result = search_soup.find('div', 'details') #not .find_all because I'm just looking for the latest beer
#All the links in the most recent <div class="details">; a very close match for API data.
API_bits = result.find_all('a')
#Timestamp
timestamp = result.find('li', class_='timezoner')
#Getting the main page of the last beer checked-in
beer_page = make_soup(u_url + API_bits[4].get('href'))
#Getting the latest image, if one exists
try:
    beer_image = beer_page.find('div', class_='photo')
    image_url = beer_image.a.img['src']
except:
    beer_image = beer_page.find('span', class_='icon')
    image_url = beer_image.a.img['src']
#Getting the rating of the beer through some belabored contortions
rating_span = beer_page.find('span', class_='rating')
rating_classes = rating_span['class']
#A dictionary to map class to score
score_value = {'r05': 0.5,
               'r10': 1,
               'r15': 1.5,
               'r20': 2,
               'r25': 2.5,
               'r30': 3,
               'r35': 3.5,
               'r40': 4,
               'r45': 4.5,
               'r50': 5}
#This is a touch janky because it relies on the class indicating the rating to always be the third listed
my_score = score_value[rating_classes[2]]
#Let's give our links some names
count_to_name = ('user',
                 'beer',
                 'brewer',
                 'location',
                 'checkin',
                 'extra') #another janky move, safety for when there are six <a> tags in the <details> div
#This dictionary will eventually become our API object
scrape_obj = {'timestamp': timestamp.contents[0],
              'image': image_url,
              'score': my_score}
#Setting up a loop to deal with all our links from <div class="details">
for count, bit in enumerate(API_bits):
    links = [u_url + bit['href'], bit.contents[0]]
    scrape_obj[count_to_name[count]] = links
#Turning it into a JSON object for you to use as you see fit.
print json.dumps(scrape_obj)
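The script as written is Python 2 (urllib2, print statements). If you need it under Python 3, here is a minimal sketch of the fetch helper only, assuming BeautifulSoup 4 is installed; urllib2's role is taken over by urllib.request, and the browser User-Agent is set per request rather than through an opener:

#!/usr/bin/env python3
#Minimal Python 3 sketch of make_soup(); the rest of the scrape would also need its print statements converted to print() calls
import sys
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    #Fetch the page with a browser-style User-Agent and hand back parsed soup
    request = urllib.request.Request(url, headers={'User-agent': 'Mozilla/5.0'})
    try:
        source = urllib.request.urlopen(request).read()
    except Exception:
        print("couldn't connect to source")
        sys.exit(1)
    return BeautifulSoup(source, 'html.parser')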