@cosmocatalano
Last active December 14, 2015 02:19
Quick-and-dirty web-to-JSON scrape to get info on your most recent beer. Should be readily adaptable.
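The script writes a single JSON object to standard output. Its shape follows the scrape_obj dictionary built below; the values here are illustrative placeholders only, not real output:

{"timestamp": "<check-in time>",
 "image": "<photo URL>",
 "score": 3.5,
 "user": ["http://untappd.com/...", "<link text>"],
 "beer": ["http://untappd.com/...", "<link text>"],
 "brewer": ["http://untappd.com/...", "<link text>"],
 "location": ["http://untappd.com/...", "<link text>"],
 "checkin": ["http://untappd.com/...", "<link text>"]}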
#!/usr/bin/python
from bs4 import BeautifulSoup #you will probably have to install this: http://www.crummy.com/software/BeautifulSoup/
import urllib2
import sys
import cgitb
import string
import json
#This takes a URL and turns it into a BeautifulSoup object
def make_soup(url):
    try:
        source = urllib2.urlopen(url).read()
        soup = BeautifulSoup(source)
        return soup
    except:
        print 'couldn\'t connect to source'
        sys.exit()
cgitb.enable(format='txt') #error reporting on, in text
print 'Content-Type: text/plain\n' #plain-text content type; the trailing \n adds the blank line required after the header
#Starting the connection - install an opener with a browser User-Agent so every urllib2.urlopen() call (including those inside make_soup) sends it
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib2.install_opener(opener)
#Time-saving variables
username = 'cosmocatalano' #you'll probably want to change this
u_url = 'http://untappd.com'
u_user = 'http://untappd.com/user/' + username
#Grabbing the data
search_soup = make_soup(u_user)
result = search_soup.find('div', 'details') #not .find_all because I'm just looking for the latest beer
#All the links in the most recent <div class="details">; a very close match for API data.
API_bits = result.find_all('a')
#Timestamp
timestamp = result.find('li', class_='timezoner')
#Getting the main page of the last beer checked-in
beer_page = make_soup(u_url + API_bits[4].get('href'))
#Getting the latest image, if one exists
try:
    beer_image = beer_page.find('div', class_='photo')
    image_url = beer_image.a.img['src']
except:
    beer_image = beer_page.find('span', class_='icon')
    image_url = beer_image.a.img['src']
#Getting the rating of the beer through some belabored contortions
rating_span = beer_page.find('span', class_='rating')
rating_classes = rating_span['class']
#A dictionary to map class to score
score_value = {'r05': 0.5,
               'r10': 1,
               'r15': 1.5,
               'r20': 2,
               'r25': 2.5,
               'r30': 3,
               'r35': 3.5,
               'r40': 4,
               'r45': 4.5,
               'r50': 5}
#This is a touch janky because it relies on the class indicating the rating to always be the third listed
my_score = score_value[rating_classes[2]]
#Let's give our links some names
count_to_name = ('user',
                 'beer',
                 'brewer',
                 'location',
                 'checkin',
                 'extra') #another janky move, safety for when there are six <a> tags in the <details> div
#This dictionary will eventually become our API object
scrape_obj = {'timestamp': timestamp.contents[0],
              'image': image_url,
              'score': my_score}
#Setting up a loop to deal with all our links from <div class="details">
for count, bit in enumerate(API_bits):
    links = [u_url + bit['href'], bit.contents[0]]
    scrape_obj[count_to_name[count]] = links
#Turning it into a JSON object for you to use as you see fit.
print json.dumps(scrape_obj)
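The script as written is Python 2 (urllib2, print statements). If you need it under Python 3, here is a minimal sketch of the fetch helper only, assuming BeautifulSoup 4 is installed; urllib2's role is taken over by urllib.request, and the browser User-Agent is set per request rather than through an opener:

#!/usr/bin/env python3
#Minimal Python 3 sketch of make_soup(); the rest of the scrape would also need its print statements converted to print() calls
import sys
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    #Fetch the page with a browser-style User-Agent and hand back parsed soup
    request = urllib.request.Request(url, headers={'User-agent': 'Mozilla/5.0'})
    try:
        source = urllib.request.urlopen(request).read()
    except Exception:
        print("couldn't connect to source")
        sys.exit(1)
    return BeautifulSoup(source, 'html.parser')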