@9b
Created October 28, 2014 19:19
Parse out reviews from www.coffeereview.com into a structured format.
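For reference, each review ends up as one dictionary that combines the listing metadata with the per-review details. A sketch of the shape, with made-up placeholder values (the leading spaces mirror what the split(':') parsing leaves behind):

# Illustrative record shape only; every value below is a placeholder.
sample = {
    'reviewLink': 'http://www.coffeereview.com/review/some-coffee/',
    'ratingNum': '93',
    'roaster': 'Example Roasting Co.',
    'coffee': 'Example Single Origin',
    'reviewDate': ' October 2014',
    'priceEst': ' $18.00/12 ounces',
    'location': ' Portland, Oregon',
    'origin': ' Southern Ethiopia',
    'roastStyle': ' Medium-Light',
    'scores': { 'aroma': '9', 'acidity': '8', 'body': '8', 'flavor': '9', 'aftertaste': '8' },
    'review': {
        'blindAssessment': '...',
        'notes': '...',
        'targetDrinker': '...',
    },
}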
import requests, cPickle, os
from BeautifulSoup import BeautifulSoup
from pymongo import MongoClient

headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' }
def pickleIt(fname, data):
    '''
    Take the contents of a variable and save it to the filesystem for later use
    @todo handle actual exceptions and consider tracking success
    @returns None
    '''
    try:
        # the with block closes the file automatically; no explicit close() needed
        with open(fname + ".pickle", "wb") as output_file:
            cPickle.dump(data, output_file)
    except Exception, e:
        print "Cannot open the file:", fname, e
    return
def extractPageReviews(content):
    '''
    Take the raw HTML content for a review listing page and extract all the reviews
    @returns a list of dictionaries describing each review
    '''
    extracted = []
    soup = BeautifulSoup(content)
    reviews = soup.findAll('div', { 'class': 'review-content' })
    for r in reviews:
        output = {}
        output['reviewLink'] = r.find('h2', { 'class': 'review-title' }).find('a')['href']
        output['ratingNum'] = r.find('div', { 'class': 'review-rating' }).string
        output['roaster'] = r.find('a').string
        output['coffee'] = r.find('h2', { 'class': 'review-title' }).find('a').string
        # the second column holds the review date and estimated price
        sideData = r.find('div', { 'class': 'review-col2' }).findAll('p')
        output['reviewDate'] = sideData[0].text.split(':')[1]
        output['priceEst'] = sideData[1].text.split(':')[1]
        extracted.append(output)
    return extracted
def extractSingleReview(content, fc):
    '''
    Take the raw HTML for a single review and parse out the actual details.
    @note Use the existing metadata to double-check our current results
    @returns an enriched dictionary for the review
    '''
    soup = BeautifulSoup(content)
    cdata = soup.find('div', { 'class': 'review-col1' })
    ratingNum = cdata.find('div', { 'class': 'review-rating' }).string
    if ratingNum != fc['ratingNum']:
        return fc  # return the original because something doesn't match
    # left column with the coffee details
    cmeta = cdata.findAll('p')
    fc['location'] = cmeta[0].text.split(':')[1]
    fc['origin'] = cmeta[1].text.split(':')[1]
    fc['roastStyle'] = cmeta[2].text.split(':')[1]
    # right column with the rating details
    rdata = soup.find('div', { 'class': 'review-col2' })
    rmeta = rdata.findAll('p')
    fc['scores'] = {}
    for category in rmeta[1:]:  # skip the first entry since it's a date
        entry = category.text.split(':')
        c = entry[0].strip().lower()
        fc['scores'][c] = entry[1].strip()
    # long paragraphs below the columns hold the main review text
    pelements = soup.findAll('p')
    extracted = []
    for ele in pelements:
        if len(ele.text) > 85:
            extracted.append(ele.text)
        elif ele.text.find('Who Should Drink') > -1:
            # handles the edge case on the last review aspect
            extracted.append(ele.text)
    extracted = [ x.replace('Blind Assessment:', '').replace('Notes:', '').replace('Who Should Drink It:', '') for x in extracted ]
    fc['review'] = {}
    fc['review']['blindAssessment'] = extracted[0]
    fc['review']['notes'] = extracted[1]
    fc['review']['targetDrinker'] = extracted[2]
    return fc
def collectLinks(depth):
    '''
    Take a crawl depth and use it to walk each page of reviews
    @returns a list of dictionary reviews
    '''
    reviews = []
    # pagination starts at page 1, so walk pages 1 through depth
    for i in range(1, depth + 1):
        base = 'http://www.coffeereview.com/review/page/%d/' % (i)
        response = requests.get(base, headers=headers)
        reviews += extractPageReviews(response.content)
    return reviews
def mongoConnect(host, port, database, collection):
    return MongoClient(host, port)[database][collection]
def main():
    depth = 81  # all of these pages have prices
    # reuse the cached listing data if it exists, otherwise crawl and cache it
    if os.path.isfile('metaReviews.pickle'):
        with open("metaReviews.pickle", "rb") as input_file:
            reviews = cPickle.load(input_file)
    else:
        reviews = collectLinks(depth)
        pickleIt('metaReviews', reviews)
    db = mongoConnect('127.0.0.1', 27017, 'coffee', 'reviews')
    for review in reviews:
        response = requests.get(review['reviewLink'], headers=headers)
        try:
            obj = extractSingleReview(response.content, review)
            db.insert(obj)
        except Exception, e:
            # log the failing review instead of silently swallowing the error
            print "Failed to parse review:", review, e

if __name__ == '__main__':
    main()
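Running this end to end needs requests, BeautifulSoup 3 (matching the import above), and pymongo installed, plus a MongoDB server listening on 127.0.0.1:27017. A quick way to spot-check the inserted documents afterwards, assuming those same defaults (the rating value in the query is just an example):

from pymongo import MongoClient

db = MongoClient('127.0.0.1', 27017)['coffee']['reviews']
print db.count()  # total number of stored reviews
for doc in db.find({ 'ratingNum': '95' }):  # example: look up reviews by rating string
    print doc['roaster'], '-', doc['coffee']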