Parse reviews from www.coffeereview.com into a structured format.
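For orientation, each parsed review ends up in MongoDB as a document shaped roughly like the sketch below. The field names come straight from the code; the sample values are hypothetical placeholders.

# Approximate shape of one stored review document (values hypothetical):
# {
#     'reviewLink': 'http://www.coffeereview.com/review/...',
#     'ratingNum': '93',
#     'roaster': '...',
#     'coffee': '...',
#     'reviewDate': '...',
#     'priceEst': '...',
#     'location': '...',
#     'origin': '...',
#     'roastStyle': '...',
#     'scores': {'aroma': '...', ...},
#     'review': {'blindAssessment': '...', 'notes': '...', 'targetDrinker': '...'}
# }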
import cPickle
import os

import requests
from BeautifulSoup import BeautifulSoup
from pymongo import MongoClient
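# Present a desktop-browser User-Agent so the site serves the same markup a
# regular visitor sees; sites commonly block or alter output for default
# library User-Agents.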
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' }
def pickleIt(fname, data):
    '''
    Take the contents of a variable and save it to the filesystem for later use
    @todo handle specific exceptions and consider tracking success
    @returns None
    '''
    try:
        # the with-block closes the file for us
        with open(fname + ".pickle", "wb") as output_file:
            cPickle.dump(data, output_file)
    except Exception, e:
        print "Cannot open the file:", fname, e
    return
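# e.g. pickleIt('metaReviews', reviews) writes metaReviews.pickle, which
# main() checks for on startup so the listing pages aren't re-crawled.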
def extractPageReviews(content):
    '''
    Take the raw HTML content for a review listing page and extract all the reviews
    @returns a list of dictionaries describing each review
    '''
    extracted = []
    soup = BeautifulSoup(content)
    reviews = soup.findAll('div', { 'class': 'review-content' })
    for r in reviews:
        output = {}
        output['reviewLink'] = r.find('h2', { 'class': 'review-title' }).find('a')['href']
        output['ratingNum'] = r.find('div', { 'class': 'review-rating' }).string
        output['roaster'] = r.find('a').string  # first link in the card is the roaster
        output['coffee'] = r.find('h2', { 'class': 'review-title' }).find('a').string
        # right-hand column rows are 'Label: value' pairs; first is the date, second the price
        sideData = r.find('div', { 'class': 'review-col2' }).findAll('p')
        output['reviewDate'] = sideData[0].text.split(':')[1]
        output['priceEst'] = sideData[1].text.split(':')[1]
        extracted.append(output)
    return extracted
def extractSingleReview(content, fc):
    '''
    Take the raw HTML for each review and parse out the actual details.
    @note Use the existing metadata to double-check our current results
    @returns an enriched dictionary for each review
    '''
    soup = BeautifulSoup(content)
    cdata = soup.find('div', { 'class': 'review-col1' })
    ratingNum = cdata.find('div', { 'class': 'review-rating' }).string
    if ratingNum != fc['ratingNum']:
        return fc  # return the original because something doesn't match
    # left column with the coffee details
    cmeta = cdata.findAll('p')
    fc['location'] = cmeta[0].text.split(':')[1]
    fc['origin'] = cmeta[1].text.split(':')[1]
    fc['roastStyle'] = cmeta[2].text.split(':')[1]
    # right column with the rating details
    rdata = soup.find('div', { 'class': 'review-col2' })
    rmeta = rdata.findAll('p')
    fc['scores'] = {}
    for category in rmeta[1:]:  # skip the first since it's a date
        entry = category.text.split(':')
        c = entry[0].strip().lower()
        fc['scores'][c] = entry[1].strip()
    # the main review body: any sufficiently long paragraph is review prose
    pelements = soup.findAll('p')
    extracted = []
    for ele in pelements:
        if len(ele.text) > 85:
            extracted.append(ele.text)
        elif ele.text.find('Who Should Drink') > -1:
            # handles the edge case where the last review aspect is short
            extracted.append(ele.text)
    extracted = [ x.replace('Blind Assessment:', '').replace('Notes:', '').replace('Who Should Drink It:', '') for x in extracted ]
    fc['review'] = {}
    fc['review']['blindAssessment'] = extracted[0]
    fc['review']['notes'] = extracted[1]
    fc['review']['targetDrinker'] = extracted[2]
    return fc
def collectLinks(depth):
    '''
    Take a crawl depth and use it to walk each page of reviews
    @returns a list of dictionary reviews
    '''
    reviews = []
    for i in range(1, depth + 1):  # listing pages are 1-indexed
        base = 'http://www.coffeereview.com/review/page/%d/' % i
        response = requests.get(base, headers=headers)
        reviews += extractPageReviews(response.content)
    return reviews
def mongoConnect(host, port, database, collection):
    return MongoClient(host, port)[database][collection]
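# e.g. db = mongoConnect('127.0.0.1', 27017, 'coffee', 'reviews') returns a
# pymongo Collection handle, whose insert() is used in main() below.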
def main():
    depth = 81  # all of these have prices
    if os.path.isfile('metaReviews.pickle'):
        with open("metaReviews.pickle", "rb") as input_file:
            reviews = cPickle.load(input_file)
    else:
        reviews = collectLinks(depth)
        pickleIt('metaReviews', reviews)
    db = mongoConnect('127.0.0.1', 27017, 'coffee', 'reviews')
    for review in reviews:
        response = requests.get(review['reviewLink'], headers=headers)
        try:
            obj = extractSingleReview(response.content, review)
            db.insert(obj)
        except Exception, e:
            print review, e

if __name__ == '__main__':
    main()
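To run this end to end you need Python 2 with requests, BeautifulSoup 3, and pymongo installed, plus a MongoDB instance listening on 127.0.0.1:27017. The first run crawls every listing page and caches the metadata in metaReviews.pickle; subsequent runs load the cache and go straight to fetching the individual reviews.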