Parse reviews from www.coffeereview.com into a structured format.
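For orientation, each parsed review ends up in MongoDB as a document shaped roughly like the sketch below. The field names come straight from the code; the sample values are hypothetical placeholders.

# Approximate shape of one stored review document (values hypothetical):
# {
#     'reviewLink': 'http://www.coffeereview.com/review/...',
#     'ratingNum': '93',
#     'roaster': '...',
#     'coffee': '...',
#     'reviewDate': '...',
#     'priceEst': '...',
#     'location': '...',
#     'origin': '...',
#     'roastStyle': '...',
#     'scores': {'aroma': '...', ...},
#     'review': {'blindAssessment': '...', 'notes': '...', 'targetDrinker': '...'}
# }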
import cPickle
import os

import requests
from BeautifulSoup import BeautifulSoup
from pymongo import MongoClient
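# Present a desktop-browser User-Agent so the site serves the same markup a
# regular visitor sees; sites commonly block or alter output for default
# library User-Agents.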
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36' }
def pickleIt(fname, data):
    '''
    Take the contents of a variable and save it to the filesystem for later use
    @todo handle specific exceptions and consider tracking success
    @returns None
    '''
    try:
        # the with-block closes the file for us
        with open(fname + ".pickle", "wb") as output_file:
            cPickle.dump(data, output_file)
    except Exception, e:
        print "Cannot open the file:", fname, e
    return
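# e.g. pickleIt('metaReviews', reviews) writes metaReviews.pickle, which
# main() checks for on startup so the listing pages aren't re-crawled.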
def extractPageReviews(content):
    '''
    Take the raw HTML content for a review listing page and extract all the reviews
    @returns a list of dictionaries describing each review
    '''
    extracted = []
    soup = BeautifulSoup(content)
    reviews = soup.findAll('div', { 'class': 'review-content' })
    for r in reviews:
        output = {}
        output['reviewLink'] = r.find('h2', { 'class': 'review-title' }).find('a')['href']
        output['ratingNum'] = r.find('div', { 'class': 'review-rating' }).string
        output['roaster'] = r.find('a').string  # first link in the card is the roaster
        output['coffee'] = r.find('h2', { 'class': 'review-title' }).find('a').string
        # right-hand column rows are 'Label: value' pairs; first is the date, second the price
        sideData = r.find('div', { 'class': 'review-col2' }).findAll('p')
        output['reviewDate'] = sideData[0].text.split(':')[1]
        output['priceEst'] = sideData[1].text.split(':')[1]
        extracted.append(output)
    return extracted
def extractSingleReview(content, fc):
    '''
    Take the raw HTML for each review and parse out the actual details.
    @note Use the existing metadata to double-check our current results
    @returns an enriched dictionary for each review
    '''
    soup = BeautifulSoup(content)
    cdata = soup.find('div', { 'class': 'review-col1' })
    ratingNum = cdata.find('div', { 'class': 'review-rating' }).string
    if ratingNum != fc['ratingNum']:
        return fc  # return the original because something doesn't match
    # left column with the coffee details
    cmeta = cdata.findAll('p')
    fc['location'] = cmeta[0].text.split(':')[1]
    fc['origin'] = cmeta[1].text.split(':')[1]
    fc['roastStyle'] = cmeta[2].text.split(':')[1]
    # right column with the rating details
    rdata = soup.find('div', { 'class': 'review-col2' })
    rmeta = rdata.findAll('p')
    fc['scores'] = {}
    for category in rmeta[1:]:  # skip the first since it's a date
        entry = category.text.split(':')
        c = entry[0].strip().lower()
        fc['scores'][c] = entry[1].strip()
    # the main review body: any sufficiently long paragraph is review prose
    pelements = soup.findAll('p')
    extracted = []
    for ele in pelements:
        if len(ele.text) > 85:
            extracted.append(ele.text)
        elif ele.text.find('Who Should Drink') > -1:
            # handles the edge case where the last review aspect is short
            extracted.append(ele.text)
    extracted = [ x.replace('Blind Assessment:', '').replace('Notes:', '').replace('Who Should Drink It:', '') for x in extracted ]
    fc['review'] = {}
    fc['review']['blindAssessment'] = extracted[0]
    fc['review']['notes'] = extracted[1]
    fc['review']['targetDrinker'] = extracted[2]
    return fc
def collectLinks(depth):
    '''
    Take a crawl depth and use it to walk each page of reviews
    @returns a list of dictionary reviews
    '''
    reviews = []
    for i in range(1, depth + 1):  # listing pages are 1-indexed
        base = 'http://www.coffeereview.com/review/page/%d/' % i
        response = requests.get(base, headers=headers)
        reviews += extractPageReviews(response.content)
    return reviews
def mongoConnect(host, port, database, collection):
    return MongoClient(host, port)[database][collection]
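# e.g. db = mongoConnect('127.0.0.1', 27017, 'coffee', 'reviews') returns a
# pymongo Collection handle, whose insert() is used in main() below.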
def main():
    depth = 81  # all of these have prices
    if os.path.isfile('metaReviews.pickle'):
        with open("metaReviews.pickle", "rb") as input_file:
            reviews = cPickle.load(input_file)
    else:
        reviews = collectLinks(depth)
        pickleIt('metaReviews', reviews)
    db = mongoConnect('127.0.0.1', 27017, 'coffee', 'reviews')
    for review in reviews:
        response = requests.get(review['reviewLink'], headers=headers)
        try:
            obj = extractSingleReview(response.content, review)
            db.insert(obj)
        except Exception, e:
            print review, e

if __name__ == '__main__':
    main()
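To run this end to end you need Python 2 with requests, BeautifulSoup 3, and pymongo installed, plus a MongoDB instance listening on 127.0.0.1:27017. The first run crawls every listing page and caches the metadata in metaReviews.pickle; subsequent runs load the cache and go straight to fetching the individual reviews.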