@kylegallatin
Created February 19, 2017 20:12
Python scraping for Beer Advocate: a Selenium script that collects user reviews for a single beer, and a Scrapy project that scrapes the beer listing tables by style.
### Selenium script for scraping user reviews ###
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

driver = webdriver.Chrome()
# open the beer profile page
driver.get('https://www.beeradvocate.com/beer/profile/29/1524/')
# create the csv and write the header row
csv_file = open('nattyLight.csv', 'wb')
writer = csv.writer(csv_file)
writer.writerow(['date', 'attributes', 'rDev', 'name'])
# query string appended to the profile url for pagination
url = '?view=beer&sort=&start='
# total number of ratings -- used to know when to stop paging
last = driver.find_element_by_xpath('//*[@id="item_stats"]/dl/dd[1]/span').text
last = int(last.replace(',', ''))
# beer name and brewery from the profile header
name = driver.find_element_by_xpath('//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1').text
index = 0
while index < last:
    try:
        print "Page " + str(index)
        driver.get('https://www.beeradvocate.com/beer/profile/29/1524/' + url + str(index))
        index = index + 25  # 25 reviews per page
        reviews = driver.find_elements_by_xpath('//div[@id="rating_fullview"]/div')
        for review in reviews:
            rDict = {}
            rDict['name'] = name
            rDict['rDev'] = review.find_element_by_xpath('.//span[3]').text
            rDict['attributes'] = review.find_element_by_xpath('.//span[@class="muted"]').text
            rDict['date'] = review.find_element_by_xpath('.//div//span[@class="muted"]/a[2]').text
            # write columns in the same order as the header row
            writer.writerow([rDict['date'], rDict['attributes'], rDict['rDev'], rDict['name']])
        time.sleep(2)  # pause between pages to avoid hammering the site
    except Exception as e:
        print e
        break
csv_file.close()
driver.close()
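A quick way to sanity-check the output is to read the CSV back and look at the first few rows. This is a minimal sketch using only the standard csv module; nattyLight.csv is the file written by the script above.

### Sanity check on the scraped reviews (sketch, standard library only) ###
import csv

with open('nattyLight.csv', 'rb') as f:
    rows = list(csv.reader(f))
print "Rows written (including header): " + str(len(rows))
print rows[:3]  # header plus the first two reviews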
### Scrapy project for scraping the beer tables on Beer Advocate by style ###
# spider
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector
from demo.items import DemoItem


class DemoSpider(Spider):
    name = 'demo_spider'
    allowed_domains = ['beeradvocate.com']
    # paginate through the style listing, 50 beers per page
    x = 'https://www.beeradvocate.com/beer/style/158/?sort=revsD&start='
    start_urls = [x + str(i) for i in range(0, 4519, 50)]

    def parse(self, response):
        # each row of the listing table is one beer
        table = response.xpath('//*[@id="ba-content"]/table/tr').extract()
        for i in table:
            Name = Selector(text=i).xpath('.//td[1]/a/b/text()').extract()
            Brewery = Selector(text=i).xpath('.//td[2]/a/text()').extract()
            ABV = Selector(text=i).xpath('.//td[3]/span/text()').extract()
            Avg = Selector(text=i).xpath('.//td[4]/b/text()').extract()
            Ratings = Selector(text=i).xpath('.//td[5]/b/text()').extract()
            Bros = Selector(text=i).xpath('.//td[6]/a/b/text()').extract()
            item = DemoItem()
            item['Name'] = Name
            item['Brewery'] = Brewery
            item['ABV'] = ABV
            item['Avg'] = Avg
            item['Ratings'] = Ratings
            item['Bros'] = Bros
            yield item
# pipeline
# -*- coding: utf-8 -*-
from scrapy.exporters import CsvItemExporter


class BeerPipeline(object):
    def __init__(self):
        self.filename = 'beer.csv'

    def open_spider(self, spider):
        # open the output file and start the CSV exporter when the spider starts
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # flush the exporter and close the file when the spider finishes
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        # write each scraped item as one CSV row
        self.exporter.export_item(item)
        return item
# items
# -*- coding: utf-8 -*-
from scrapy import Item, Field


class DemoItem(Item):
    Style = Field()
    Name = Field()
    Brewery = Field()
    ABV = Field()
    Avg = Field()
    Ratings = Field()
    Bros = Field()
# settings
# -*- coding: utf-8 -*-
BOT_NAME = 'demo'
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'
DOWNLOAD_DELAY = 3  # delay (seconds) between requests to avoid getting blocked
ITEM_PIPELINES = {'demo.pipelines.BeerPipeline': 100, }
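Assuming the standard Scrapy project layout implied by the settings above (a project named demo with the spider under demo/spiders/), the crawl is normally started from the project root with scrapy crawl demo_spider, and BeerPipeline writes beer.csv in the working directory. Below is a programmatic alternative; the import path demo.spiders.demo_spider is an assumption, since the gist does not name the spider module file.

### Running the crawl programmatically (sketch; scrapy crawl demo_spider also works) ###
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from demo.spiders.demo_spider import DemoSpider  # assumed module path for the spider above

process = CrawlerProcess(get_project_settings())  # picks up DOWNLOAD_DELAY and ITEM_PIPELINES
process.crawl(DemoSpider)
process.start()  # blocks until the crawl finishes; BeerPipeline writes beer.csv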