-
-
Save kylegallatin/77646b75ed34c43c5dca6e9e64c81a11 to your computer and use it in GitHub Desktop.
Python scraping for beer advocate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Selenium script for scraping user reviews ###
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
import time | |
import csv | |
# Scrape every user review of one BeerAdvocate beer into nattyLight.csv.
#
# The beer's profile page is loaded once to read the total review count and
# the beer name; the review listing is then walked page by page by appending
# a `start` offset to the profile URL.  Written for Python 3.

# Profile page of the beer being scraped.
BASE_URL = 'https://www.beeradvocate.com/beer/profile/29/1524/'
# Offset step between consecutive review pages.
PAGE_SIZE = 25
# Column order shared by the header row and every data row.
CSV_COLUMNS = ['date', 'attributes', 'rDev', 'name']

driver = webdriver.Chrome()
try:
    #open up the page
    driver.get(BASE_URL)

    #create the csv
    # Text mode with newline='' is what the csv module expects on Python 3;
    # the context manager closes the file on success and on failure alike.
    with open('nattyLight.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(CSV_COLUMNS)

        #to append to the original url
        url = '?view=beer&sort=&start='

        #find the number of reviews/last page
        last = driver.find_element(
            By.XPATH, '//*[@id="item_stats"]/dl/dd[1]/span').text
        # The count is displayed with thousands separators, e.g. "1,234".
        last = int(last.replace(',', ''))

        #name and brewery
        name = driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1').text

        index = 0
        while index < last:
            try:
                print("Page" + str(index))
                driver.get(BASE_URL + url + str(index))
                index = index + PAGE_SIZE
                reviews = driver.find_elements(
                    By.XPATH, '//div[@id="rating_fullview"]/div')
                for review in reviews:
                    rDict = {
                        'name': name,
                        'rDev': review.find_element(
                            By.XPATH, './/span[3]').text,
                        'attributes': review.find_element(
                            By.XPATH, './/span[@class="muted"]').text,
                        'date': review.find_element(
                            By.XPATH,
                            './/div//span[@class="muted"]/a[2]').text,
                    }
                    # Emit the values in header order; relying on
                    # dict.values() put them under the wrong column names.
                    writer.writerow([rDict[column] for column in CSV_COLUMNS])
                # Pause between page loads.
                time.sleep(2)
            except Exception as e:
                # Best-effort scrape: report the failure and stop paging,
                # keeping whatever rows were already written.
                print(e)
                break
finally:
    # Always shut the browser down, not only when a page fails.
    driver.quit()
### Scrapy project for scraping BeerAdvocate beer tables by style ###
#spider
# -*- coding: utf-8 -*-
from scrapy import Spider | |
from scrapy.selector import Selector | |
from demo.items import DemoItem | |
def _offset_urls(prefix, stop, step):
    """Return ``prefix`` followed by each offset in ``range(0, stop, step)``."""
    return [prefix + str(offset) for offset in range(0, stop, step)]


class DemoSpider(Spider):
    """Scrape the beer listing table of BeerAdvocate style 158.

    Every listing page is requested up front via ``start_urls`` (offsets
    0, 50, ... below 4519), and ``parse`` yields one ``DemoItem`` per table
    row found on a page.
    """

    name = 'demo_spider'
    # Scrapy's offsite filtering reads ``allowed_domains``; the original
    # ``allowed_urls`` name is not consulted by Scrapy, so it is kept only
    # as an alias for any code that still refers to it.
    allowed_domains = ['beeradvocate.com']
    allowed_urls = allowed_domains
    x = 'https://www.beeradvocate.com/beer/style/158/?sort=revsD&start='
    # Built through a helper call: a list comprehension written directly in
    # the class body cannot see the class-level name ``x`` on Python 3.
    start_urls = _offset_urls(x, 4519, 50)

    def parse(self, response):
        """Yield a ``DemoItem`` for every ``<tr>`` of the listing table.

        Each field holds the list returned by ``extract()``; rows without
        the expected cells produce items whose fields are empty lists.
        """
        # Query each row selector directly instead of serialising the row
        # to HTML and re-parsing it with a fresh Selector.
        for row in response.xpath('//*[@id="ba-content"]/table/tr'):
            item = DemoItem()
            item['Name'] = row.xpath('.//td[1]/a/b/text()').extract()
            item['Brewery'] = row.xpath('.//td[2]/a/text()').extract()
            item['ABV'] = row.xpath('.//td[3]/span/text()').extract()
            item['Avg'] = row.xpath('.//td[4]/b/text()').extract()
            item['Ratings'] = row.xpath('.//td[5]/b/text()').extract()
            item['Bros'] = row.xpath('.//td[6]/a/b/text()').extract()
            yield item
#pipeline
# -*- coding: utf-8 -*-
from scrapy.exporters import CsvItemExporter | |
class BeerPipeline(object):
    """Item pipeline that writes every item it receives to ``beer.csv``."""

    def __init__(self):
        # Path of the CSV file the exporter writes to.
        self.filename = 'beer.csv'

    def open_spider(self, spider):
        """Open the output file and start the CSV export."""
        # Opened in binary mode and handed straight to the exporter.
        self.csvfile = output = open(self.filename, 'wb')
        self.exporter = exporter = CsvItemExporter(output)
        exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export, then close the output file."""
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        """Export ``item`` and return it unchanged."""
        exporter = self.exporter
        exporter.export_item(item)
        return item
#items
# -*- coding: utf-8 -*-
from scrapy import Item, Field | |
class DemoItem(Item):
    """Container for one row scraped from a beer style listing table."""

    # Declared, but DemoSpider.parse in this file never assigns it.
    Style = Field()

    # Filled by DemoSpider.parse from table cells 1 through 6, in this order.
    Name = Field()
    Brewery = Field()
    ABV = Field()
    Avg = Field()
    Ratings = Field()
    Bros = Field()
#settings
# -*- coding: utf-8 -*-
# Scrapy settings for the 'demo' project.
BOT_NAME = 'demo'

# Spiders are looked up in, and new ones are created in, the same package.
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'

#delay before changing URLs to avoid error
DOWNLOAD_DELAY = 3

# Enable the CSV-writing pipeline.
ITEM_PIPELINES = {
    'demo.pipelines.BeerPipeline': 100,
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment