# kylegallatin/Scraping.py Secret

## Scraping.py
### Selenium file for scraping user reviews ###

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

# Scrape every user review of one Beeradvocate beer profile into
# nattyLight.csv, one row per review, in the column order of the header row.
import sys

driver = webdriver.Chrome()
try:
    #open up the page
    driver.get('https://www.beeradvocate.com/beer/profile/29/1524/')

    #create the csv; the csv module wants a binary file on Python 2 and a
    #text file opened with newline='' on Python 3
    if sys.version_info[0] >= 3:
        csv_file = open('nattyLight.csv', 'w', newline='', encoding='utf-8')
    else:
        csv_file = open('nattyLight.csv', 'wb')

    # The with-block closes the file on normal completion as well as on error.
    with csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['date','attributes', 'rDev', 'name'])

        #to append to the original url
        url = '?view=beer&sort=&start='

        #find the number of reviews/last page
        last = driver.find_element(By.XPATH, '//*[@id="item_stats"]/dl/dd[1]/span').text
        last = int(last.replace(',', ''))

        #name and brewery
        name = driver.find_element(By.XPATH, '//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1').text

        index = 0
        while index < last:
            try:
                print("Page" + str(index))
                driver.get('https://www.beeradvocate.com/beer/profile/29/1524/' + url + str(index))
                # The start offset advances by 25 per request (presumably the
                # site's page size -- confirm against the live listing).
                index = index + 25
                reviews = driver.find_elements(By.XPATH, '//div[@id="rating_fullview"]/div')
                for review in reviews:
                    rDev = review.find_element(By.XPATH, './/span[3]').text
                    attributes = review.find_element(By.XPATH, './/span[@class="muted"]').text
                    date = review.find_element(By.XPATH, './/div//span[@class="muted"]/a[2]').text

                    # Write the cells explicitly in header order; relying on
                    # dict.values() does not guarantee that order.
                    writer.writerow([date, attributes, rDev, name])
                # Pause two seconds between page loads.
                time.sleep(2)
            except Exception as e:
                # Any failure on a page ends the scrape; rows written so far
                # are kept.
                print(e)
                break
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()


### Scrapy for tables on beer on Beeradvocate by style ###

#spider

# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector
from demo.items import DemoItem

class DemoSpider(Spider):
    """Spider for the Beeradvocate style-158 beer listing tables.

    Requests the listing at start offsets 0, 50, ..., 4500 and yields one
    DemoItem per table row found under the element with id "ba-content".
    """

    name = 'demo_spider'
    # Scrapy reads the domain whitelist from ``allowed_domains``; the
    # original ``allowed_urls`` name is kept as an alias for any code that
    # still references it.
    allowed_domains = ['beeradvocate.com']
    allowed_urls = allowed_domains
    x = 'https://www.beeradvocate.com/beer/style/158/?sort=revsD&start='
    # Built with a plain loop: a list comprehension in a class body cannot
    # see the class-level name ``x`` on Python 3.
    start_urls = []
    for _start in range(0, 4519, 50):
        start_urls.append(x + str(_start))
    del _start

    def parse(self, response):
        """Yield a DemoItem for every row of the listing table.

        Each field holds the list returned by ``extract()`` for that
        column's XPath, so a row without a matching cell gets an empty list.
        """
        rows = response.xpath('//*[@id="ba-content"]/table/tr').extract()

        for row_html in rows:
            # Parse the row's HTML once and reuse it for all six columns.
            row = Selector(text=row_html)

            item = DemoItem()
            item['Name'] = row.xpath('.//td[1]/a/b/text()').extract()
            item['Brewery'] = row.xpath('.//td[2]/a/text()').extract()
            item['ABV'] = row.xpath('.//td[3]/span/text()').extract()
            item['Avg'] = row.xpath('.//td[4]/b/text()').extract()
            item['Ratings'] = row.xpath('.//td[5]/b/text()').extract()
            item['Bros'] = row.xpath('.//td[6]/a/b/text()').extract()

            yield item

#pipeline

# -*- coding: utf-8 -*-
from scrapy.exporters import CsvItemExporter

class BeerPipeline(object):
    """Item pipeline that exports every scraped item to a CSV file.

    The file is opened when the spider starts and closed when it finishes.
    """

    def __init__(self, filename='beer.csv'):
        """Remember the output path.

        Args:
            filename: Path of the CSV file to write; defaults to the
                original hard-coded 'beer.csv'.
        """
        self.filename = filename

    def open_spider(self, spider):
        """Open the output file and start the CSV exporter."""
        # CsvItemExporter is handed a file opened in binary mode.
        self.csvfile = open(self.filename, 'wb')
        try:
            self.exporter = CsvItemExporter(self.csvfile)
            self.exporter.start_exporting()
        except Exception:
            # Do not leave the file handle open if exporter setup fails.
            self.csvfile.close()
            raise

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even when finishing the export raises.
            self.csvfile.close()

    def process_item(self, item, spider):
        """Export one item and return it unchanged."""
        self.exporter.export_item(item)
        return item

#items

# -*- coding: utf-8 -*-
from scrapy import Item, Field

class DemoItem(Item):
    """Container for one row of the beer listing table."""

    # Declared here but not assigned by DemoSpider.parse in this file.
    Style = Field()
    # Columns 1-6 of a table row, in the order DemoSpider.parse reads them.
    Name = Field()
    Brewery = Field()
    ABV = Field()
    Avg = Field()
    Ratings = Field()
    Bros = Field()

#settings

# -*- coding: utf-8 -*-
# Scrapy project settings for the "demo" bot.
BOT_NAME = 'demo'

# New spiders are generated into the same package that spiders are loaded from.
NEWSPIDER_MODULE = 'demo.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Delay before changing URLs, kept to avoid errors from the site.
DOWNLOAD_DELAY = 3

# Run every scraped item through the CSV-writing pipeline.
ITEM_PIPELINES = {
    'demo.pipelines.BeerPipeline': 100,
}
	### Selenium file for scraping user reviews ###

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	import time
	import csv

	# Scrape every user review of one Beeradvocate beer profile into
	# nattyLight.csv, one row per review, in the column order of the header row.
	import sys

	driver = webdriver.Chrome()
	try:
		#open up the page
		driver.get('https://www.beeradvocate.com/beer/profile/29/1524/')

		#create the csv; the csv module wants a binary file on Python 2 and a
		#text file opened with newline='' on Python 3
		if sys.version_info[0] >= 3:
			csv_file = open('nattyLight.csv', 'w', newline='', encoding='utf-8')
		else:
			csv_file = open('nattyLight.csv', 'wb')

		# The with-block closes the file on normal completion as well as on error.
		with csv_file:
			writer = csv.writer(csv_file)
			writer.writerow(['date','attributes', 'rDev', 'name'])

			#to append to the original url
			url = '?view=beer&sort=&start='

			#find the number of reviews/last page
			last = driver.find_element(By.XPATH, '//*[@id="item_stats"]/dl/dd[1]/span').text
			last = int(last.replace(',', ''))

			#name and brewery
			name = driver.find_element(By.XPATH, '//*[@id="content"]/div/div/div[3]/div/div/div[1]/h1').text

			index = 0
			while index < last:
				try:
					print("Page" + str(index))
					driver.get('https://www.beeradvocate.com/beer/profile/29/1524/' + url + str(index))
					# The start offset advances by 25 per request (presumably
					# the site's page size -- confirm against the live listing).
					index = index + 25
					reviews = driver.find_elements(By.XPATH, '//div[@id="rating_fullview"]/div')
					for review in reviews:
						rDev = review.find_element(By.XPATH, './/span[3]').text
						attributes = review.find_element(By.XPATH, './/span[@class="muted"]').text
						date = review.find_element(By.XPATH, './/div//span[@class="muted"]/a[2]').text

						# Write the cells explicitly in header order; relying
						# on dict.values() does not guarantee that order.
						writer.writerow([date, attributes, rDev, name])
					# Pause two seconds between page loads.
					time.sleep(2)
				except Exception as e:
					# Any failure on a page ends the scrape; rows written so
					# far are kept.
					print(e)
					break
	finally:
		# quit() ends the whole WebDriver session, not just the current window.
		driver.quit()


	### Scrapy for tables on beer on Beeradvocate by style ###

	#spider

	# -*- coding: utf-8 -*-
	from scrapy import Spider
	from scrapy.selector import Selector
	from demo.items import DemoItem

	class DemoSpider(Spider):
		"""Spider for the Beeradvocate style-158 beer listing tables.

		Requests the listing at start offsets 0, 50, ..., 4500 and yields one
		DemoItem per table row found under the element with id "ba-content".
		"""

		name = 'demo_spider'
		# Scrapy reads the domain whitelist from ``allowed_domains``; the
		# original ``allowed_urls`` name is kept as an alias for any code that
		# still references it.
		allowed_domains = ['beeradvocate.com']
		allowed_urls = allowed_domains
		x = 'https://www.beeradvocate.com/beer/style/158/?sort=revsD&start='
		# Built with a plain loop: a list comprehension in a class body cannot
		# see the class-level name ``x`` on Python 3.
		start_urls = []
		for _start in range(0, 4519, 50):
			start_urls.append(x + str(_start))
		del _start

		def parse(self, response):
			"""Yield a DemoItem for every row of the listing table.

			Each field holds the list returned by ``extract()`` for that
			column's XPath, so a row without a matching cell gets an empty list.
			"""
			rows = response.xpath('//*[@id="ba-content"]/table/tr').extract()

			for row_html in rows:
				# Parse the row's HTML once and reuse it for all six columns.
				row = Selector(text=row_html)

				item = DemoItem()
				item['Name'] = row.xpath('.//td[1]/a/b/text()').extract()
				item['Brewery'] = row.xpath('.//td[2]/a/text()').extract()
				item['ABV'] = row.xpath('.//td[3]/span/text()').extract()
				item['Avg'] = row.xpath('.//td[4]/b/text()').extract()
				item['Ratings'] = row.xpath('.//td[5]/b/text()').extract()
				item['Bros'] = row.xpath('.//td[6]/a/b/text()').extract()

				yield item

	#pipeline

	# -*- coding: utf-8 -*-
	from scrapy.exporters import CsvItemExporter

	class BeerPipeline(object):
		"""Item pipeline that exports every scraped item to a CSV file.

		The file is opened when the spider starts and closed when it finishes.
		"""

		def __init__(self, filename='beer.csv'):
			"""Remember the output path.

			Args:
				filename: Path of the CSV file to write; defaults to the
					original hard-coded 'beer.csv'.
			"""
			self.filename = filename

		def open_spider(self, spider):
			"""Open the output file and start the CSV exporter."""
			# CsvItemExporter is handed a file opened in binary mode.
			self.csvfile = open(self.filename, 'wb')
			try:
				self.exporter = CsvItemExporter(self.csvfile)
				self.exporter.start_exporting()
			except Exception:
				# Do not leave the file handle open if exporter setup fails.
				self.csvfile.close()
				raise

		def close_spider(self, spider):
			"""Finish the export and close the output file."""
			try:
				self.exporter.finish_exporting()
			finally:
				# Close the file even when finishing the export raises.
				self.csvfile.close()

		def process_item(self, item, spider):
			"""Export one item and return it unchanged."""
			self.exporter.export_item(item)
			return item

	#items

	# -*- coding: utf-8 -*-
	from scrapy import Item, Field

	class DemoItem(Item):
		"""Container for one row of the beer listing table."""

		# Declared here but not assigned by DemoSpider.parse in this file.
		Style = Field()
		# Columns 1-6 of a table row, in the order DemoSpider.parse reads them.
		Name = Field()
		Brewery = Field()
		ABV = Field()
		Avg = Field()
		Ratings = Field()
		Bros = Field()

	#settings

	# -*- coding: utf-8 -*-
	# Scrapy project settings for the "demo" bot.
	BOT_NAME = 'demo'

	# New spiders are generated into the same package that spiders are loaded from.
	NEWSPIDER_MODULE = 'demo.spiders'
	SPIDER_MODULES = [NEWSPIDER_MODULE]

	# Delay before changing URLs, kept to avoid errors from the site.
	DOWNLOAD_DELAY = 3

	# Run every scraped item through the CSV-writing pipeline.
	ITEM_PIPELINES = {
		'demo.pipelines.BeerPipeline': 100,
	}