Files for "Harmful ingredients and added sugars" Web Scraping Project (by Fouad Yared)
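The files below follow the standard Scrapy project layout: items.py (field definitions), middlewares.py (the unmodified Scrapy template), pipelines.py (CSV export), settings.py, and the spider module. Assuming that layout, the crawl is run from the project root with:

scrapy crawl food

The WriteItemPipeline enabled in settings.py then writes each scraped item to foodOne.csv.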
# items.py
import scrapy


class FoodItem(scrapy.Item):
    food_categories_text = scrapy.Field()
    food_rating = scrapy.Field()
    food_name = scrapy.Field()
    food_category_text = scrapy.Field()
    food_subcategory_text = scrapy.Field()
    all_comments = scrapy.Field()
    pos_neg_comment = scrapy.Field()
    ingredients = scrapy.Field()
    certified_organic = scrapy.Field()
    allergens = scrapy.Field()
    serving_amount = scrapy.Field()
    serving_unit = scrapy.Field()
    calories = scrapy.Field()
    nutri_fat_perc = scrapy.Field()
    nutri_fat_num = scrapy.Field()
    nutri_carbs_perc = scrapy.Field()
    nutri_carbs_num = scrapy.Field()
    nutri_sugar_num = scrapy.Field()
    nutri_protein_perc = scrapy.Field()
    nutri_protein_num = scrapy.Field()
    nutri_perc_nutriName = scrapy.Field()
    nutri_addedSugars = scrapy.Field()
    Total_Fat = scrapy.Field()
    Total_Carbs = scrapy.Field()
    Sugars = scrapy.Field()
    Protein = scrapy.Field()
    Saturated_Fat = scrapy.Field()
    Cholesterol = scrapy.Field()
    Sodium = scrapy.Field()
    Added_Sugar_Ingredients = scrapy.Field()
    Dietary_Fiber = scrapy.Field()
    Vitamin_A = scrapy.Field()
    Vitamin_C = scrapy.Field()
    Calcium = scrapy.Field()
    Iron = scrapy.Field()
    Potassium = scrapy.Field()
# -*- coding: utf-8 -*-
# middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FoodSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# pipelines.py
from scrapy.exporters import CsvItemExporter


class FoodPipeline(object):
    def process_item(self, item, spider):
        return item


class WriteItemPipeline(object):
    # Exports every scraped item to a CSV file via Scrapy's CsvItemExporter.
    def __init__(self):
        self.filename = 'foodOne.csv'

    def open_spider(self, spider):
        # CsvItemExporter expects a binary file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
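CsvItemExporter takes its header row and column order from the first item it exports unless fields_to_export is set. A minimal sketch of pinning the column order via that option, assuming the FoodItem fields defined above (the field subset here is illustrative):

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        # fields_to_export fixes the CSV columns and their order.
        self.exporter = CsvItemExporter(
            self.csvfile,
            fields_to_export=['food_name', 'food_rating', 'calories', 'Sugars'])
        self.exporter.start_exporting()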
# -*- coding: utf-8 -*-
# settings.py

# Scrapy settings for food project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'food'
SPIDER_MODULES = ['food.spiders']
NEWSPIDER_MODULE = 'food.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'food (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'food.middlewares.FoodSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'food.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'food.pipelines.FoodPipeline': 300,
#}
ITEM_PIPELINES = {'food.pipelines.WriteItemPipeline': 200}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
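For a one-off export, the custom pipeline is not strictly needed: Scrapy's built-in feed exports can write the items straight to CSV from the command line (the output filename is arbitrary):

scrapy crawl food -o foodOne.csv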
# spiders/food_spider.py (standard Scrapy layout; the gist does not give the filename)
import scrapy

from food.items import FoodItem


class food_spider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['ewg.org']
    start_urls = ['http://www.ewg.org/foodscores']

    def verify(self, content):
        # Normalizes an extracted value (list, None, or unicode string)
        # to a plain ascii str, with "" for anything missing.
        if isinstance(content, list):
            if len(content) > 0:
                content = content[0]
                # convert unicode to str
                return content.encode('ascii', 'ignore')
            else:
                return ""
        elif content is None:
            return ""
        else:
            return content.encode('ascii', 'ignore')

    def parse(self, response):
        # Not all of the category urls are in the same place: baby food,
        # candy, eggs, and tofu/meat alternatives are top-level menu links,
        # while every other category is a nested subcategory link. This
        # joins those four urls with all the others.
        menu_urls = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/@href').extract()
        baby_food_url = menu_urls[0]
        candy_food_url = menu_urls[4]
        eggs_url = menu_urls[11]
        tofu_meatAlt_url = menu_urls[22]
        subcategories_url = response.xpath('//div[@id="dropdown_menu"]/ul/li/ul/li/a/@href').extract()
        url_list = [baby_food_url, candy_food_url, eggs_url, tofu_meatAlt_url] + subcategories_url
        pageurl = ['http://www.ewg.org' + l for l in url_list]
        # Walk up to 99 listing pages per category; later pages with no
        # product links simply yield nothing from parse_top.
        for item_url in pageurl:
            for i in range(1, 100):
                url_to_parse = item_url + '&page={}&per_page=12&type=products'.format(i)
                yield scrapy.Request(url_to_parse, callback=self.parse_top)

    def parse_top(self, response):
        food_categories_text = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/text()').extract()
        food_id = response.xpath('//div[@class="ind_result_text fleft"]/a/@href').extract()
        food_links = ['http://www.ewg.org' + fid for fid in food_id]
        for link in food_links:
            yield scrapy.Request(link, callback=self.parse_each,
                                 meta={'food_categories_text': food_categories_text})

    def parse_each(self, response):
        # variable set 1/3
        food_categories_text = self.verify(response.meta['food_categories_text'])
        food_rating = self.verify(response.xpath('//div[@class="updated_score fleft"]/img/@src').extract_first())
        food_name = self.verify(response.xpath('//h1[@class="truncate_title_specific_product_page"]/text()').extract_first())
        food_category_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[2]/text()').extract_first())
        food_subcategory_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[3]/text()').extract_first())

        # variable set 2/3
        all_comments = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/text()[1]').extract()))
        pos_neg_comment = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/img/@alt').extract()))
        ingredients = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][3]/p/text()[1]').extract())
        certified_organic = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first())
        allergens = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first()))
        serving_amount = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/span/text()').extract_first())
        serving_unit = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/text()[2]').extract_first())
        calories = self.verify(response.xpath('//th[@class="cal2 "]/div/text()').extract_first())

        # variable set 3/3: the nutrient gauges appear in document order
        # (fat %, fat amount, carbs %, carbs amount, sugars amount,
        # protein %, protein amount), so they are pulled from a single
        # extract() by position. Slices are passed instead of indexes so
        # that verify() returns "" rather than raising IndexError when a
        # gauge is missing.
        nutri_vals = response.xpath('//table[@class="nutrient_table"]//span[@class="update_on_ss_change"]/text()').extract()
        nutri_fat_perc = self.verify(nutri_vals[0:1])
        nutri_fat_num = self.verify(nutri_vals[1:2])
        nutri_carbs_perc = self.verify(nutri_vals[2:3])
        nutri_carbs_num = self.verify(nutri_vals[3:4])
        nutri_sugar_num = self.verify(nutri_vals[4:5])
        nutri_protein_perc = self.verify(nutri_vals[5:6])
        nutri_protein_num = self.verify(nutri_vals[6:7])
        nutri_perc_nutriName = self.verify(str(response.xpath('//table[@class="nutrient_table"]//span[1]/text()').extract()))
        nutri_addedSugars = self.verify(str(response.xpath('//tr[@class="thick-end"]//td[@class]/text()').extract()))

        # Build a nutrient-name -> value mapping from the nutrition-facts
        # table (the name is in the second cell, the value in the first).
        rows = response.xpath('//table[@class="nutrient_table"]//tr')
        dic = {}
        for row in rows[1:]:
            key = row.xpath('./td[2]/span/text()').extract_first()
            value = row.xpath('./td[1]/span/text()').extract_first()
            if key is not None:
                dic[key.strip()] = value

        item = FoodItem()
        item['food_categories_text'] = food_categories_text
        item['food_rating'] = food_rating
        item['food_name'] = food_name
        item['food_category_text'] = food_category_text
        item['food_subcategory_text'] = food_subcategory_text
        item['all_comments'] = all_comments
        item['pos_neg_comment'] = pos_neg_comment
        item['ingredients'] = ingredients
        item['certified_organic'] = certified_organic
        item['allergens'] = allergens
        item['serving_amount'] = serving_amount
        item['serving_unit'] = serving_unit
        item['calories'] = calories
        item['nutri_fat_perc'] = nutri_fat_perc
        item['nutri_fat_num'] = nutri_fat_num
        item['nutri_carbs_perc'] = nutri_carbs_perc
        item['nutri_carbs_num'] = nutri_carbs_num
        item['nutri_sugar_num'] = nutri_sugar_num
        item['nutri_protein_perc'] = nutri_protein_perc
        item['nutri_protein_num'] = nutri_protein_num
        item['nutri_perc_nutriName'] = nutri_perc_nutriName
        item['nutri_addedSugars'] = nutri_addedSugars
        item['Total_Fat'] = dic.get('Total Fat', '')
        item['Total_Carbs'] = dic.get('Total Carbs', '')
        item['Sugars'] = dic.get('Sugars', '')
        item['Protein'] = dic.get('Protein', '')
        item['Saturated_Fat'] = dic.get('Saturated Fat', '')
        item['Cholesterol'] = dic.get('Cholesterol', '')
        item['Sodium'] = dic.get('Sodium', '')
        item['Added_Sugar_Ingredients'] = dic.get('Added Sugar Ingredients', '')
        item['Dietary_Fiber'] = dic.get('Dietary Fiber', '')
        item['Vitamin_A'] = dic.get('Vitamin A', '')
        item['Vitamin_C'] = dic.get('Vitamin C', '')
        item['Calcium'] = dic.get('Calcium', '')
        item['Iron'] = dic.get('Iron', '')
        item['Potassium'] = dic.get('Potassium', '')
        yield item
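Once the crawl finishes, a quick sanity check of the output is worthwhile, since verify() turns missing values into empty strings. A minimal sketch, assuming pandas is installed (foodOne.csv and the column names come from the pipeline and items above):

import pandas as pd

# Load the CSV produced by WriteItemPipeline.
df = pd.read_csv('foodOne.csv')

# How many products were scraped, and how complete are key columns?
print(len(df))
print(df[['food_name', 'calories', 'Sugars', 'Added_Sugar_Ingredients']].isnull().sum())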