Last active
June 28, 2017 14:22
-
-
Save fouadyared/3538e825770bc81dc21b3124bebe4205 to your computer and use it in GitHub Desktop.
Files for "Harmful ingredients and added sugars" Web Scraping Project (by Fouad Yared)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class FoodItem(scrapy.Item):
    # One scraped EWG Food Scores product record. Each attribute is a
    # scrapy.Field; the WriteItemPipeline exports every item as one CSV row.

    # Category names from the listing page's dropdown menu (page-level list,
    # passed through request meta by the spider).
    food_categories_text = scrapy.Field()
    # URL of the product's score image (src of the rating badge <img>).
    food_rating = scrapy.Field()
    # Product title from the product page heading.
    food_name = scrapy.Field()
    # Breadcrumb category / subcategory links on the product page.
    food_category_text = scrapy.Field()
    food_subcategory_text = scrapy.Field()
    # Commentary panels: comment text and the alt text of its +/- icon.
    all_comments = scrapy.Field()
    pos_neg_comment = scrapy.Field()
    # Ingredient / certification / allergen panel text.
    ingredients = scrapy.Field()
    certified_organic = scrapy.Field()
    allergens = scrapy.Field()
    # Serving size split into numeric amount and unit, plus calories.
    serving_amount = scrapy.Field()
    serving_unit = scrapy.Field()
    calories = scrapy.Field()
    # Nutrition-table values read positionally from the "update_on_ss_change"
    # spans (percent and gram figures for fat / carbs / sugar / protein).
    nutri_fat_perc = scrapy.Field()
    nutri_fat_num = scrapy.Field()
    nutri_carbs_perc = scrapy.Field()
    nutri_carbs_num = scrapy.Field()
    nutri_sugar_num = scrapy.Field()
    nutri_protein_perc = scrapy.Field()
    nutri_protein_num = scrapy.Field()
    nutri_perc_nutriName = scrapy.Field()
    nutri_addedSugars = scrapy.Field()
    # Per-nutrient amounts keyed by the row label in the nutrient table
    # (filled from a name -> value dict built in parse_each).
    Total_Fat = scrapy.Field()
    Total_Carbs = scrapy.Field()
    Sugars = scrapy.Field()
    Protein = scrapy.Field()
    Saturated_Fat = scrapy.Field()
    Cholesterol = scrapy.Field()
    Sodium = scrapy.Field()
    Added_Sugar_Ingredients = scrapy.Field()
    Dietary_Fiber = scrapy.Field()
    Vitamin_A = scrapy.Field()
    Vitamin_C = scrapy.Field()
    Calcium = scrapy.Field()
    Iron = scrapy.Field()
    Potassium = scrapy.Field()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Define here the models for your spider middleware | |
# | |
# See documentation in: | |
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html | |
from scrapy import signals | |
class FoodSpiderMiddleware(object):
    """Template-generated spider middleware; a pure pass-through.

    Not all methods need to be defined: if a method is not defined,
    Scrapy acts as if the spider middleware does not modify the
    passed objects.

    Fix: the original process_* methods were missing ``self``, so when
    Scrapy invoked them as bound methods every argument shifted by one
    (``response`` was bound as ``self``, etc.).
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create the middleware and subscribe
        # spider_opened to the matching crawler signal.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider; works like
        # process_spider_output() except there is no response.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.exporters import CsvItemExporter | |
class FoodPipeline(object):
    """No-op item pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform; pass the item to the next pipeline stage.
        return item
class WriteItemPipeline(object):
    """Item pipeline that streams every scraped item into one CSV file."""

    def __init__(self):
        # Output file name; the file itself is created when the spider opens.
        self.filename = 'foodOne.csv'

    def open_spider(self, spider):
        # Binary mode: CsvItemExporter writes encoded bytes to its stream.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Serialize the item as one CSV row, then pass it along unchanged.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Flush the exporter and release the file handle.
        self.exporter.finish_exporting()
        self.csvfile.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Scrapy settings for food project | |
# | |
# For simplicity, this file contains only settings considered important or | |
# commonly used. You can find more settings consulting the documentation: | |
# | |
# http://doc.scrapy.org/en/latest/topics/settings.html | |
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html | |
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html | |
# Scrapy settings for the food project. Only commonly tuned settings are
# listed; everything commented out keeps its Scrapy default. See:
#   http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'food'

SPIDER_MODULES = ['food.spiders']
NEWSPIDER_MODULE = 'food.spiders'

# Identify the crawler (and your site) in the User-Agent header.
#USER_AGENT = 'food (+http://www.yourdomain.com)'

# Honour robots.txt exclusion rules.
ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 32

# Per-site request delay (default: 0); see the download-delay docs and the
# AutoThrottle settings below.
#DOWNLOAD_DELAY = 3
# The download delay setting honours only one of these two:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are enabled by default.
#COOKIES_ENABLED = False

# The Telnet console is enabled by default.
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Spider middlewares
# (http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html)
#SPIDER_MIDDLEWARES = {
#    'food.middlewares.FoodSpiderMiddleware': 543,
#}

# Downloader middlewares
# (http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html)
#DOWNLOADER_MIDDLEWARES = {
#    'food.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Extensions (http://scrapy.readthedocs.org/en/latest/topics/extensions.html)
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines: route every scraped item through the CSV writer.
# (http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html)
#ITEM_PIPELINES = {
#    'food.pipelines.FoodPipeline': 300,
#}
ITEM_PIPELINES = {'food.pipelines.WriteItemPipeline': 200}

# AutoThrottle extension (disabled by default); see
# http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# Initial download delay.
#AUTOTHROTTLE_START_DELAY = 5
# Maximum download delay under high latency.
#AUTOTHROTTLE_MAX_DELAY = 60
# Average number of requests sent in parallel to each remote server.
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Show throttling stats for every response received.
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default); see the httpcache middleware settings.
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from food.items import FoodItem | |
import scrapy | |
class food_spider(scrapy.Spider):
    """Crawl EWG Food Scores category listings and yield one FoodItem
    per product page.

    Fixes vs. the original: removed an unused ``import itertools``,
    replaced two bare ``except:`` clauses with targeted handling, and
    hoisted xpath queries that were re-executed with identical
    expressions (4x in parse, 7x in parse_each).
    """

    name = 'food'
    allowed_urls = ['http://www.ewg.org/']
    start_urls = ['http://www.ewg.org/foodscores']

    def verify(self, content):
        """Normalize an xpath extraction result to an ASCII byte string.

        Lists yield their first element (or "" when empty); plain
        strings are encoded with non-ASCII characters dropped.
        """
        if isinstance(content, list):
            if not content:
                return ""
            return content[0].encode('ascii', 'ignore')
        if content is None:
            # NOTE(review): "test" looks like a debug placeholder left in;
            # kept as-is so existing CSV output is unchanged. TODO: confirm
            # whether "" was intended for missing values.
            return "test"
        return content.encode('ascii', 'ignore')

    def parse(self, response):
        """Build category page URLs and request each paginated listing."""
        # Not all category links live in the same place: four are
        # top-level dropdown entries, the rest are nested subcategories.
        top_links = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/@href').extract()
        baby_food_url = top_links[0]
        candy_food_url = top_links[4]
        eggs_url = top_links[11]
        tofu_meatAlt_url = top_links[22]
        subcategories_url = response.xpath('//div[@id="dropdown_menu"]/ul/li/ul/li/a/@href').extract()
        url_list = [baby_food_url, candy_food_url, eggs_url, tofu_meatAlt_url] + subcategories_url
        page_urls = ['http://www.ewg.org' + path for path in url_list]
        for item_url in page_urls:
            # Walk the paginated product listing; 100 pages is a generous
            # upper bound for any category.
            for page in range(1, 100):
                try:
                    url_to_parse = item_url + '&page={}&per_page=12&type=products'.format(page)
                    yield scrapy.Request(url_to_parse, callback=self.parse_top)
                except Exception:
                    # Bad URL or request construction failure: give up on
                    # the rest of this category's pages.
                    print("Doesn't work...")
                    break

    def parse_top(self, response):
        """Follow every product link found on one listing page."""
        food_categories_text = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/text()').extract()
        product_paths = response.xpath('//div[@class="ind_result_text fleft"]/a/@href').extract()
        for path in product_paths:
            yield scrapy.Request('http://www.ewg.org' + path,
                                 callback=self.parse_each,
                                 meta={'food_categories_text': food_categories_text})

    def parse_each(self, response):
        """Scrape a single product page into a FoodItem."""
        # Identification fields (variable set 1/3).
        food_categories_text = self.verify(response.meta['food_categories_text'])
        food_rating = self.verify(response.xpath('//div[@class="updated_score fleft"]/img/@src').extract_first())
        food_name = self.verify(response.xpath('//h1[@class="truncate_title_specific_product_page"]/text()').extract_first())
        food_category_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[2]/text()').extract_first())
        food_subcategory_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[3]/text()').extract_first())

        # Comment / ingredient / serving panels (variable set 2/3).
        all_comments = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/text()[1]').extract()))
        pos_neg_comment = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/img/@alt').extract()))
        ingredients = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][3]/p/text()[1]').extract())
        certified_organic = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first())
        # NOTE(review): same xpath as certified_organic -- possibly a
        # copy/paste slip; kept as-is to preserve existing output.
        allergens = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first()))
        serving_amount = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/span/text()').extract_first())
        serving_unit = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/text()[2]').extract_first())
        calories = self.verify(response.xpath('//th[@class="cal2 "]/div/text()').extract_first())

        # Nutrition table (variable set 3/3). Run the span query once and
        # index into it; a too-short table still raises IndexError for
        # positions 1-6, exactly as the original per-index extracts did,
        # while position 0 mirrors extract_first() (None when empty).
        nutri_vals = response.xpath('//table[@class="nutrient_table"]//span[@class="update_on_ss_change"]/text()').extract()
        nutri_fat_perc = self.verify(nutri_vals[0] if nutri_vals else None)
        nutri_fat_num = self.verify(nutri_vals[1])
        nutri_carbs_perc = self.verify(nutri_vals[2])
        nutri_carbs_num = self.verify(nutri_vals[3])
        nutri_sugar_num = self.verify(nutri_vals[4])
        nutri_protein_perc = self.verify(nutri_vals[5])
        nutri_protein_num = self.verify(nutri_vals[6])
        nutri_perc_nutriName = self.verify(str(response.xpath('//table[@class="nutrient_table"]//span[1]/text()').extract()))
        nutri_addedSugars = self.verify(str(response.xpath('//tr[@class="thick-end"]//td[@class]/text()').extract()))

        # Build a nutrient-name -> amount map from the table rows, skipping
        # the header row and any row without a usable name cell.
        dic = {}
        for row in response.xpath('//table[@class="nutrient_table"]//tr')[1:]:
            key = row.xpath("./td[2]/span/text()").extract_first()
            value = row.xpath("./td[1]/span/text()").extract_first()
            if key is None:
                continue
            dic[key.strip()] = value

        item = FoodItem()
        item['food_categories_text'] = food_categories_text
        item['food_rating'] = food_rating
        item['food_name'] = food_name
        item['food_category_text'] = food_category_text
        item['food_subcategory_text'] = food_subcategory_text
        item['all_comments'] = all_comments
        item['pos_neg_comment'] = pos_neg_comment
        item['ingredients'] = ingredients
        item['certified_organic'] = certified_organic
        item['allergens'] = allergens
        item['serving_amount'] = serving_amount
        item['serving_unit'] = serving_unit
        item['calories'] = calories
        item['nutri_fat_perc'] = nutri_fat_perc
        item['nutri_fat_num'] = nutri_fat_num
        item['nutri_carbs_perc'] = nutri_carbs_perc
        item['nutri_carbs_num'] = nutri_carbs_num
        item['nutri_sugar_num'] = nutri_sugar_num
        item['nutri_protein_perc'] = nutri_protein_perc
        item['nutri_protein_num'] = nutri_protein_num
        item['nutri_perc_nutriName'] = nutri_perc_nutriName
        item['nutri_addedSugars'] = nutri_addedSugars
        item['Total_Fat'] = dic.get('Total Fat', '')
        item['Total_Carbs'] = dic.get('Total Carbs', '')
        item['Sugars'] = dic.get('Sugars', '')
        item['Protein'] = dic.get('Protein', '')
        item['Saturated_Fat'] = dic.get('Saturated Fat', '')
        item['Cholesterol'] = dic.get('Cholesterol', '')
        item['Sodium'] = dic.get('Sodium', '')
        item['Added_Sugar_Ingredients'] = dic.get('Added Sugar Ingredients', '')
        item['Dietary_Fiber'] = dic.get('Dietary Fiber', '')
        item['Vitamin_A'] = dic.get('Vitamin A', '')
        item['Vitamin_C'] = dic.get('Vitamin C', '')
        item['Calcium'] = dic.get('Calcium', '')
        item['Iron'] = dic.get('Iron', '')
        item['Potassium'] = dic.get('Potassium', '')
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment