Files for "Harmful ingredients and added sugars" Web Scraping Project (by Fouad Yared)
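The files below follow the standard Scrapy project layout: items.py (field definitions), middlewares.py (the unmodified Scrapy template), pipelines.py (CSV export), settings.py, and the spider module. Assuming that layout, the crawl is run from the project root with:

scrapy crawl food

The WriteItemPipeline enabled in settings.py then writes each scraped item to foodOne.csv.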
# items.py
import scrapy


class FoodItem(scrapy.Item):
    food_categories_text = scrapy.Field()
    food_rating = scrapy.Field()
    food_name = scrapy.Field()
    food_category_text = scrapy.Field()
    food_subcategory_text = scrapy.Field()
    all_comments = scrapy.Field()
    pos_neg_comment = scrapy.Field()
    ingredients = scrapy.Field()
    certified_organic = scrapy.Field()
    allergens = scrapy.Field()
    serving_amount = scrapy.Field()
    serving_unit = scrapy.Field()
    calories = scrapy.Field()
    nutri_fat_perc = scrapy.Field()
    nutri_fat_num = scrapy.Field()
    nutri_carbs_perc = scrapy.Field()
    nutri_carbs_num = scrapy.Field()
    nutri_sugar_num = scrapy.Field()
    nutri_protein_perc = scrapy.Field()
    nutri_protein_num = scrapy.Field()
    nutri_perc_nutriName = scrapy.Field()
    nutri_addedSugars = scrapy.Field()
    Total_Fat = scrapy.Field()
    Total_Carbs = scrapy.Field()
    Sugars = scrapy.Field()
    Protein = scrapy.Field()
    Saturated_Fat = scrapy.Field()
    Cholesterol = scrapy.Field()
    Sodium = scrapy.Field()
    Added_Sugar_Ingredients = scrapy.Field()
    Dietary_Fiber = scrapy.Field()
    Vitamin_A = scrapy.Field()
    Vitamin_C = scrapy.Field()
    Calcium = scrapy.Field()
    Iron = scrapy.Field()
    Potassium = scrapy.Field()
# -*- coding: utf-8 -*-
# middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class FoodSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# pipelines.py
from scrapy.exporters import CsvItemExporter


class FoodPipeline(object):
    def process_item(self, item, spider):
        return item


class WriteItemPipeline(object):
    # Exports every scraped item to a CSV file via Scrapy's CsvItemExporter.
    def __init__(self):
        self.filename = 'foodOne.csv'

    def open_spider(self, spider):
        # CsvItemExporter expects a binary file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
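CsvItemExporter takes its header row and column order from the first item it exports unless fields_to_export is set. A minimal sketch of pinning the column order via that option, assuming the FoodItem fields defined above (the field subset here is illustrative):

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        # fields_to_export fixes the CSV columns and their order.
        self.exporter = CsvItemExporter(
            self.csvfile,
            fields_to_export=['food_name', 'food_rating', 'calories', 'Sugars'])
        self.exporter.start_exporting()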
# -*- coding: utf-8 -*-
# settings.py

# Scrapy settings for food project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'food'
SPIDER_MODULES = ['food.spiders']
NEWSPIDER_MODULE = 'food.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'food (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'food.middlewares.FoodSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'food.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'food.pipelines.FoodPipeline': 300,
#}
ITEM_PIPELINES = {'food.pipelines.WriteItemPipeline': 200}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
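For a one-off export, the custom pipeline is not strictly needed: Scrapy's built-in feed exports can write the items straight to CSV from the command line (the output filename is arbitrary):

scrapy crawl food -o foodOne.csv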
# spiders/food_spider.py (standard Scrapy layout; the gist does not give the filename)
import scrapy

from food.items import FoodItem


class food_spider(scrapy.Spider):
    name = 'food'
    allowed_domains = ['ewg.org']
    start_urls = ['http://www.ewg.org/foodscores']

    def verify(self, content):
        # Normalizes an extracted value (list, None, or unicode string)
        # to a plain ascii str, with "" for anything missing.
        if isinstance(content, list):
            if len(content) > 0:
                content = content[0]
                # convert unicode to str
                return content.encode('ascii', 'ignore')
            else:
                return ""
        elif content is None:
            return ""
        else:
            return content.encode('ascii', 'ignore')

    def parse(self, response):
        # Not all of the category urls are in the same place: baby food,
        # candy, eggs, and tofu/meat alternatives are top-level menu links,
        # while every other category is a nested subcategory link. This
        # joins those four urls with all the others.
        menu_urls = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/@href').extract()
        baby_food_url = menu_urls[0]
        candy_food_url = menu_urls[4]
        eggs_url = menu_urls[11]
        tofu_meatAlt_url = menu_urls[22]
        subcategories_url = response.xpath('//div[@id="dropdown_menu"]/ul/li/ul/li/a/@href').extract()
        url_list = [baby_food_url, candy_food_url, eggs_url, tofu_meatAlt_url] + subcategories_url
        pageurl = ['http://www.ewg.org' + l for l in url_list]
        # Walk up to 99 listing pages per category; later pages with no
        # product links simply yield nothing from parse_top.
        for item_url in pageurl:
            for i in range(1, 100):
                url_to_parse = item_url + '&page={}&per_page=12&type=products'.format(i)
                yield scrapy.Request(url_to_parse, callback=self.parse_top)

    def parse_top(self, response):
        food_categories_text = response.xpath('//div[@id="dropdown_menu"]/ul/li/a/text()').extract()
        food_id = response.xpath('//div[@class="ind_result_text fleft"]/a/@href').extract()
        food_links = ['http://www.ewg.org' + fid for fid in food_id]
        for link in food_links:
            yield scrapy.Request(link, callback=self.parse_each,
                                 meta={'food_categories_text': food_categories_text})

    def parse_each(self, response):
        # variable set 1/3
        food_categories_text = self.verify(response.meta['food_categories_text'])
        food_rating = self.verify(response.xpath('//div[@class="updated_score fleft"]/img/@src').extract_first())
        food_name = self.verify(response.xpath('//h1[@class="truncate_title_specific_product_page"]/text()').extract_first())
        food_category_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[2]/text()').extract_first())
        food_subcategory_text = self.verify(response.xpath('//div[@class="product_header product_header_updated dont_hide_on_mobile loaction_views_food_products_show"]/div/a[3]/text()').extract_first())

        # variable set 2/3
        all_comments = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/text()[1]').extract()))
        pos_neg_comment = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][1]/p/img/@alt').extract()))
        ingredients = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][3]/p/text()[1]').extract())
        certified_organic = self.verify(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first())
        allergens = self.verify(str(response.xpath('//div[@class="gages_col_individual zeropadding bottom_space"][2]/p/text()[1]').extract_first()))
        serving_amount = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/span/text()').extract_first())
        serving_unit = self.verify(response.xpath('//thead[@class="performance-facts__header performance-facts__header2"]/tr/th/text()[2]').extract_first())
        calories = self.verify(response.xpath('//th[@class="cal2 "]/div/text()').extract_first())

        # variable set 3/3: the nutrient gauges appear in document order
        # (fat %, fat amount, carbs %, carbs amount, sugars amount,
        # protein %, protein amount), so they are pulled from a single
        # extract() by position. Slices are passed instead of indexes so
        # that verify() returns "" rather than raising IndexError when a
        # gauge is missing.
        nutri_vals = response.xpath('//table[@class="nutrient_table"]//span[@class="update_on_ss_change"]/text()').extract()
        nutri_fat_perc = self.verify(nutri_vals[0:1])
        nutri_fat_num = self.verify(nutri_vals[1:2])
        nutri_carbs_perc = self.verify(nutri_vals[2:3])
        nutri_carbs_num = self.verify(nutri_vals[3:4])
        nutri_sugar_num = self.verify(nutri_vals[4:5])
        nutri_protein_perc = self.verify(nutri_vals[5:6])
        nutri_protein_num = self.verify(nutri_vals[6:7])
        nutri_perc_nutriName = self.verify(str(response.xpath('//table[@class="nutrient_table"]//span[1]/text()').extract()))
        nutri_addedSugars = self.verify(str(response.xpath('//tr[@class="thick-end"]//td[@class]/text()').extract()))

        # Build a nutrient-name -> value mapping from the nutrition-facts
        # table (the name is in the second cell, the value in the first).
        rows = response.xpath('//table[@class="nutrient_table"]//tr')
        dic = {}
        for row in rows[1:]:
            key = row.xpath('./td[2]/span/text()').extract_first()
            value = row.xpath('./td[1]/span/text()').extract_first()
            if key is not None:
                dic[key.strip()] = value

        item = FoodItem()
        item['food_categories_text'] = food_categories_text
        item['food_rating'] = food_rating
        item['food_name'] = food_name
        item['food_category_text'] = food_category_text
        item['food_subcategory_text'] = food_subcategory_text
        item['all_comments'] = all_comments
        item['pos_neg_comment'] = pos_neg_comment
        item['ingredients'] = ingredients
        item['certified_organic'] = certified_organic
        item['allergens'] = allergens
        item['serving_amount'] = serving_amount
        item['serving_unit'] = serving_unit
        item['calories'] = calories
        item['nutri_fat_perc'] = nutri_fat_perc
        item['nutri_fat_num'] = nutri_fat_num
        item['nutri_carbs_perc'] = nutri_carbs_perc
        item['nutri_carbs_num'] = nutri_carbs_num
        item['nutri_sugar_num'] = nutri_sugar_num
        item['nutri_protein_perc'] = nutri_protein_perc
        item['nutri_protein_num'] = nutri_protein_num
        item['nutri_perc_nutriName'] = nutri_perc_nutriName
        item['nutri_addedSugars'] = nutri_addedSugars
        item['Total_Fat'] = dic.get('Total Fat', '')
        item['Total_Carbs'] = dic.get('Total Carbs', '')
        item['Sugars'] = dic.get('Sugars', '')
        item['Protein'] = dic.get('Protein', '')
        item['Saturated_Fat'] = dic.get('Saturated Fat', '')
        item['Cholesterol'] = dic.get('Cholesterol', '')
        item['Sodium'] = dic.get('Sodium', '')
        item['Added_Sugar_Ingredients'] = dic.get('Added Sugar Ingredients', '')
        item['Dietary_Fiber'] = dic.get('Dietary Fiber', '')
        item['Vitamin_A'] = dic.get('Vitamin A', '')
        item['Vitamin_C'] = dic.get('Vitamin C', '')
        item['Calcium'] = dic.get('Calcium', '')
        item['Iron'] = dic.get('Iron', '')
        item['Potassium'] = dic.get('Potassium', '')
        yield item
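Once the crawl finishes, a quick sanity check of the output is worthwhile, since verify() turns missing values into empty strings. A minimal sketch, assuming pandas is installed (foodOne.csv and the column names come from the pipeline and items above):

import pandas as pd

# Load the CSV produced by WriteItemPipeline.
df = pd.read_csv('foodOne.csv')

# How many products were scraped, and how complete are key columns?
print(len(df))
print(df[['food_name', 'calories', 'Sugars', 'Added_Sugar_Ingredients']].isnull().sum())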