Google Places for Business scraper (forked from johndavidback/googlebiz.py)
# googlebiz.py
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector

from tutorial.items import GoogleItem

# This is the class that does work.
class LoginSpider(BaseSpider):
    name = 'google-login'
    start_urls = ['https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US']

    def parse(self, response):
        """
        This overrides the built-in parse() and forces us to log in
        to the Google service before anything else is scraped.
        """
        return [FormRequest.from_response(
            response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self, response):
        """
        This is the callback from the login request and does the
        actual parsing.
        """
        # Display the body in the console
        print response.body
        # Create an XPath selector
        hxs = HtmlXPathSelector(response)
        # Container for item objects
        items = []
        # Get all of the links to the per-store analytics pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        # Go through all of the store links and turn each into an item
        for l in links:
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        # This plucks the 'Next >>' pagination links from the page
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
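
# A hypothetical variant (a minimal sketch, assuming a Scrapy version that
# supports spider arguments via `scrapy crawl <name> -a key=value`): instead
# of hard-coding credentials in formdata, accept them on the command line.
# The subclass name and argument names here are illustrative only.
class LoginSpiderWithArgs(LoginSpider):
    name = 'google-login-args'

    def __init__(self, email=None, passwd=None, *args, **kwargs):
        super(LoginSpiderWithArgs, self).__init__(*args, **kwargs)
        self.email = email
        self.passwd = passwd

    def parse(self, response):
        # Same login request as LoginSpider.parse(), but the credentials
        # come from `scrapy crawl google-login-args -a email=... -a passwd=...`
        return [FormRequest.from_response(
            response,
            formdata={'Email': self.email, 'Passwd': self.passwd},
            callback=self.after_login)]
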

# This is the class that does not work.
class GoogleSpider(CrawlSpider):
    name = 'google-spider'
    allowed_domains = ['google.com']
    start_urls = [
        'https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US'
    ]
    rules = (
        # The '?' is escaped so the regex matches the literal query string,
        # and parse_items is wired in as the callback for matched pages.
        Rule(SgmlLinkExtractor(allow=(r'/local/add/businessCenter\?page=', )),
             callback='parse_items', follow=True),
    )

    def init_request(self):
        """
        This is called initially.
        """
        return self.login()

    def login(self):
        """
        This is where I am stuck. Obviously response is not defined here.
        (A possible way around this is sketched after this class.)
        """
        return [FormRequest.from_response(
            response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self, response):
        """
        Required for the crawler to start crawling.
        """
        return self.initialized()

    def parse_items(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        items = []
        # Get all of the links to the per-store analytics pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        for l in links:
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        # Pagination links ('Next >>')
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
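
# A possible fix for the problem noted in GoogleSpider.login() above, using
# Scrapy's InitSpider (scrapy.contrib.spiders.init in the Scrapy versions
# this gist targets): init_request() downloads the login page first, so the
# login callback receives a real response to build the FormRequest from,
# and after_login() releases the start URLs with initialized(). This is a
# minimal, untested sketch; the class name, credentials and post-login
# start URL are placeholders, and InitSpider does not apply CrawlSpider
# rules, so pagination links would have to be followed manually in parse().
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request


class GoogleInitSpider(InitSpider):
    name = 'google-init-spider'
    allowed_domains = ['google.com']
    login_page = LoginSpider.start_urls[0]
    start_urls = ['http://www.google.com/local/add/businessCenter']

    def init_request(self):
        # Fetch the login form first; its response is what login() needs.
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        # 'response' is the downloaded login page, so from_response() can
        # copy the hidden form fields and post the credentials.
        return FormRequest.from_response(
            response,
            formdata={'Email': 'you@example.com', 'Passwd': 'secret'},
            callback=self.after_login)

    def after_login(self, response):
        # Hand control back to the spider; the normal crawl of start_urls
        # begins from here (parse() would do the item extraction).
        return self.initialized()
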
# tutorial/items.py
#
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field


class GoogleItem(Item):
    link = Field()
    value = Field()
    next_link = Field()
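
# A quick usage sketch (illustrative only): a GoogleItem behaves like a
# dict restricted to the fields declared above, which is what the spiders
# rely on when they assign item['link'], item['value'] and item['next_link'].
if __name__ == '__main__':
    item = GoogleItem()
    item['link'] = ['/local/add/analytics?storeid=12345']
    item['value'] = ['Example Store']
    item['next_link'] = ''
    print item  # shows the populated fields; undeclared keys raise KeyError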