-
-
Save hofaiwong/fe6d0589c42ee798d027287d1aac604d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ComicsSpider(Spider):
    """Scrapy spider for comicbookroundup.com.

    Crawls the series-listing pages named in ``urls.txt`` (one URL per
    line), follows each series link, and yields one ``ComicsItem`` per
    series with a per-issue ratings dictionary.
    """

    name = 'comics'
    allowed_urls = ['comicbookroundup.com']

    # Seed URLs are read once, at class-definition (import) time.
    # NOTE(review): urls.txt must exist in the working directory when this
    # module is imported. Using `with` guarantees the handle is closed even
    # if reading fails (the original open()/close() pair leaked on error).
    with open("urls.txt") as _url_file:
        start_urls = [url.strip() for url in _url_file]
    del _url_file  # don't leave the (closed) file object as a class attribute

    def getdetail(self, response, path, function, index, alt):
        """Extract the ``index``-th xpath match from ``response`` and pass it
        through ``function``; return ``alt`` if anything fails.

        Deliberately best-effort: a missing node, out-of-range index, or a
        conversion error in ``function`` all fall back to ``alt``.
        """
        try:
            return function(response.xpath(path).extract()[index])
        except Exception:
            # Broad catch is intentional: callers supply the fallback and
            # expect no exception to escape this helper.
            return alt

    def parse(self, response):
        """Main parse: yield one Request per series row in the listing table."""
        rows = response.xpath('//*[@id="all-series"]/div/table[2]/tr').extract()
        for row in rows:
            # Each row's second cell holds the relative link to the series page.
            series_url = Selector(text=row).xpath('//td[2]/a/@href').extract()
            yield Request('http://comicbookroundup.com' + series_url[0],
                          callback=self.parse_series_contents)

    def parse_series_contents(self, response):
        """Parse a single series page into a ComicsItem.

        Builds ``item['issues_list']``: a dict keyed by issue number (str),
        each value holding critic/user ratings, creators, and review counts.
        """
        item = ComicsItem()
        # Scrape data from the top of the series page
        # ...
        # Scrape data from table of issues in the series page
        rows = response.xpath('//*[@id="issues"]/div[1]/table[2]/tr').extract()
        item['issues_list'] = {}
        for row in rows:
            # Scrape issue data from each row
            # ... (issue, rating_critic, rating_user, writer, artist, and the
            # review counts are computed in code elided from this excerpt)
            item['issues_list'][str(issue)] = {'rating_critic': rating_critic,
                                               'rating_user': rating_user,
                                               'writer': writer,
                                               'artist': artist,
                                               'reviews_critic_count': reviews_critic_count,
                                               'reviews_user_count': reviews_user_count}
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment