Created
March 4, 2017 16:38
-
-
Save thomas-kassel/ddfda6d8d6766fcfa09165e90dc56611 to your computer and use it in GitHub Desktop.
Crawl spider to extract GTM article information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import Spider | |
from scrapy.spiders import CrawlSpider , Rule | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.http import Request | |
from scrapy.selector import Selector | |
from cleantechScrapy.items import GTMArticleItem | |
from cleantechScrapy.items import Cleantech100Item | |
##### Crawl Spider to scrape GTM articles ##### | |
class GTMspider(CrawlSpider):
    """Crawl spider for Greentech Media (GTM) articles.

    Starts from the first three article listing pages, follows links that
    match ``/articles/read/...``, and yields one ``GTMArticleItem`` per
    article page with its theme, title, body text, tags, and comment count.
    """
    name = 'GTM.spider'
    # BUG FIX: Scrapy's off-site filter reads `allowed_domains` (bare domain
    # names), not `allowed_urls` — the original attribute was silently
    # ignored, so requests were never restricted to greentechmedia.com.
    allowed_domains = ['greentechmedia.com']
    start_urls = [
        'https://www.greentechmedia.com/articles',
        'https://www.greentechmedia.com/articles/P25',
        'https://www.greentechmedia.com/articles/P50',
    ]
    rules = (
        # Follow links to articles and use parseArticle method at each article page
        Rule(LinkExtractor(allow=('/articles/read/.*', )), callback='parseArticle'),
    )

    @staticmethod
    def _clean_fragment(text):
        """Strip non-ASCII chars and normalize whitespace/escapes in one text fragment.

        The encode/decode round-trip keeps the result a `str` on Python 3
        (the original Python-2 `encode` alone would produce `bytes` and make
        the subsequent str-argument `.replace` calls raise TypeError).
        """
        text = text.encode('ascii', 'ignore').decode('ascii')
        return text.replace('\t', "").replace('\n', " ").replace("\\'", "'")

    def parseArticle(self, response):
        """Parse a single article page into a GTMArticleItem.

        :param response: the article page response (has Scrapy selector API)
        :return: generator yielding one populated GTMArticleItem
        """
        self.logger.info('Successfully crawled to %s', response.url)
        # Initialize article item
        item = GTMArticleItem()
        # Parse article information using xpaths
        item['theme'] = response.xpath('//div[@class="article-header-box"]/strong/text()').extract_first()
        # Guard against a missing <h1>: extract_first() returns None, which
        # would otherwise crash the encode/strip chain below.
        title = response.xpath('//h1[@class="article-page-heading"]/text()').extract_first() or ''
        item['title'] = self._clean_fragment(title).strip()
        body = response.xpath('//div[@class="col-md-9"]/p/descendant-or-self::text()').extract()
        item['body'] = ''.join(self._clean_fragment(fragment) for fragment in body)
        item['tags'] = response.xpath('//ul[@class="tag-list"]/li/a/text()').extract()
        item['comments'] = response.xpath('//span[@class="comment-count"]/a/text()').extract_first()
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment