Created
March 4, 2017 16:38
-
-
Save thomas-kassel/ddfda6d8d6766fcfa09165e90dc56611 to your computer and use it in GitHub Desktop.
Crawl spider to extract GTM article information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import Spider | |
from scrapy.spiders import CrawlSpider , Rule | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.http import Request | |
from scrapy.selector import Selector | |
from cleantechScrapy.items import GTMArticleItem | |
from cleantechScrapy.items import Cleantech100Item | |
##### Crawl Spider to scrape GTM articles ##### | |
class GTMspider(CrawlSpider):
    """Crawl spider for Greentech Media (GTM) articles.

    Starts from the first three article listing pages, follows links that
    match ``/articles/read/...``, and yields one ``GTMArticleItem`` per
    article page with its theme, title, body text, tags, and comment count.
    """
    name = 'GTM.spider'
    # BUG FIX: Scrapy's off-site filter reads `allowed_domains` (bare domain
    # names), not `allowed_urls` — the original attribute was silently
    # ignored, so requests were never restricted to greentechmedia.com.
    allowed_domains = ['greentechmedia.com']
    start_urls = [
        'https://www.greentechmedia.com/articles',
        'https://www.greentechmedia.com/articles/P25',
        'https://www.greentechmedia.com/articles/P50',
    ]
    rules = (
        # Follow links to articles and use parseArticle method at each article page
        Rule(LinkExtractor(allow=('/articles/read/.*', )), callback='parseArticle'),
    )

    @staticmethod
    def _clean_fragment(text):
        """Strip non-ASCII chars and normalize whitespace/escapes in one text fragment.

        The encode/decode round-trip keeps the result a `str` on Python 3
        (the original Python-2 `encode` alone would produce `bytes` and make
        the subsequent str-argument `.replace` calls raise TypeError).
        """
        text = text.encode('ascii', 'ignore').decode('ascii')
        return text.replace('\t', "").replace('\n', " ").replace("\\'", "'")

    def parseArticle(self, response):
        """Parse a single article page into a GTMArticleItem.

        :param response: the article page response (has Scrapy selector API)
        :return: generator yielding one populated GTMArticleItem
        """
        self.logger.info('Successfully crawled to %s', response.url)
        # Initialize article item
        item = GTMArticleItem()
        # Parse article information using xpaths
        item['theme'] = response.xpath('//div[@class="article-header-box"]/strong/text()').extract_first()
        # Guard against a missing <h1>: extract_first() returns None, which
        # would otherwise crash the encode/strip chain below.
        title = response.xpath('//h1[@class="article-page-heading"]/text()').extract_first() or ''
        item['title'] = self._clean_fragment(title).strip()
        body = response.xpath('//div[@class="col-md-9"]/p/descendant-or-self::text()').extract()
        item['body'] = ''.join(self._clean_fragment(fragment) for fragment in body)
        item['tags'] = response.xpath('//ul[@class="tag-list"]/li/a/text()').extract()
        item['comments'] = response.xpath('//span[@class="comment-count"]/a/text()').extract_first()
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment