Skip to content

Instantly share code, notes, and snippets.

@thomas-kassel
Created March 4, 2017 16:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomas-kassel/ddfda6d8d6766fcfa09165e90dc56611 to your computer and use it in GitHub Desktop.
Save thomas-kassel/ddfda6d8d6766fcfa09165e90dc56611 to your computer and use it in GitHub Desktop.
Crawl spider to extract GTM article information
from scrapy import Spider
from scrapy.spiders import CrawlSpider , Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector
from cleantechScrapy.items import GTMArticleItem
from cleantechScrapy.items import Cleantech100Item
##### Crawl Spider to scrape GTM articles #####
class GTMspider(CrawlSpider):
    """Crawl Greentech Media article listings and scrape each article page.

    The spider starts from the listing pages in ``start_urls``, follows every
    link whose URL matches ``/articles/read/.*`` and hands the resulting
    response to :meth:`parseArticle`, which yields one ``GTMArticleItem`` per
    article page.
    """

    name = 'GTM.spider'
    # Scrapy's offsite filtering reads ``allowed_domains`` (bare domain names,
    # no scheme). ``allowed_urls`` is not an attribute Scrapy consults; it is
    # kept only so that any external code reading it keeps working.
    allowed_domains = ['greentechmedia.com']
    allowed_urls = ['https://www.greentechmedia.com']
    start_urls = [
        'https://www.greentechmedia.com/articles',
        'https://www.greentechmedia.com/articles/P25',
        'https://www.greentechmedia.com/articles/P50',
    ]
    rules = (
        # Follow links to articles and use parseArticle method at each article page
        Rule(LinkExtractor(allow=('/articles/read/.*', )), callback='parseArticle'),
    )

    @staticmethod
    def _to_ascii(text):
        """Return *text* with every non-ASCII character dropped.

        The encode/decode round trip keeps the result a text string, so the
        later ``replace``/``join`` calls never mix bytes and text.
        """
        return text.encode('ascii', 'ignore').decode('ascii')

    @classmethod
    def _clean_fragment(cls, fragment):
        """Normalize one body text node: ASCII only, tabs removed, newlines
        turned into spaces, and backslash-escaped apostrophes unescaped."""
        cleaned = cls._to_ascii(fragment)
        return cleaned.replace('\t', "").replace('\n', " ").replace("\\'", "'")

    def parseArticle(self, response):
        """Extract the article fields from a single article page.

        Args:
            response: The Scrapy response for an article page.

        Yields:
            A ``GTMArticleItem`` with ``theme``, ``title``, ``body``, ``tags``
            and ``comments`` populated. ``theme``, ``title`` and ``comments``
            are ``None`` when their XPath matches nothing; ``body`` is an
            empty string and ``tags`` an empty list in that case.
        """
        self.logger.info('Successfully crawled to %s', response.url)

        # Initialize article item
        item = GTMArticleItem()

        # Parse article information using xpaths
        item['theme'] = response.xpath(
            '//div[@class="article-header-box"]/strong/text()').extract_first()

        title = response.xpath(
            '//h1[@class="article-page-heading"]/text()').extract_first()
        # extract_first() returns None when the heading is missing; guard so a
        # page without a title still yields the rest of its fields.
        item['title'] = self._to_ascii(title).strip() if title is not None else None

        fragments = response.xpath(
            '//div[@class="col-md-9"]/p/descendant-or-self::text()').extract()
        item['body'] = ''.join(self._clean_fragment(text) for text in fragments)

        item['tags'] = response.xpath(
            '//ul[@class="tag-list"]/li/a/text()').extract()
        item['comments'] = response.xpath(
            '//span[@class="comment-count"]/a/text()').extract_first()

        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment