seagatesoft/gist:ce49516edb34c33dea3f

## gistfile1.txt
import json

from scrapy.http import Request
from scrapy.selector import Selector

from forumbot.spiders.blogs import BlogSpider
from forumbot.spiders.mixins.livefyre import LivefyreMixin
from bot_engines.utils import error
from forumbot.items import BlogPostLoader, AuthorLoader


class MashableSpider(BlogSpider, LivefyreMixin):
    """Spider that collects mashable.com stories together with their authors.

    The ``stories.json`` listing is walked page by page; every listed story
    whose date is accepted by ``has_valid_date`` is fetched and turned into
    a blog-post item carrying an ``author`` item.
    """

    name = 'mashable.com'
    allowed_domains = ['mashable.com']
    # First listing page: asks for 20 entries under ``new`` and none under
    # ``hot`` / ``rising``.
    start_urls = ['http://mashable.com/stories.json?new_per_page=20&'
                  'hot_per_page=0&rising_per_page=0']

    # Follow-up listing pages; ``prev_post_key`` is the ``sort_key`` of the
    # last story read from the previous page.
    story_url = 'http://mashable.com/stories.json?new_per_page=20&' \
                'hot_per_page=0&rising_per_page=0&new_after={prev_post_key}'

    # Matches the first 19 characters of ``post_date`` (see parse_post_list).
    date_formats = ['%Y-%m-%dT%H:%M:%S']

    # Two ``%s`` slots; presumably filled by LivefyreMixin with the
    # (site_id, article_id) pair from get_livefyre_data -- TODO confirm.
    livefyre_url = 'http://bootstrap.mashable.fyre.co/bs3/v3.1/' \
                   'mashable.fyre.co/%s/%s/init'

    def parse(self, response):
        """Handle a start URL response by delegating to parse_post_list."""
        return self.parse_post_list(response)

    def parse_post_list(self, response):
        """Yield one request per listed story, then the next listing page.

        ``response.body`` must be the JSON listing; only its ``new`` list is
        read. Stories whose date cannot be parsed are reported through
        ``error`` and skipped. The next page is requested only when
        ``is_next_page_required`` agrees and a ``sort_key`` cursor was seen.
        """
        json_data = json.loads(response.body)
        date_data_log = []
        prev_post_key = ''

        for post in json_data['new']:
            # Track the cursor for every story, including skipped ones, so
            # pagination continues after the last story of this page.
            prev_post_key = post['sort_key']
            # Keep only the 'YYYY-MM-DDTHH:MM:SS' prefix that date_formats
            # understands.
            date_string = post['post_date'][0:19]
            date_data = self.parse_date(date_string, response.url,
                                        date_formats=self.date_formats)
            post_date = date_data['date_obj']
            if not post_date:
                error("Unknown date format %s in %s" % (repr(date_string),
                                                        response.url))
                continue

            date_data_log.append(date_data)
            if self.has_valid_date(**date_data):
                post_item = dict(
                    item_id=post['_id'],
                    title=post['title'],
                    created_at=post_date
                )
                # The listing fields travel to parse_item through meta.
                yield Request(url=post['link'],
                              meta=post_item,
                              callback=self.parse_item)

        # Date of the last successfully parsed story, or None when no story
        # on this page had a parsable date.
        limit_post_dd = date_data_log[-1] if date_data_log else None

        # is_next_page_required is always consulted, as before; the request
        # is skipped when the page had no stories, because there is then no
        # cursor to continue from.
        if self.is_next_page_required(limit_post_dd) and prev_post_key:
            yield Request(
                self.story_url.format(prev_post_key=prev_post_key),
                callback=self.parse_post_list
            )

    def parse_item(self, response):
        """Build the post item (with its author) from a story page.

        Regular stories take their content from the article markup and are
        followed by the request returned by ``make_livefyre_request``.
        Pages without that markup are read from the JSON stored in the
        ``data-post`` attribute of ``div#parsec``; nothing is yielded when
        neither source is present.
        """
        pl = BlogPostLoader(response=response)
        pl.add_value('item_id', response.meta['item_id'])
        pl.add_value('title', response.meta['title'])
        pl.add_value('link', response.url)
        pl.add_value('created_at', response.meta['created_at'])
        pl.add_xpath('content',
                     '//article[@id="story"]/section[contains('
                     '@class, "article-content")]//*[not(self::script)]/'
                     'text()')

        al = AuthorLoader(selector=pl.selector.xpath(
            '//div[@class="article-info"]')
        )
        al.add_xpath('author_id', './a/@href', re=r'/people/(.+)/')
        # Use the author name shown in the post rather than the one from
        # the API; they can differ.
        al.add_xpath('name',
                     './/span[contains(., "By")]/text()',
                     re=r'By (.+)')
        al.add_xpath('link', './a/@href')
        al.add_xpath('avatar', './a/img[@class="author_image"]/@src')

        post = pl.load_item()
        post['author'] = al.load_item()

        if 'content' in post:
            yield post
            yield self.make_livefyre_request(response)
            return

        # No regular article body: look for the embedded JSON description.
        special_post = pl.selector.xpath(
            '//div[@id="parsec"]/@data-post').extract()
        if not special_post:
            return

        json_data = json.loads(special_post[0])

        # Only text blocks inside long cards contribute to the content.
        content = []
        for card in json_data['cards']:
            if card['_type'] != 'Parsec::LongCard':
                continue
            for block in card['blocks']:
                if block['_type'] == 'Parsec::TextBlock':
                    content.append(block['content'])

        pl.add_value('content', content)
        # Fall back to the embedded JSON for author fields the page markup
        # did not provide.
        al.add_value_if_empty('author_id',
                              str(json_data['wp_author_id']))
        al.add_value_if_empty('name', json_data['author'])
        post = pl.load_item()
        post['author'] = al.load_item()

        yield post

    def get_livefyre_data(self, response):
        """Return ``(site_id, article_id)`` read from ``div#livefyre_comments``.

        Each value is the concatenation of the matching attribute values,
        so a missing element or attribute gives an empty string.
        """
        sel = Selector(response)
        site_id = ''.join(
            sel.xpath('//div[@id="livefyre_comments"]/@data-site-id').extract()
        )
        article_id = ''.join(
            sel.xpath(
                '//div[@id="livefyre_comments"]/@data-article-id').extract()
        )
        return site_id, article_id
	import json

	from scrapy.http import Request
	from scrapy.selector import Selector

	from forumbot.spiders.blogs import BlogSpider
	from forumbot.spiders.mixins.livefyre import LivefyreMixin
	from bot_engines.utils import error
	from forumbot.items import BlogPostLoader, AuthorLoader


	class MashableSpider(BlogSpider, LivefyreMixin):
	    """Spider that collects mashable.com stories together with their authors.

	    The ``stories.json`` listing is walked page by page; every listed story
	    whose date is accepted by ``has_valid_date`` is fetched and turned into
	    a blog-post item carrying an ``author`` item.
	    """

	    name = 'mashable.com'
	    allowed_domains = ['mashable.com']
	    # First listing page: asks for 20 entries under ``new`` and none under
	    # ``hot`` / ``rising``.
	    start_urls = ['http://mashable.com/stories.json?new_per_page=20&'
	                  'hot_per_page=0&rising_per_page=0']

	    # Follow-up listing pages; ``prev_post_key`` is the ``sort_key`` of the
	    # last story read from the previous page.
	    story_url = 'http://mashable.com/stories.json?new_per_page=20&' \
	                'hot_per_page=0&rising_per_page=0&new_after={prev_post_key}'

	    # Matches the first 19 characters of ``post_date`` (see parse_post_list).
	    date_formats = ['%Y-%m-%dT%H:%M:%S']

	    # Two ``%s`` slots; presumably filled by LivefyreMixin with the
	    # (site_id, article_id) pair from get_livefyre_data -- TODO confirm.
	    livefyre_url = 'http://bootstrap.mashable.fyre.co/bs3/v3.1/' \
	                   'mashable.fyre.co/%s/%s/init'

	    def parse(self, response):
	        """Handle a start URL response by delegating to parse_post_list."""
	        return self.parse_post_list(response)

	    def parse_post_list(self, response):
	        """Yield one request per listed story, then the next listing page.

	        ``response.body`` must be the JSON listing; only its ``new`` list is
	        read. Stories whose date cannot be parsed are reported through
	        ``error`` and skipped. The next page is requested only when
	        ``is_next_page_required`` agrees and a ``sort_key`` cursor was seen.
	        """
	        json_data = json.loads(response.body)
	        date_data_log = []
	        prev_post_key = ''

	        for post in json_data['new']:
	            # Track the cursor for every story, including skipped ones, so
	            # pagination continues after the last story of this page.
	            prev_post_key = post['sort_key']
	            # Keep only the 'YYYY-MM-DDTHH:MM:SS' prefix that date_formats
	            # understands.
	            date_string = post['post_date'][0:19]
	            date_data = self.parse_date(date_string, response.url,
	                                        date_formats=self.date_formats)
	            post_date = date_data['date_obj']
	            if not post_date:
	                error("Unknown date format %s in %s" % (repr(date_string),
	                                                        response.url))
	                continue

	            date_data_log.append(date_data)
	            if self.has_valid_date(**date_data):
	                post_item = dict(
	                    item_id=post['_id'],
	                    title=post['title'],
	                    created_at=post_date
	                )
	                # The listing fields travel to parse_item through meta.
	                yield Request(url=post['link'],
	                              meta=post_item,
	                              callback=self.parse_item)

	        # Date of the last successfully parsed story, or None when no story
	        # on this page had a parsable date.
	        limit_post_dd = date_data_log[-1] if date_data_log else None

	        # is_next_page_required is always consulted, as before; the request
	        # is skipped when the page had no stories, because there is then no
	        # cursor to continue from.
	        if self.is_next_page_required(limit_post_dd) and prev_post_key:
	            yield Request(
	                self.story_url.format(prev_post_key=prev_post_key),
	                callback=self.parse_post_list
	            )

	    def parse_item(self, response):
	        """Build the post item (with its author) from a story page.

	        Regular stories take their content from the article markup and are
	        followed by the request returned by ``make_livefyre_request``.
	        Pages without that markup are read from the JSON stored in the
	        ``data-post`` attribute of ``div#parsec``; nothing is yielded when
	        neither source is present.
	        """
	        pl = BlogPostLoader(response=response)
	        pl.add_value('item_id', response.meta['item_id'])
	        pl.add_value('title', response.meta['title'])
	        pl.add_value('link', response.url)
	        pl.add_value('created_at', response.meta['created_at'])
	        pl.add_xpath('content',
	                     '//article[@id="story"]/section[contains('
	                     '@class, "article-content")]//*[not(self::script)]/'
	                     'text()')

	        al = AuthorLoader(selector=pl.selector.xpath(
	            '//div[@class="article-info"]')
	        )
	        al.add_xpath('author_id', './a/@href', re=r'/people/(.+)/')
	        # Use the author name shown in the post rather than the one from
	        # the API; they can differ.
	        al.add_xpath('name',
	                     './/span[contains(., "By")]/text()',
	                     re=r'By (.+)')
	        al.add_xpath('link', './a/@href')
	        al.add_xpath('avatar', './a/img[@class="author_image"]/@src')

	        post = pl.load_item()
	        post['author'] = al.load_item()

	        if 'content' in post:
	            yield post
	            yield self.make_livefyre_request(response)
	            return

	        # No regular article body: look for the embedded JSON description.
	        special_post = pl.selector.xpath(
	            '//div[@id="parsec"]/@data-post').extract()
	        if not special_post:
	            return

	        json_data = json.loads(special_post[0])

	        # Only text blocks inside long cards contribute to the content.
	        content = []
	        for card in json_data['cards']:
	            if card['_type'] != 'Parsec::LongCard':
	                continue
	            for block in card['blocks']:
	                if block['_type'] == 'Parsec::TextBlock':
	                    content.append(block['content'])

	        pl.add_value('content', content)
	        # Fall back to the embedded JSON for author fields the page markup
	        # did not provide.
	        al.add_value_if_empty('author_id',
	                              str(json_data['wp_author_id']))
	        al.add_value_if_empty('name', json_data['author'])
	        post = pl.load_item()
	        post['author'] = al.load_item()

	        yield post

	    def get_livefyre_data(self, response):
	        """Return ``(site_id, article_id)`` read from ``div#livefyre_comments``.

	        Each value is the concatenation of the matching attribute values,
	        so a missing element or attribute gives an empty string.
	        """
	        sel = Selector(response)
	        site_id = ''.join(
	            sel.xpath('//div[@id="livefyre_comments"]/@data-site-id').extract()
	        )
	        article_id = ''.join(
	            sel.xpath(
	                '//div[@id="livefyre_comments"]/@data-article-id').extract()
	        )
	        return site_id, article_id