@yaochao
Created July 21, 2017 03:07
# -*- coding: utf-8 -*-
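# Scrapy spiders for a set of Chinese marketing/commerce sites (socialbeta, qdaily,
# jiemian, toodaylab, madisonboom, iwebad, adquan, digitaling, iresearch, ebrun) and
# two conference listings (eshow365, events.iresearch.cn). Each spider keeps a
# one-row sqlite table, named after the spider, holding the newest post URL seen on
# the previous run; the next run stops as soon as it reaches that URL again.
# Items are handed to the BrandMongoPipeline / BrandKafkaPipeline pipelines of the
# surrounding `datapark` project.
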
import sqlite3
import time
import scrapy
#################################### commercespider ###########################
class SocialbetaSpider(scrapy.Spider):
    name = 'socialbeta'
    allowed_domains = ['socialbeta.com']
    start_urls = ['http://socialbeta.com/tag/案例']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint (evaluated once, at class-definition time).
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="postimg"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div/div/h3/a/@href').extract_first())
            post_title = post.xpath('div/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

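# --- Illustrative sketch (not used by the spiders in this gist) -------------------
# Every spider below repeats the sqlite checkpoint logic shown above verbatim. One
# way the shared part could be factored out is a small helper mixin; the name
# CheckpointMixin and its methods are hypothetical, not part of the datapark project.
class CheckpointMixin(object):

    @staticmethod
    def load_checkpoint(spider_name, db_path='data.sqlite'):
        # Open (or create) the per-spider checkpoint table and return the connection,
        # cursor and the URL saved by the previous run (or None on the first run).
        connection = sqlite3.connect(db_path)
        cursor = connection.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % spider_name)
        connection.commit()
        cursor.execute('SELECT latest_url FROM %s' % spider_name)
        row = cursor.fetchone()
        return connection, cursor, row[0] if row else None

    @staticmethod
    def save_checkpoint(connection, cursor, spider_name, url):
        # Overwrite the single checkpoint row with the newest URL of the current run.
        cursor.execute('DELETE FROM %s' % spider_name)
        cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % spider_name, (url,))
        connection.commit()
# -----------------------------------------------------------------------------------
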
class QdailySpider(scrapy.Spider):
    name = 'qdaily'
    allowed_domains = ['qdaily.com']
    start_urls = ['http://www.qdaily.com/categories/18.html/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="packery-container articles"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('a/@href').extract_first())
            post_title = post.xpath('a/div/div/img/@alt').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        # Articles come in two layouts; pick the XPath based on which container is present.
        if response.xpath('//div[@class="main long-article"]'):
            content_text = ''.join(
                response.xpath('//div[@class="main long-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="main long-article"]').extract_first()
        else:
            content_text = ''.join(
                response.xpath('//div[@class="detail"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="detail"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class JiemianSpider(scrapy.Spider):
    name = 'jiemian'
    start_urls = ['http://www.jiemian.com/lists/49.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="load-list"]/div')
        for post in posts:
            post_url = response.urljoin(
                post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="article-main"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="article-main"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class ToodaylabSpider(scrapy.Spider):
    name = 'toodaylab'
    start_urls = ['http://www.toodaylab.com/field/308']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="content"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="post-info"]/p/a/@href').extract_first())
            post_title = post.xpath('div[@class="post-info"]/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="post-content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="post-content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class MadisonboomSpider(scrapy.Spider):
    name = 'madisonboom'
    start_urls = ['http://www.madisonboom.com/category/works/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="gallery_list_elements"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="slide-info"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="slide-info"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class IwebadSpider(scrapy.Spider):
    name = 'iwebad'
    start_urls = ['http://iwebad.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="new_search_works"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_info"]/h4/span/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_info"]/h4/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="news_ckkk "]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="news_ckkk "]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class AdquanSpider(scrapy.Spider):
    name = 'adquan'
    start_urls = ['http://www.adquan.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="work_list_left"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('h2/a/@href').extract_first())
            post_title = post.xpath('h2/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="deta_inner"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="deta_inner"]').extract_first()
        if not content_text:
            # Fallback layout: take all visible text on the page and the "con_Text" container.
            content_text = ''.join(
                response.xpath('//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="con_Text"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class DigitalingSpider(scrapy.Spider):
    name = 'digitaling'
    start_urls = ['http://www.digitaling.com/projects']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="pro_list"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_bd"]/div/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_bd"]/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@id="article_con"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@id="article_con"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class IresearchSpider(scrapy.Spider):
    name = 'iresearch'
    start_urls = ['http://a.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'datapark.pipelines.BrandMongoPipeline': 300,
            # 'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="tab-list"]/div/ul/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="m-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="m-article"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        # Debug output only: the pipelines above are commented out and no item is yielded.
        print item['content_text'], 'content_text'
        print item['content_html']

class EbrunSpider(scrapy.Spider):
    name = 'ebrun'
    start_urls = ['http://www.ebrun.com/brands/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="create10"]/div/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('p/span/a/@href').extract_first())
            post_title = post.xpath('p/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="clearfix cmsDiv"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="clearfix cmsDiv"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

#################################### conferencespider ###########################
class Eshow365Spider(scrapy.Spider):
    name = 'eshow365'
    start_urls = ['http://www.eshow365.com/zhanhui/0-0-0-0/0/%E5%B9%BF%E5%91%8A%20%E8%90%A5%E9%94%80']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@class="sslist"]')
        for post in posts:
            post_url = response.urljoin(post.xpath('p[@class="zhtitle"]/a/@href').extract_first())
            post_title = post.xpath('p[@class="zhtitle"]/a//text()').extract()
            post_title = ''.join(post_title).strip()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first listing on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached listings that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        ps = response.xpath('//div[@class="zhxxcontent"]/p')
        conference_time = ''
        conference_address = ''
        for index, p in enumerate(ps):
            txt = p.xpath('string(.)').extract_first()
            # The labels below match the Chinese field names on the page:
            # 举办时间 = event dates, 举办展馆 = venue.
            if u'举办时间' in txt:
                conference_time = txt.split(u'举办时间:')[-1]
            if u'举办展馆' in txt:
                conference_address = txt.split(u'举办展馆:')[-1]
        item['conference_time'] = conference_time
        item['conference_address'] = conference_address
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'conference'
        item['module'] = 'brand'
        yield item

class Events_ireasearchSpider(scrapy.Spider):
    name = 'events_ireasearch'
    start_urls = ['http://events.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="databox"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="info"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="info"]/h3/a/text()').extract_first()
            conference_info = post.xpath('div[@class="info"]/p/text()').extract_first()
            conference_time = conference_info.split(' ')[0]
            conference_address = conference_info.split(' ')[-1]
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title,
                'conference_time': conference_time,
                'conference_address': conference_address,
                'crawl_time': int(time.time()),
                'site_name': self.name,
                'type': 'conference',
                'module': 'brand',
            }
            # Treat the first event on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached events that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            yield item
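
# Usage note: assuming these spiders live inside a standard Scrapy project (the
# pipeline paths above point to a project called `datapark`), each one is run with
# `scrapy crawl <name>`, e.g. `scrapy crawl socialbeta`. A run stops as soon as it
# reaches the URL recorded in data.sqlite by the previous run of the same spider.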