@yaochao
Created July 21, 2017 03:07
# -*- coding: utf-8 -*-
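# Scrapy spiders for a set of Chinese marketing/commerce sites (socialbeta, qdaily,
# jiemian, toodaylab, madisonboom, iwebad, adquan, digitaling, iresearch, ebrun) and
# two conference listings (eshow365, events.iresearch.cn). Each spider keeps a
# one-row sqlite table, named after the spider, holding the newest post URL seen on
# the previous run; the next run stops as soon as it reaches that URL again.
# Items are handed to the BrandMongoPipeline / BrandKafkaPipeline pipelines of the
# surrounding `datapark` project.
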
import sqlite3
import time
import scrapy
#################################### commercespider ###########################
class SocialbetaSpider(scrapy.Spider):
    name = 'socialbeta'
    allowed_domains = ['socialbeta.com']
    start_urls = ['http://socialbeta.com/tag/案例']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint (evaluated once, at class-definition time).
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="postimg"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div/div/h3/a/@href').extract_first())
            post_title = post.xpath('div/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

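# --- Illustrative sketch (not used by the spiders in this gist) -------------------
# Every spider below repeats the sqlite checkpoint logic shown above verbatim. One
# way the shared part could be factored out is a small helper mixin; the name
# CheckpointMixin and its methods are hypothetical, not part of the datapark project.
class CheckpointMixin(object):

    @staticmethod
    def load_checkpoint(spider_name, db_path='data.sqlite'):
        # Open (or create) the per-spider checkpoint table and return the connection,
        # cursor and the URL saved by the previous run (or None on the first run).
        connection = sqlite3.connect(db_path)
        cursor = connection.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % spider_name)
        connection.commit()
        cursor.execute('SELECT latest_url FROM %s' % spider_name)
        row = cursor.fetchone()
        return connection, cursor, row[0] if row else None

    @staticmethod
    def save_checkpoint(connection, cursor, spider_name, url):
        # Overwrite the single checkpoint row with the newest URL of the current run.
        cursor.execute('DELETE FROM %s' % spider_name)
        cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % spider_name, (url,))
        connection.commit()
# -----------------------------------------------------------------------------------
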
class QdailySpider(scrapy.Spider):
    name = 'qdaily'
    allowed_domains = ['qdaily.com']
    start_urls = ['http://www.qdaily.com/categories/18.html/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="packery-container articles"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('a/@href').extract_first())
            post_title = post.xpath('a/div/div/img/@alt').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        # Articles come in two layouts; pick the XPath based on which container is present.
        if response.xpath('//div[@class="main long-article"]'):
            content_text = ''.join(
                response.xpath('//div[@class="main long-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="main long-article"]').extract_first()
        else:
            content_text = ''.join(
                response.xpath('//div[@class="detail"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="detail"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class JiemianSpider(scrapy.Spider):
    name = 'jiemian'
    start_urls = ['http://www.jiemian.com/lists/49.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="load-list"]/div')
        for post in posts:
            post_url = response.urljoin(
                post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="article-main"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="article-main"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class ToodaylabSpider(scrapy.Spider):
    name = 'toodaylab'
    start_urls = ['http://www.toodaylab.com/field/308']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="content"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="post-info"]/p/a/@href').extract_first())
            post_title = post.xpath('div[@class="post-info"]/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="post-content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="post-content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class MadisonboomSpider(scrapy.Spider):
    name = 'madisonboom'
    start_urls = ['http://www.madisonboom.com/category/works/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="gallery_list_elements"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="slide-info"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="slide-info"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class IwebadSpider(scrapy.Spider):
    name = 'iwebad'
    start_urls = ['http://iwebad.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="new_search_works"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_info"]/h4/span/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_info"]/h4/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="news_ckkk "]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="news_ckkk "]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class AdquanSpider(scrapy.Spider):
    name = 'adquan'
    start_urls = ['http://www.adquan.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="work_list_left"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('h2/a/@href').extract_first())
            post_title = post.xpath('h2/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="deta_inner"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="deta_inner"]').extract_first()
        if not content_text:
            # Fallback layout: take all visible text on the page and the "con_Text" container.
            content_text = ''.join(
                response.xpath('//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="con_Text"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class DigitalingSpider(scrapy.Spider):
    name = 'digitaling'
    start_urls = ['http://www.digitaling.com/projects']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="pro_list"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_bd"]/div/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_bd"]/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@id="article_con"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@id="article_con"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

class IresearchSpider(scrapy.Spider):
    name = 'iresearch'
    start_urls = ['http://a.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'datapark.pipelines.BrandMongoPipeline': 300,
            # 'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="tab-list"]/div/ul/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="m-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="m-article"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        # Debug output only: the pipelines above are commented out and no item is yielded.
        print item['content_text'], 'content_text'
        print item['content_html']

class EbrunSpider(scrapy.Spider):
    name = 'ebrun'
    start_urls = ['http://www.ebrun.com/brands/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="create10"]/div/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('p/span/a/@href').extract_first())
            post_title = post.xpath('p/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first post on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached posts that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="clearfix cmsDiv"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="clearfix cmsDiv"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item

#################################### conferencespider ###########################
class Eshow365Spider(scrapy.Spider):
    name = 'eshow365'
    start_urls = ['http://www.eshow365.com/zhanhui/0-0-0-0/0/%E5%B9%BF%E5%91%8A%20%E8%90%A5%E9%94%80']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@class="sslist"]')
        for post in posts:
            post_url = response.urljoin(post.xpath('p[@class="zhtitle"]/a/@href').extract_first())
            post_title = post.xpath('p[@class="zhtitle"]/a//text()').extract()
            post_title = ''.join(post_title).strip()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # Treat the first listing on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached listings that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        ps = response.xpath('//div[@class="zhxxcontent"]/p')
        conference_time = ''
        conference_address = ''
        for index, p in enumerate(ps):
            txt = p.xpath('string(.)').extract_first()
            # The labels below match the Chinese field names on the page:
            # 举办时间 = event dates, 举办展馆 = venue.
            if u'举办时间' in txt:
                conference_time = txt.split(u'举办时间:')[-1]
            if u'举办展馆' in txt:
                conference_address = txt.split(u'举办展馆:')[-1]
        item['conference_time'] = conference_time
        item['conference_address'] = conference_address
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'conference'
        item['module'] = 'brand'
        yield item

class Events_ireasearchSpider(scrapy.Spider):
    name = 'events_ireasearch'
    start_urls = ['http://events.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }
    # Custom attributes: per-spider sqlite checkpoint.
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="databox"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="info"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="info"]/h3/a/text()').extract_first()
            conference_info = post.xpath('div[@class="info"]/p/text()').extract_first()
            conference_time = conference_info.split(' ')[0]
            conference_address = conference_info.split(' ')[-1]
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title,
                'conference_time': conference_time,
                'conference_address': conference_address,
                'crawl_time': int(time.time()),
                'site_name': self.name,
                'type': 'conference',
                'module': 'brand',
            }
            # Treat the first event on the page as the newest one and store its URL in sqlite.
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # Compare against the newest URL saved by the previous run; if they match, we have
            # reached events that were already crawled last time, so stop here.
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            yield item
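
# Usage note: assuming these spiders live inside a standard Scrapy project (the
# pipeline paths above point to a project called `datapark`), each one is run with
# `scrapy crawl <name>`, e.g. `scrapy crawl socialbeta`. A run stops as soon as it
# reaches the URL recorded in data.sqlite by the previous run of the same spider.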