tamonoki/commandline

## commandline
$ scrapy crawl WebspiderSpider

## items.py
import scrapy

class WebItem(scrapy.Item):
    """Item holding the fields collected for one crawled page."""

    title = scrapy.Field()  # text of the page's <title> element
    link = scrapy.Field()  # URL of the page; the pipeline uses it as the upsert key
    date = scrapy.Field()  # timestamp recorded when the page was scraped

## pipelines.py
from pymongo import MongoClient  # mongoDB との接続
import datetime

class TutorialPipeline(object):
    """Item pipeline that upserts scraped items into a MongoDB collection.

    Connection parameters come from the Scrapy settings (see
    ``from_crawler``). One document is kept per distinct ``link`` value:
    an item whose link is already stored updates that document, otherwise
    a new document is inserted.
    """

    # Name of the collection the items are written to; ``process_item``
    # reads it through ``self.collection_name``.
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db, mongolab_user, mongolab_pass):
        """Store the connection parameters; no connection is opened here.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write to.
            mongolab_user: User name for authentication, or None to
                connect without explicit credentials.
            mongolab_pass: Password belonging to ``mongolab_user``.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongolab_user = mongolab_user
        self.mongolab_pass = mongolab_pass

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the values defined in settings.py.

        ``MONGO_DATABASE`` falls back to ``'items'`` when it is not set;
        the other settings default to None.
        """
        settings = crawler.settings
        return cls(
            mongo_uri=settings.get('MONGO_URI'),
            mongo_db=settings.get('MONGO_DATABASE', 'items'),
            mongolab_user=settings.get('MONGOLAB_USER'),
            mongolab_pass=settings.get('MONGOLAB_PASS'),
        )

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        # Credentials are handed to MongoClient because
        # Database.authenticate() no longer exists in PyMongo 4.
        # authSource keeps authentication against the target database.
        credentials = {}
        if self.mongolab_user is not None:
            credentials = {
                'username': self.mongolab_user,
                'password': self.mongolab_pass,
                'authSource': self.mongo_db,
            }
        self.client = MongoClient(self.mongo_uri, **credentials)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item, or update the stored document with the same link.

        Returns:
            The unmodified item, so later pipelines can keep processing it.
        """
        # update_one replaces the removed Collection.update(); with a $set
        # document and upsert=True it touches at most one document.
        self.db[self.collection_name].update_one(
            {u'link': item['link']},
            {"$set": dict(item)},
            upsert=True,
        )
        return item

## settings.py
# Conservative crawl behaviour: slow, stateless, and without retries.
COOKIES_ENABLED = False  # disable the cookies middleware
DOWNLOAD_DELAY = 10  # seconds to wait between consecutive requests
RETRY_ENABLED = False  # do not retry failed requests
REDIRECT_MAX_TIMES = 6  # maximum number of redirects followed per request

## tutorial
tutorial/
    scrapy.cfg            # deploy configuration file

    tutorial/             # project's Python module, you'll import your code from here
        __init__.py

        items.py          # project items file

        pipelines.py      # project pipelines file

        settings.py       # project settings file

        spiders/          # a directory where you'll later put your spiders
            __init__.py
            ...

## webspider.py
# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import WebItem
import re
import datetime
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class WebspiderSpider(CrawlSpider):
    """Crawl example.com and yield one ``WebItem`` per page selected for parsing.

    The class name itself is not significant; the spider is selected by
    its ``name`` attribute.
    """

    # Important: this is the name passed to ``scrapy crawl`` to run the spider.
    name = 'WebspiderSpider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    # XPath expressions keyed by the item field they fill.
    xpath = {
        'title': "//title/text()",
    }

    # Links matching these patterns are followed by the crawler.
    list_allow = [r'(正規表現)']
    # Links matching these patterns are never followed (several may be listed).
    list_deny = [
        r'/exsample/hogehoge/hoge/',
    ]
    # Links matching these patterns have their data extracted.
    list_allow_parse = [r'(正規表現)']
    # Links matching these patterns are excluded from data extraction.
    list_deny_parse = [
        r'(正規表現)',
        r'(正規表現)',
    ]

    rules = (
        # Crawling rule: follow matching links deeper into the site.
        Rule(
            LinkExtractor(
                allow=list_allow,
                deny=list_deny,
            ),
            follow=True,
        ),
        # Extraction rule: run ``parse_items`` on matching links;
        # ``unique=True`` avoids extracting the same link twice.
        Rule(
            LinkExtractor(
                allow=list_allow_parse,
                deny=list_deny_parse,
                unique=True,
            ),
            callback='parse_items',
        ),
    )

    def parse_items(self, response):
        """Extract the title, URL and scrape time of one page.

        Args:
            response: The downloaded page.

        Yields:
            A ``WebItem`` with ``title``, ``link`` and ``date`` filled in.
        """
        item = WebItem()
        # extract_first() tolerates pages without a <title>; indexing the
        # extracted list with [0] would raise IndexError for them.
        item['title'] = response.xpath(self.xpath['title']).extract_first(default='')
        item['link'] = response.url
        # Current time shifted to Japan Standard Time (UTC+9). The value is
        # a naive datetime, so it carries no timezone information itself.
        item['date'] = datetime.datetime.utcnow() + datetime.timedelta(hours=9)

        yield item
	import scrapy

	class WebItem(scrapy.Item):
	    """Item holding the fields collected for one crawled page."""

	    title = scrapy.Field()  # text of the page's <title> element
	    link = scrapy.Field()  # URL of the page
	    date = scrapy.Field()  # timestamp recorded when the page was scraped
	from pymongo import MongoClient # mongoDB との接続
	import datetime

	class TutorialPipeline(object):
	    """Item pipeline that upserts scraped items into a MongoDB collection.

	    Connection parameters come from the Scrapy settings (see
	    ``from_crawler``). One document is kept per distinct ``link`` value:
	    an item whose link is already stored updates that document, otherwise
	    a new document is inserted.
	    """

	    # Name of the collection the items are written to; ``process_item``
	    # reads it through ``self.collection_name``.
	    collection_name = 'scrapy_items'

	    def __init__(self, mongo_uri, mongo_db, mongolab_user, mongolab_pass):
	        """Store the connection parameters; no connection is opened here.

	        Args:
	            mongo_uri: MongoDB connection URI.
	            mongo_db: Name of the database to write to.
	            mongolab_user: User name for authentication, or None to
	                connect without explicit credentials.
	            mongolab_pass: Password belonging to ``mongolab_user``.
	        """
	        self.mongo_uri = mongo_uri
	        self.mongo_db = mongo_db
	        self.mongolab_user = mongolab_user
	        self.mongolab_pass = mongolab_pass

	    @classmethod
	    def from_crawler(cls, crawler):
	        """Build the pipeline from the values defined in settings.py.

	        ``MONGO_DATABASE`` falls back to ``'items'`` when it is not set;
	        the other settings default to None.
	        """
	        settings = crawler.settings
	        return cls(
	            mongo_uri=settings.get('MONGO_URI'),
	            mongo_db=settings.get('MONGO_DATABASE', 'items'),
	            mongolab_user=settings.get('MONGOLAB_USER'),
	            mongolab_pass=settings.get('MONGOLAB_PASS'),
	        )

	    def open_spider(self, spider):
	        """Open the database connection when the spider starts."""
	        # Credentials are handed to MongoClient because
	        # Database.authenticate() no longer exists in PyMongo 4.
	        # authSource keeps authentication against the target database.
	        credentials = {}
	        if self.mongolab_user is not None:
	            credentials = {
	                'username': self.mongolab_user,
	                'password': self.mongolab_pass,
	                'authSource': self.mongo_db,
	            }
	        self.client = MongoClient(self.mongo_uri, **credentials)
	        self.db = self.client[self.mongo_db]

	    def close_spider(self, spider):
	        """Close the database connection when the spider finishes."""
	        self.client.close()

	    def process_item(self, item, spider):
	        """Insert the item, or update the stored document with the same link.

	        Returns:
	            The unmodified item, so later pipelines can keep processing it.
	        """
	        # update_one replaces the removed Collection.update(); with a $set
	        # document and upsert=True it touches at most one document.
	        self.db[self.collection_name].update_one(
	            {u'link': item['link']},
	            {"$set": dict(item)},
	            upsert=True,
	        )
	        return item
	# Conservative crawl behaviour: slow, stateless, and without retries.
	COOKIES_ENABLED = False  # disable the cookies middleware
	DOWNLOAD_DELAY = 10  # seconds to wait between consecutive requests
	RETRY_ENABLED = False  # do not retry failed requests
	REDIRECT_MAX_TIMES = 6  # maximum number of redirects followed per request
	tutorial/
	    scrapy.cfg            # deploy configuration file

	    tutorial/             # project's Python module, you'll import your code from here
	        __init__.py

	        items.py          # project items file

	        pipelines.py      # project pipelines file

	        settings.py       # project settings file

	        spiders/          # a directory where you'll later put your spiders
	            __init__.py
	            ...
	# -*- coding: utf-8 -*-
	import scrapy
	from tutorial.items import WebItem
	import re
	import datetime
	from scrapy.spiders import CrawlSpider, Rule
	from scrapy.linkextractors import LinkExtractor

	class WebspiderSpider(CrawlSpider):
	    """Crawl example.com and yield one ``WebItem`` per page selected for parsing.

	    The class name itself is not significant; the spider is selected by
	    its ``name`` attribute.
	    """

	    # Important: this is the name passed to ``scrapy crawl`` to run the spider.
	    name = 'WebspiderSpider'
	    allowed_domains = ['example.com']
	    start_urls = ['http://www.example.com']

	    # XPath expressions keyed by the item field they fill.
	    xpath = {
	        'title': "//title/text()",
	    }

	    # Links matching these patterns are followed by the crawler.
	    list_allow = [r'(正規表現)']
	    # Links matching these patterns are never followed (several may be listed).
	    list_deny = [
	        r'/exsample/hogehoge/hoge/',
	    ]
	    # Links matching these patterns have their data extracted.
	    list_allow_parse = [r'(正規表現)']
	    # Links matching these patterns are excluded from data extraction.
	    list_deny_parse = [
	        r'(正規表現)',
	        r'(正規表現)',
	    ]

	    rules = (
	        # Crawling rule: follow matching links deeper into the site.
	        Rule(
	            LinkExtractor(
	                allow=list_allow,
	                deny=list_deny,
	            ),
	            follow=True,
	        ),
	        # Extraction rule: run ``parse_items`` on matching links;
	        # ``unique=True`` avoids extracting the same link twice.
	        Rule(
	            LinkExtractor(
	                allow=list_allow_parse,
	                deny=list_deny_parse,
	                unique=True,
	            ),
	            callback='parse_items',
	        ),
	    )

	    def parse_items(self, response):
	        """Extract the title, URL and scrape time of one page.

	        Args:
	            response: The downloaded page.

	        Yields:
	            A ``WebItem`` with ``title``, ``link`` and ``date`` filled in.
	        """
	        item = WebItem()
	        # extract_first() tolerates pages without a <title>; indexing the
	        # extracted list with [0] would raise IndexError for them.
	        item['title'] = response.xpath(self.xpath['title']).extract_first(default='')
	        item['link'] = response.url
	        # Current time shifted to Japan Standard Time (UTC+9). The value is
	        # a naive datetime, so it carries no timezone information itself.
	        item['date'] = datetime.datetime.utcnow() + datetime.timedelta(hours=9)

	        yield item