Collect data with Scrapy and load it into MongoDB. ref: http://qiita.com/tamonoki/items/ce58ff209f8eae808162
$ scrapy crawl WebspiderSpider
# tutorial/items.py: the fields collected for each crawled page
import scrapy

class WebItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    date = scrapy.Field()
# tutorial/pipelines.py
from pymongo import MongoClient  # connection to MongoDB

class TutorialPipeline(object):
    collection_name = 'scrapy_items'  # collection the items are written to (placeholder name)

    def __init__(self, mongo_uri, mongo_db, mongolab_user, mongolab_pass):
        # initialize instance variables from the arguments passed at construction
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongolab_user = mongolab_user
        self.mongolab_pass = mongolab_pass

    @classmethod  # receives the class as its first argument, so class attributes are accessible
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),  # read the variables defined in settings.py
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
            mongolab_user=crawler.settings.get('MONGOLAB_USER'),
            mongolab_pass=crawler.settings.get('MONGOLAB_PASS')
        )  # these become the arguments of __init__

    def open_spider(self, spider):  # runs when the spider starts: open the database connection
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db.authenticate(self.mongolab_user, self.mongolab_pass)  # pymongo 2.x/3.x style auth

    def close_spider(self, spider):  # runs when the spider finishes: close the database connection
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].update_one(
            {u'link': item['link']},
            {"$set": dict(item)},
            upsert=True
        )  # look up by link: insert a new document if none exists, otherwise update it
        return item
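To confirm the upserts landed, a quick check from a Python shell; this is a sketch assuming a local, unauthenticated mongod, the default 'items' database from the pipeline, and the placeholder collection name used above:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['items']  # matches the MONGO_DATABASE default in from_crawler
for doc in db['scrapy_items'].find().limit(5):  # matches the placeholder collection name
    print(doc['title'], doc['link'])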
# tutorial/settings.py (excerpt): crawl politeness settings
REDIRECT_MAX_TIMES = 6   # stop following redirect chains after 6 hops
RETRY_ENABLED = False    # don't retry failed requests
DOWNLOAD_DELAY = 10      # wait 10 seconds between requests to the same site
COOKIES_ENABLED = False  # don't store or send cookies
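The excerpt above only tunes crawl politeness. For TutorialPipeline to run at all, settings.py also needs the pipeline registration and the MongoDB variables that from_crawler reads; a minimal sketch, with placeholder URI and credential values:

ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,  # 300 is the pipeline ordering priority
}
MONGO_URI = 'mongodb://localhost:27017'  # placeholder
MONGO_DATABASE = 'items'
MONGOLAB_USER = 'user'       # placeholder
MONGOLAB_PASS = 'password'   # placeholder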
tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # project's Python module, you'll import your code from here
        __init__.py
        items.py          # project items file
        pipelines.py      # project pipelines file
        settings.py       # project settings file
        spiders/          # a directory where you'll later put your spiders
            __init__.py
            ...
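This layout is what the Scrapy CLI generates. Assuming the project and spider names used in this gist, a skeleton close to the files above could be created with (the generated spider then needs the edits shown below):

$ scrapy startproject tutorial
$ cd tutorial
$ scrapy genspider -t crawl WebspiderSpider example.com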
# -*- coding: utf-8 -*-
import scrapy
import datetime
from tutorial.items import WebItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class WebspiderSpider(CrawlSpider):  # the class name itself carries no special meaning
    name = 'WebspiderSpider'  # this matters: the spider (crawler) is run by this name
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    xpath = {
        'title': "//title/text()",
    }

    list_allow = [r'(regex)']  # links matching these patterns are crawled
    list_deny = [
        r'/example/hogehoge/hoge/',  # example of a link pattern not to crawl; several can be listed
    ]
    list_allow_parse = [r'(regex)']  # links to extract data from
    list_deny_parse = [  # links not to extract data from
        r'(regex)',
        r'(regex)',
    ]

    rules = (
        # crawl rule
        Rule(LinkExtractor(
            allow=list_allow,
            deny=list_deny,
        ),
            follow=True  # follow the matched links
        ),
        # data-extraction rule
        Rule(LinkExtractor(
            allow=list_allow_parse,
            deny=list_deny_parse,
            unique=True  # don't extract data from the same link twice
        ),
            callback='parse_items'  # for matching links, run the extraction function named here
        ),
    )

    # data-extraction function
    def parse_items(self, response):  # response holds the fetched page
        item = WebItem()  # the item class defined in items.py
        item['title'] = response.xpath(self.xpath['title']).extract()[0]
        item['link'] = response.url
        item['date'] = datetime.datetime.utcnow() + datetime.timedelta(hours=9)  # current time, shifted to JST (UTC+9) before storing
        yield item
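Before running the full crawl, the title XPath can be tried interactively in Scrapy's shell; the URL here is the same stand-in as in the spider, so the output will differ for a real target:

$ scrapy shell 'http://www.example.com'
>>> response.xpath("//title/text()").extract()[0]
'Example Domain'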