Last active
February 24, 2016 03:27
-
-
Save polikeiji/44358e4732c61827b08e to your computer and use it in GitHub Desktop.
PythonでJavaScriptを使ったWebサイトをスクレイピングする ref: http://qiita.com/polikeiji/items/94062c1d9ef2f86a0c27
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os.path | |
from urlparse import urlparse | |
import arrow | |
from scrapy.http import HtmlResponse | |
from selenium.webdriver import Firefox | |
# A single Firefox WebDriver is started when this module is imported and is
# shared by every request that SeleniumMiddleware handles; close_driver()
# is the function that closes it.
driver = Firefox()
class SeleniumMiddleware(object):
    """Downloader middleware that fetches pages with the shared Firefox driver.

    The page is loaded in the browser, so the HTML handed back to the spider
    is the browser's current page source rather than the raw HTTP body.
    """

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and wrap the result.

        Args:
            request: The Scrapy request being processed; its ``url`` is
                opened in the browser.
            spider: The spider issuing the request (unused here).

        Returns:
            An ``HtmlResponse`` built from the browser's current URL and
            page source, encoded as UTF-8 and tied to ``request``. Per
            Scrapy's downloader-middleware contract, returning a response
            here means Scrapy does not download the URL itself.
        """
        driver.get(request.url)

        # Read the URL first, then the source, after navigation finished;
        # the URL may differ from request.url if the browser was redirected.
        final_url = driver.current_url
        rendered_html = driver.page_source

        return HtmlResponse(
            final_url,
            body=rendered_html,
            encoding='utf-8',
            request=request,
        )
def close_driver():
    """Shut down the shared Firefox WebDriver.

    ``quit()`` is used instead of ``close()``: ``close()`` only closes the
    current browser window, while ``quit()`` ends the whole WebDriver
    session, so no browser or driver process is left running after the
    crawl finishes.
    """
    driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from ..selenium_middleware import close_driver | |
class SomeSpider(scrapy.Spider):
    """Example spider whose requests go through ``SeleniumMiddleware``."""

    name = "some_spider"
    allowed_domains = ["somedomain"]
    start_urls = (
        'http://somedomain/',
    )
    # Per-spider settings: register the Selenium downloader middleware and
    # wait 0.5 seconds between requests.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "some_crawler.selenium_middleware.SeleniumMiddleware": 0,
        },
        "DOWNLOAD_DELAY": 0.5,
    }

    def parse(self, response):
        """Handle a downloaded page (placeholder).

        Args:
            response: The response produced for one of the spider's requests.
        """
        # Crawling / extraction logic goes here.
        # An explicit statement is required: a body consisting only of a
        # comment is a syntax error.
        pass

    def closed(self, reason):
        """Release the shared browser once the spider has finished.

        Args:
            reason: The close reason supplied by Scrapy (unused here).
        """
        close_driver()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment