Skip to content

Instantly share code, notes, and snippets.

@polikeiji
Last active February 24, 2016 03:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save polikeiji/44358e4732c61827b08e to your computer and use it in GitHub Desktop.
Save polikeiji/44358e4732c61827b08e to your computer and use it in GitHub Desktop.
PythonでJavaScriptを使ったWebサイトをスクレイピングする ref: http://qiita.com/polikeiji/items/94062c1d9ef2f86a0c27
# -*- coding: utf-8 -*-
import os.path
from urlparse import urlparse
import arrow
from scrapy.http import HtmlResponse
from selenium.webdriver import Firefox
# Single module-level Firefox WebDriver, created as soon as this module is
# imported. It is used by SeleniumMiddleware.process_request to load pages
# and is shut down through close_driver().
driver = Firefox()
class SeleniumMiddleware(object):
    """Downloader middleware that loads each request in the Selenium browser.

    The page is opened with the module-level ``driver`` and the resulting
    page source is handed back to Scrapy as the response.
    """

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and wrap the result.

        Args:
            request: The Scrapy request being processed.
            spider: The spider that issued the request (not used here).

        Returns:
            An ``HtmlResponse`` built from the browser's current URL and
            page source, encoded as UTF-8 and tied to ``request``.
        """
        driver.get(request.url)

        # Read the URL from the browser rather than the request, so the
        # response carries whatever address the browser ended up on.
        rendered_url = driver.current_url
        rendered_html = driver.page_source

        return HtmlResponse(
            rendered_url,
            body=rendered_html,
            encoding='utf-8',
            request=request,
        )
def close_driver():
    """Shut down the module-level Selenium ``driver``.

    Uses ``quit()`` rather than ``close()``: ``close()`` only closes the
    current browser window, whereas ``quit()`` ends the whole WebDriver
    session, so no browser or driver process is left behind after a crawl.
    """
    driver.quit()
# -*- coding: utf-8 -*-
import scrapy
from ..selenium_middleware import close_driver
class SomeSpider(scrapy.Spider):
    """Example spider whose requests go through ``SeleniumMiddleware``."""

    name = "some_spider"
    allowed_domains = ["somedomain"]
    start_urls = (
        'http://somedomain/',
    )

    # Per-spider settings: register the Selenium middleware (order 0) and
    # wait 0.5 seconds between downloads.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "some_crawler.selenium_middleware.SeleniumMiddleware": 0,
        },
        "DOWNLOAD_DELAY": 0.5,
    }

    def parse(self, response):
        """Handle a response produced by the Selenium middleware.

        Args:
            response: The response for one of the crawled URLs.
        """
        # Crawler processing goes here (extract items / follow links).
        # An explicit statement is required: a comment alone is not a
        # valid function body.
        pass

    def closed(self, reason):
        """Release the shared Selenium browser by calling ``close_driver``.

        Args:
            reason: Passed in by the caller; not used here.
        """
        close_driver()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment