Last active
December 14, 2015 19:39
-
-
Save stav/5137869 to your computer and use it in GitHub Desktop.
Scrapy blocking spider that renders JavaScript with PyQt4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyQt4.QtCore import QUrl | |
from PyQt4.QtGui import QApplication | |
from PyQt4.QtWebKit import QWebPage | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.spider import BaseSpider | |
from scrapy.http import HtmlResponse | |
class Render(QWebPage):
    """Blocking WebKit page loader that exposes the rendered HTML.

    Constructing an instance starts loading *url* and runs the Qt event loop
    until the page emits ``loadFinished``, so the constructor does not return
    before the load has finished.
    """

    def __init__(self, url):
        """Load *url* and block until the ``loadFinished`` signal fires.

        Args:
            url: Address of the page to render.
        """
        # Qt allows only one application object per process; reuse a live one
        # so that rendering a second page does not construct a duplicate.
        app = QApplication.instance()
        if app is None:
            app = QApplication([url])
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.mainFrame().load(QUrl(url))
        # Runs the event loop; returns once _load_finished() calls quit().
        self.app.exec_()

    def _load_finished(self, result):
        """Stop the event loop so the blocked constructor can return.

        Args:
            result: Value delivered with the ``loadFinished`` signal; unused.
        """
        self.app.quit()

    @property
    def html(self):
        """Return the main frame's current markup as UTF-8 encoded bytes.

        Callers wrapping the result in a response object should treat the
        bytes as UTF-8.
        """
        # UTF-8 keeps every character of the rendered document, whereas
        # toAscii() replaces anything outside its 8-bit codec with '?'.
        return self.mainFrame().toHtml().toUtf8().data()
class WebkitSpider(BaseSpider):
    """Spider that renders its start page with ``Render`` and counts hotel links."""

    name = 'webkit'
    allowed_domains = ['expedia.be']
    start_urls = ('http://www.expedia.be/Hotel-Search#regionId=601725',)

    def parse(self, response):
        """Render the requested URL through WebKit and log the links found.

        The URL of the originating request is rendered again by ``Render``,
        the resulting markup is wrapped in an ``HtmlResponse``, and the links
        matching the hotel-title XPath are extracted and counted. Nothing is
        returned; the outcome is only written to the spider log.

        Args:
            response: Response whose originating request URL is re-rendered.
        """
        page_url = response.request.url
        self.log('parse(): rendering %s' % page_url)

        # Anchors with class "title" inside the "hotels" result container.
        hotel_xpath = ('id("searchResultsContainer")/div/'
                       'div[contains(@class,"hotels")]//a[@class="title"]')
        extractor = SgmlLinkExtractor(restrict_xpaths=hotel_xpath)

        # Render(...) blocks until the page has finished loading.
        rendered_body = Render(page_url).html
        rendered = HtmlResponse(page_url, body=rendered_body)

        hotel_links = extractor.extract_links(rendered)
        self.log('parse(): found %d hotels' % len(hotel_links))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment