Skip to content

Instantly share code, notes, and snippets.

@stav
Last active December 14, 2015 19:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stav/5137869 to your computer and use it in GitHub Desktop.
Save stav/5137869 to your computer and use it in GitHub Desktop.
Scrapy blocking spider that renders JavaScript with PyQt4
from PyQt4.QtCore import QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.http import HtmlResponse
class Render(QWebPage):
    """Load a URL in a QtWebKit page and block until loading has finished.

    Constructing an instance starts the Qt event loop and only returns
    once the page's ``loadFinished`` signal has fired, so the rendered
    markup is available through ``html`` as soon as the constructor
    returns.

    Attributes:
        app: The QApplication whose event loop drives the page load.
        load_ok: ``None`` until loading finishes, then the boolean success
            flag reported by the ``loadFinished`` signal.
    """

    def __init__(self, url):
        """Load ``url`` and block until the page reports it has finished.

        Args:
            url: Address of the page to load, as a string.
        """
        # Qt permits only one QApplication per process; reuse the existing
        # one so that a second Render in the same process does not fail.
        app = QApplication.instance()
        if app is None:
            app = QApplication([url])
        self.app = app
        QWebPage.__init__(self)
        self.load_ok = None
        self.loadFinished.connect(self._load_finished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _load_finished() asks the event loop to quit.
        self.app.exec_()

    def _load_finished(self, result):
        """Record the load outcome and stop the event loop.

        Args:
            result: Success flag passed by the ``loadFinished`` signal.
        """
        self.load_ok = bool(result)
        self.app.quit()

    @property
    def html(self):
        """Return the main frame's markup as an 8-bit byte string.

        The conversion goes through ``QString.toAscii()``, so characters
        that its 8-bit encoding cannot represent are not preserved.
        """
        return self.mainFrame().toHtml().toAscii().data()
class WebkitSpider(BaseSpider):
    """Spider that renders its start page with WebKit before extracting links.

    The response downloaded by Scrapy is used only for its request URL;
    that URL is loaded again through ``Render`` and the links are taken
    from the rendered markup.
    """

    name = 'webkit'
    allowed_domains = ['expedia.be']
    start_urls = ('http://www.expedia.be/Hotel-Search#regionId=601725',)

    def parse(self, response):
        """Render the requested URL and log how many hotel links it holds.

        Args:
            response: The response Scrapy downloaded for the request.
        """
        page_url = response.request.url
        self.log('parse(): rendering %s' % page_url)

        # Anchors with class "title" inside the "hotels" result blocks.
        hotel_link_xpath = (
            'id("searchResultsContainer")/div/'
            'div[contains(@class,"hotels")]//a[@class="title"]'
        )
        extractor = SgmlLinkExtractor(restrict_xpaths=hotel_link_xpath)

        # Constructing Render blocks until the page has finished loading.
        rendered_page = Render(page_url)
        rendered_response = HtmlResponse(page_url, body=rendered_page.html)

        hotel_links = extractor.extract_links(rendered_response)
        hotel_count = len(hotel_links)
        self.log('parse(): found %d hotels' % hotel_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment