stav/gist:5137869

## gistfile1.py
from PyQt4.QtCore import QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.http import HtmlResponse

class Render(QWebPage):
    """Load a URL in a WebKit page and expose the resulting markup.

    Constructing an instance blocks: ``__init__`` starts the Qt event loop
    and only returns once the page's ``loadFinished`` signal has stopped it.
    """

    def __init__(self, url):
        """Start loading *url* and run the event loop until loading finishes.

        :param url: address of the page to render, given as a string.
        """
        # Qt allows a single QApplication per process, so reuse the one an
        # earlier Render created instead of constructing a second instance.
        app = QApplication.instance()
        if app is None:
            app = QApplication([url])
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.mainFrame().load(QUrl(url))
        # Blocks here; _load_finished() quits the loop again.
        self.app.exec_()

    def _load_finished(self, result):
        """Slot for ``loadFinished``: stop the event loop run by ``__init__``.

        :param result: value delivered by the signal; not inspected here.
        """
        self.app.quit()

    @property
    def html(self):
        """Return the main frame's markup converted to an 8-bit string.

        NOTE(review): ``toAscii()`` may be lossy for non-ASCII text --
        confirm the encoding the consumer expects.
        """
        return self.mainFrame().toHtml().toAscii().data()

class WebkitSpider(BaseSpider):
    """Spider that renders the requested page with ``Render`` before
    extracting links from it."""

    name = 'webkit'
    allowed_domains = ['expedia.be']
    start_urls = ('http://www.expedia.be/Hotel-Search#regionId=601725',)

    def parse(self, response):
        """Render the request URL and log how many hotel links were found.

        :param response: response handed to the callback; only
            ``response.request.url`` is read from it.
        """
        page_url = response.request.url
        self.log('parse(): rendering %s' % page_url)

        # Restrict extraction to the title anchors of the results container.
        hotel_link_xpath = (
            'id("searchResultsContainer")/div/'
            'div[contains(@class,"hotels")]//a[@class="title"]'
        )
        extractor = SgmlLinkExtractor(restrict_xpaths=hotel_link_xpath)

        # Build a fresh response from the markup produced by Render.
        rendered_markup = Render(page_url).html
        rendered_response = HtmlResponse(page_url, body=rendered_markup)

        hotel_links = extractor.extract_links(rendered_response)
        self.log('parse(): found %d hotels' % len(hotel_links))
	from PyQt4.QtCore import QUrl
	from PyQt4.QtGui import QApplication
	from PyQt4.QtWebKit import QWebPage

	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
	from scrapy.spider import BaseSpider
	from scrapy.http import HtmlResponse

	class Render(QWebPage):
		"""Load a URL in a WebKit page and expose the resulting markup.

		Constructing an instance blocks: ``__init__`` starts the Qt event
		loop and only returns once ``loadFinished`` has stopped it.
		"""

		def __init__(self, url):
			"""Start loading *url* and run the event loop until loading finishes.

			:param url: address of the page to render, given as a string.
			"""
			# Qt allows a single QApplication per process, so reuse an
			# existing instance instead of constructing a second one.
			app = QApplication.instance()
			if app is None:
				app = QApplication([url])
			self.app = app
			QWebPage.__init__(self)
			self.loadFinished.connect(self._load_finished)
			self.mainFrame().load(QUrl(url))
			# Blocks here; _load_finished() quits the loop again.
			self.app.exec_()

		def _load_finished(self, result):
			"""Slot for ``loadFinished``: stop the event loop run by ``__init__``.

			:param result: value delivered by the signal; not inspected here.
			"""
			self.app.quit()

		@property
		def html(self):
			"""Return the main frame's markup converted to an 8-bit string."""
			return self.mainFrame().toHtml().toAscii().data()

	class WebkitSpider(BaseSpider):
		"""Spider that renders the requested page with ``Render`` before
		extracting links from it."""

		name = 'webkit'
		allowed_domains = ['expedia.be']
		start_urls = ('http://www.expedia.be/Hotel-Search#regionId=601725',)

		def parse(self, response):
			"""Render the request URL and log how many hotel links were found.

			:param response: response handed to the callback; only
				``response.request.url`` is read from it.
			"""
			url = response.request.url
			self.log('parse(): rendering %s' % url)

			# Restrict extraction to the title anchors of the results container.
			xpath = (
				'id("searchResultsContainer")/div/'
				'div[contains(@class,"hotels")]//a[@class="title"]'
			)
			le = SgmlLinkExtractor(restrict_xpaths=xpath)
			# Build a fresh response from the markup produced by Render.
			r = HtmlResponse(url, body=Render(url).html)
			hotels = le.extract_links(r)
			self.log('parse(): found %d hotels' % len(hotels))