Skip to content

Instantly share code, notes, and snippets.

@phaustin
Created November 17, 2016 23:14
Show Gist options
  • Save phaustin/03ad3057e9bdf6c8a6c722f6aca088c0 to your computer and use it in GitHub Desktop.
Save phaustin/03ad3057e9bdf6c8a6c722f6aca088c0 to your computer and use it in GitHub Desktop.
Qt web scraper (PyQt4 / QtWebKit)
# Source: http://stackoverflow.com/questions/21274865/scrape-multiple-urls-using-qwebpage
import sys
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Load a sequence of URLs one after another in a single QWebPage.

    The main frame's ``loadFinished`` signal drives the loop: each time a
    load finishes the page is processed and the next URL is requested.
    When no URLs remain, the application is asked to quit.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        # Start with an exhausted iterator so fetchNext() is safe to call
        # before start() instead of raising AttributeError.
        self._urls = iter(())
        # Last URL handed to the frame; used to report failed loads.
        self._current_url = None
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Begin loading *urls* (an iterable of URL strings) in order."""
        self._urls = iter(urls)
        if not self.fetchNext():
            # With nothing to load, loadFinished never fires and nothing
            # would ever call quit(). Defer the quit via a zero-length
            # timer so it is delivered from the event loop.
            QtCore.QTimer.singleShot(0, QtGui.qApp.quit)

    def fetchNext(self):
        """Request the next URL.

        Returns True if a load was started, False if no URLs remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        self._current_url = url
        self.mainFrame().load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self):
        """Handle the page currently held by the main frame.

        Prints the page's bytesReceived() count and the frame's URL.
        """
        url = self.mainFrame().url().toString()
        html = self.mainFrame().toHtml()
        # do stuff with html...
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self, ok=True):
        """Slot for ``loadFinished``: process the page, then move on.

        *ok* is the success flag carried by the signal. A failed load is
        reported and skipped rather than processed as if it had loaded.
        It defaults to True so a direct call without arguments still works.
        """
        if ok:
            self.processCurrentPage()
        else:
            print('failed: %s' % self._current_url)
        if not self.fetchNext():
            QtGui.qApp.quit()
if __name__ == '__main__':
    # Build the test URLs: one PyQt4 documentation page for every
    # Q-prefixed name exported by the QtWebKit module.
    url = 'http://pyqt.sourceforge.net/Docs/PyQt4/%s.html'
    urls = [
        url % name.lower()
        for name in dir(QtWebKit)
        if name.startswith('Q')
    ]

    # A QApplication must exist before the page is created.
    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)

    # Run the Qt event loop and use its result as the process exit status.
    sys.exit(app.exec_())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment