Skip to content

Instantly share code, notes, and snippets.

@mauromarano
Created June 19, 2017 09:06
Show Gist options
  • Save mauromarano/be5c9f22731ec8e360becd43ea797c44 to your computer and use it in GitHub Desktop.
Save mauromarano/be5c9f22731ec8e360becd43ea797c44 to your computer and use it in GitHub Desktop.
Get a screenshot of a webpage with python
# To install PyQt4 on macOS: brew install cartr/qt4/pyqt
# How to remove JavaScript tags from HTML: https://stackoverflow.com/questions/8554035/remove-all-javascript-tags-and-style-tags-from-html-with-python-and-the-lxml-mod
import sys
import time
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
import lxml
from lxml.html.clean import Cleaner
def clean_js_and_css(url):
    """Parse the document at *url* and return its HTML without JS and CSS.

    The document is read with ``lxml.html.parse``, run through an
    ``lxml.html.clean.Cleaner`` and serialized again.

    Args:
        url: Location handed to ``lxml.html.parse``.

    Returns:
        The cleaned document as produced by ``lxml.html.tostring``.
    """
    cleaner = Cleaner()
    # Activate the JavaScript filter.
    cleaner.javascript = True
    # The style filter is off by default in Cleaner; without enabling it the
    # CSS that this function's name promises to remove was left in place.
    cleaner.style = True
    document = lxml.html.parse(url)
    return lxml.html.tostring(cleaner.clean_html(document))
def download_screenshot_and_html(url, name):
    """Save a screenshot and the cleaned HTML of the page at *url*.

    Two files are produced: ``<name>.png``, rendered by ``Screenshot``, and
    ``<name>.html``, holding the output of ``clean_js_and_css``.

    Args:
        url: Address of the page to capture.
        name: Output path without extension.
    """
    html = clean_js_and_css(url)
    Screenshot().capture(url, name + ".png")
    # clean_js_and_css returns the result of lxml.html.tostring, which is a
    # byte string unless a text encoding is requested. Binary mode writes it
    # verbatim (no newline translation) and also works on Python 3, where
    # bytes cannot be written to a text-mode file.
    with open(name + ".html", 'wb') as html_file:
        html_file.write(html)
class Screenshot(QWebView):
    """QWebView that loads a URL and saves a rendering of the page to a file."""

    def __init__(self):
        # Qt allows only one application object per process, so reuse the
        # existing one when there is one. Unconditionally constructing a new
        # QApplication made every Screenshot after the first one fail.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebView.__init__(self)
        # Set by _loadFinished, reset by wait_load.
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load *url*, render the page and save the image to *output_file*.

        Args:
            url: Address of the page to load.
            output_file: Path passed to ``QImage.save``.
        """
        self.load(QUrl(url))
        self.wait_load()
        # Resize the viewport to the size of the page contents so that the
        # rendering is not limited to the default viewport.
        frame = self.page().mainFrame()
        self.page().setViewportSize(frame.contentsSize())
        # Render the frame into an image of the same size.
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        frame.render(painter)
        painter.end()
        # Single-argument call form: prints the same text on Python 2 and 3.
        print('saving %s' % (output_file,))
        image.save(output_file)

    def wait_load(self, delay=0):
        """Process application events until the page load has finished.

        Args:
            delay: Seconds to sleep between two rounds of event processing.
        """
        while not self._loaded:
            self.app.processEvents()
            time.sleep(delay)
        # Reset the flag so the next load can be waited for as well.
        self._loaded = False

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: record that loading has ended.

        *result* is ignored.
        """
        self._loaded = True
# Example run: capture gazzetta.it into files named after 'gazzetta'.
download_screenshot_and_html(url='http://gazzetta.it', name='gazzetta')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment