Skip to content

Instantly share code, notes, and snippets.

@vitorio
Created February 1, 2016 04:59
Show Gist options
  • Save vitorio/66b3fbd9930aeb2562e5 to your computer and use it in GitHub Desktop.
Save vitorio/66b3fbd9930aeb2562e5 to your computer and use it in GitHub Desktop.
(2011) wget + moz-headless-screenshot to take screenshots of archived URLs
# Assumptions:
# moz-headless-screenshot in ./bin
# wget 1.12 in system path
# Ubuntu 10.10 defaults to Python 2.6.6 so we provide 2.7's subprocess module ourselves
# I guess we could also install 2.7
import optparse
import os.path
import sys
import tempfile
import urlparse

import subprocess271 as subprocess
# Usage: pass the URL to archive as the first positional argument.
# The script mirrors the page (plus its requisites) into a fresh temporary
# directory with wget, renders the saved copy to a temporary PNG with
# moz-headless-screenshot, and prints two lines on stdout: the saved page's
# path as reported by wget, then the path of the PNG.
parser = optparse.OptionParser()
options, args = parser.parse_args()

# With no (or an empty) argument there is nothing to do and the script exits
# silently.
url = urlparse.urlparse(args[0]).geturl() if args and args[0] else ''
if url:
    useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2; rv:1.9.2) Gecko/20100101 Firefox/3.6'
    downloaddir = tempfile.mkdtemp()
    wget_command = [
        'wget',
        '--page-requisites',
        '--span-hosts',
        '--convert-links',
        '--wait=1',
        '--random-wait',
        '--force-directories',
        '--adjust-extension',
        '--no-verbose',
        '--execute', 'robots=off',
        # The command is run without a shell, so the value must not be wrapped
        # in quotes: they would become part of the User-Agent header.
        '--user-agent=%s' % useragent,
        '--restrict-file-names=ascii',
        '--directory-prefix=%s' % downloaddir,
        url,
    ]
    try:
        output = subprocess.check_output(wget_command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # wget exits non-zero when any request fails (for instance a missing
        # image), even if the page itself was saved.  Keep its log and let the
        # parsing below decide whether there is something to screenshot.
        output = e.output or ''

    # With --no-verbose, wget logs each saved file as a line such as:
    # 2011-03-30 07:23:52 URL:http://vi.to/ [8924/8924] -> "vi.to/index.html" [1]
    # The first such line is taken to be the requested page.  Other lines
    # (errors, summaries) are skipped instead of being indexed blindly.
    indexfile = None
    for line in output.splitlines():
        if ' -> "' in line:
            # Everything between the opening quote and the last quote, so file
            # names containing spaces survive intact.
            indexfile = line.split(' -> "', 1)[1].rsplit('"', 1)[0]
            break
    if indexfile is None:
        # Nothing was saved: report wget's log on stderr and exit non-zero.
        sys.exit('wget did not report a saved file for %s:\n%s' % (url, output))

    savedpage = os.path.join(downloaddir, indexfile)
    # mkstemp returns an open descriptor; only the reserved file name is needed.
    thumbnail_fd, thumbnail = tempfile.mkstemp('.png')
    os.close(thumbnail_fd)

    # moz-headless-screenshot segfaults unless you run it from within bin
    previousdir = os.getcwd()
    os.chdir('bin')
    try:
        subprocess.check_output(['./moz-headless-screenshot',
                                 savedpage,
                                 '1024',
                                 '768',
                                 thumbnail])
    except subprocess.CalledProcessError as e:
        # A return code of -11 means the child was killed by signal 11
        # (SIGSEGV); that outcome is deliberately tolerated.  The comparison
        # must use != / ==, not "is": identity of int objects is an
        # implementation detail and is not reliable for a value like -11.
        if e.returncode != -11:
            # Still best-effort (no crash), but no longer silent.
            sys.stderr.write('moz-headless-screenshot exited with status %d\n'
                             % e.returncode)
    finally:
        # Always restore the working directory, even if the tool could not be
        # started at all (e.g. OSError because the binary is missing).
        os.chdir(previousdir)

    print(indexfile)
    print(thumbnail)
    # need to shutil's delete the temp dir and PNG after we do something with them
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment