Skip to content

Instantly share code, notes, and snippets.

@Leonidas-from-XIV
Created October 30, 2010 12:56
Show Gist options
  • Save Leonidas-from-XIV/655272 to your computer and use it in GitHub Desktop.
Save Leonidas-from-XIV/655272 to your computer and use it in GitHub Desktop.
zeropunctuation-dl
#!/usr/bin/env python3
# A downloader for Zero Punctuation episodes. Grabs them from the internet
# and saves them with the appropriate naming into the folder. The name is
# determined automatically from the web site.
# Licensed under GPLv3, fwiw.
import sys, urllib.request, re, json
# the browser that we are going to pretend we are
# (yay for increasing Firefox and Linux marketshare)
user_agent = """Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101027 Firefox/3.6.12"""
# where to find the path to the config file. pretty crude, admittedly.
# the dots are escaped so they only match literal dots in the host name
# and in the ".js" extension, not arbitrary characters
config_re = re.compile(r'value="config=(http://www\.themis-media\.com/videos/config/\d*-\w*\.js)')
# extracts the charset parameter from a Content-Type header value; stops at
# the next parameter separator and skips an optional opening quote
charset_re = re.compile(r'charset="?([^\s;"]+)')
def construct_request(url, host, referrer=None):
    """Build a request for *url* that imitates a real browser.

    Setting the User-Agent alone is not enough, so the remaining headers
    copy what Firefox 3.6.12 actually sends. *host* becomes the ``Host``
    header; a ``Referer`` header is attached only when *referrer* is given.
    """
    browser_headers = {
        'User-Agent': user_agent,
        'Host': host,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.7,de;q=0.3',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Connection': 'close',
    }
    # maybe they also want a referrer? In any case, we can send one
    if referrer is not None:
        browser_headers['Referer'] = referrer
    return urllib.request.Request(url, headers=browser_headers)
def get_config_path(source):
    """Return the URL of the config file referenced in the page *source*.

    Raises AttributeError when the page contains no config reference,
    because the search then yields None.
    """
    found = config_re.search(source)
    return found.group(1)
def download_hook(blocks_done, blocksize, total):
    """Display the download progress on a single, self-overwriting line.

    The parameters follow the ``reporthook`` callback convention of
    ``urllib.request.urlretrieve``:

    blocks_done -- number of blocks transferred so far
    blocksize   -- size of one block in bytes
    total       -- total size in bytes, or a non-positive value when the
                   size is unknown
    """
    received = blocks_done * blocksize
    if total > 0:
        # the last block is usually only partially filled, so the raw product
        # can overshoot the real size; never report more than the total
        received = min(received, total)
        shown_total = total
    else:
        # size unknown (urlretrieve passes -1): do not print a bogus total
        shown_total = '?'
    # "\r" returns to the start of the line so each update overwrites the last
    print("\r {0}/{1}".format(received, shown_total), end='')
def _safe_filename(name):
    """Turn an episode title into a name that is safe to use as a file name."""
    # the title comes from the remote server: path separators and characters
    # that are illegal in file names become underscores, so the download can
    # neither leave the current folder nor fail to be created
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip(' .')
    return cleaned or 'episode'


def main():
    """Download the episode whose page URL is given as the first argument.

    Loads the episode page, extracts the URL of the player config file,
    reads the video URL and the episode name from that config and saves the
    video as "<name>.mp4" in the current folder.

    Exits with a usage message when no URL was passed on the command line.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else 'zeropunctuation-dl'
        sys.exit("usage: {0} EPISODE_URL".format(prog))
    url = sys.argv[1]

    # load the "intro" page and extract the config file
    with urllib.request.urlopen(construct_request(url, 'www.escapistmagazine.com')) as page:
        bytestream = page.read()
        # let the header parser find the charset (case-insensitive, copes with
        # extra parameters); fall back to UTF-8 when the server sends none
        charset = page.headers.get_content_charset() or 'utf-8'
    content = bytestream.decode(charset)
    config_path = get_config_path(content)

    # grab the config file. this is not that easy, because the page blacklists
    # "incorrect" requests, so we get BadStatusLine exception
    # => take care that we pretend to be a browser good enough.
    with urllib.request.urlopen(construct_request(config_path, 'www.themis-media.com', referrer=url)) as page:
        # the JS file can be parsed as JSON when we change the quotes
        content = page.read().decode('utf-8').replace("'", '"')
    config = json.loads(content)

    # URL to the video
    video = config['playlist'][1]['url']
    # the name of the episode
    name = config['plugins']['viral']['share']['description']
    # destination filename
    filename = _safe_filename(name) + '.mp4'

    # grab it
    print("Downloading to {0}".format(filename))
    urllib.request.urlretrieve(video, filename, download_hook)
    # finish the progress line that download_hook keeps overwriting
    print()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment