Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@hydrogen18
Created December 30, 2013 18:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hydrogen18/8185934 to your computer and use it in GitHub Desktop.
Save hydrogen18/8185934 to your computer and use it in GitHub Desktop.
Python script to download 30c3 audio & video files
from HTMLParser import HTMLParser
import urllib2
import subprocess
import re
import os
# Local directory that downloaded files are written into.
SAVE_DIR = os.path.expanduser("~/30c3")
# Matches listing hrefs of the form "30c3-<four digits>-<anything>".
# Written as a raw string so that "\d" reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
FILE_RE = re.compile(r"30c3-\d{4}-.*")
class DirListingParser(HTMLParser):
    """Collect directory and file links from an HTML directory listing.

    An ``<img>`` tag whose ``alt`` text is ``[DIR]`` marks the next ``<a>``
    tag as a directory link; that anchor's ``href`` is appended to ``dirs``.
    Any other ``<a>`` tag whose ``href`` matches ``FILE_RE`` is appended to
    ``files``.
    """

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so the base
        # initializer is called directly rather than through super().
        HTMLParser.__init__(self)
        self.init()

    def init(self):
        """Reset the parser's collected state.

        Kept as a separate public method because existing callers invoke it
        explicitly after constructing the parser.
        """
        self.anchorIsDir = False
        self.anchorIsFile = False
        self.dirs = []
        self.files = []

    def handle_starttag(self, tag, attrs):
        """Record directory and file hrefs as start tags are encountered.

        Args:
            tag: Name of the tag being opened.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        attrs = dict(attrs)
        if tag == "img":
            if attrs.get('alt') == "[DIR]":
                # The anchor that follows this icon points at a directory.
                self.anchorIsDir = True
        elif tag == "a":
            href = attrs.get('href')
            if self.anchorIsDir:
                self.anchorIsDir = False
                # An anchor without an href cannot be followed; skip it
                # instead of storing None in the directory list.
                if href is not None:
                    print(href)
                    self.dirs.append(href)
            elif href is not None and FILE_RE.match(href) is not None:
                print(href)
                self.files.append(href)
# URL of the top-level directory listing that the recursive download starts from.
ROOT_URL = "http://ftp.ccc.de/congress/2013/"
def download_one(filename, fileurl):
    """Download a single file into ``SAVE_DIR`` by running ``wget``.

    Args:
        filename: Name of the output file, joined onto ``SAVE_DIR``.
        fileurl: URL passed to ``wget`` as the download source.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero
            status.
    """
    print(fileurl)
    target = os.path.join(SAVE_DIR, filename)
    # -t 3: limit the number of tries; -c: continue a partially downloaded file.
    cmd = ['wget', '-v', '-t', '3', '-c', '-O', target, fileurl]
    print(' '.join(cmd))
    # check_call raises on failure, so its return value carries no information.
    subprocess.check_call(cmd)
def download_all(dirurl):
    """Recursively download every matching file at and below ``dirurl``.

    Fetches the listing at ``dirurl``, parses it with ``DirListingParser``,
    recurses into each directory link found, and then passes each file link
    to ``download_one``.

    Args:
        dirurl: URL of a directory listing. Link hrefs are appended to it
            by plain string concatenation.
    """
    print(dirurl)
    response = urllib2.urlopen(dirurl)
    try:
        body = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    parser = DirListingParser()
    parser.init()
    parser.feed(body)

    for dirname in parser.dirs:
        download_all(dirurl + dirname)
    for filename in parser.files:
        download_one(filename, dirurl + filename)
# Create the download directory, tolerating one left over from an earlier
# run so the script can be re-run to resume interrupted downloads.
try:
    os.makedirs(SAVE_DIR)
except OSError:
    # Re-raise unless the failure was simply that the directory exists.
    if not os.path.isdir(SAVE_DIR):
        raise
download_all(ROOT_URL)
@graingert
Copy link

This is somewhat pointless, as you might as well use http://ftp.ccc.de/INDEX and a glob to wget all the files you need.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment