Skip to content

Instantly share code, notes, and snippets.

@djsutherland
Created July 2, 2010 20:05
Show Gist options
  • Save djsutherland/461843 to your computer and use it in GitHub Desktop.
Save djsutherland/461843 to your computer and use it in GitHub Desktop.
download mp3s from betterpropaganda.com in bulk
#!/usr/bin/env python
import lxml.html
import urllib
import Queue
import re
import sys
import threading
# The page to scrape is the only command-line argument; bail out with a
# usage line instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s PAGE" % sys.argv[0])

doc = lxml.html.parse(sys.argv[1])
# Anchors directly inside a <p> whose href mentions "downloadSong".
links = doc.xpath('//p/a[contains(@href, "downloadSong")]')

# threads to download 5 at a time
pool = Queue.Queue()  # work queue of (song id, filename) pairs
stdout_lock = threading.Lock()  # keeps progress lines from interleaving
# Download endpoint; %s is filled with the numeric song id.
url_base = 'http://betterpropaganda.com/mp3_download.ashx?id=%s'
class Downloader(threading.Thread):
    """Worker thread that downloads queued songs until the queue runs dry."""

    def run(self):
        """Fetch every (song id, filename) pair this thread gets from ``pool``.

        The thread exits once ``pool`` has stayed empty for 3 seconds.
        A download that fails is reported and skipped so that one bad
        song does not take the whole worker down.
        """
        while True:
            # Only the queue read is guarded by Queue.Empty; keeping the
            # download out of this try makes the two failure modes distinct.
            try:
                song_id, filename = pool.get(timeout=3)
            except Queue.Empty:
                break

            with stdout_lock:
                print("downloading %s" % filename)

            try:
                urllib.urlretrieve(url_base % song_id, filename)
            except IOError as err:
                # urlretrieve signals network and file errors via IOError.
                with stdout_lock:
                    print("FAILED %s: %s" % (filename, err))
threads = [Downloader() for _ in range(5)]

# tell the threads what to download
# href looks like: javascript:downloadSong(12345)
id_re = re.compile(r'javascript:downloadSong\((\d+)\)')
# Paragraph text looks like: [ play ] | [ download mp3 ] Artist - "Title"
name_re = re.compile(r'^\[ play \] \| \[ download mp3 \] (.*?)\s?- "(.*)"')


def esc(s):
    """Return *s* with parentheses dropped and spaces/slashes set to '_'."""
    return s.replace('(', '').replace(')', '') \
            .replace(' ', '_').replace('/', '_')


for el in links:
    href = el.get('href')
    id_match = id_re.match(href)
    if not id_match:
        # The XPath only checks that the href contains "downloadSong";
        # skip anything that does not carry a numeric id.
        print("SKIPPING unrecognised link '%s'" % href)
        continue
    song_id = id_match.group(1)

    p_text = el.getparent().text_content()
    match = name_re.match(p_text)
    if not match:
        print("DIDN'T MATCH '%s'" % p_text)
        # Put the song id in the fallback name so several unmatched songs
        # are not all written to the same file by different threads.
        name = "unknown-%s.mp3" % song_id
    else:
        name = '-'.join(map(esc, match.groups())) + '.mp3'
    pool.put((song_id, name))
# do the downloading: launch every worker, then wait for all to finish
for worker in threads:
    worker.start()

for worker in threads:
    worker.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment