Skip to content

Instantly share code, notes, and snippets.

@stelonix
Created October 11, 2013 18:28
Show Gist options
  • Save stelonix/6939677 to your computer and use it in GitHub Desktop.
Save stelonix/6939677 to your computer and use it in GitHub Desktop.
Dubstep.net site ripper. Works by scraping, which might be illegal in some countries.
# All the configuration is done in this file
# It attempts to download tracks based on their id, which I assume is automatically incremented
# Very naive but it works
# Some of the code is based on code found on stackoverflow.com; I'll make the attribution explicit in another revision
# TODO: Make the progress display better
import os
import urllib2, lxml.html, traceback
from lxml.cssselect import CSSSelector
from sys import stdout
# --- Configuration -------------------------------------------------------
# Root of the site; also the default Referer sent by retrieve_with_referer.
base_url = "http://www.dubstep.net/"
# Path component appended to base_url to reach a single track page.
path_suburl = "track/"
# printf-style format applied to the numeric track id to build the page URL.
format_string = "%d"
# CSS selector used to locate the candidate download link on a track page.
css_selector = ".track_type > a"
# Directory prefix the downloaded files are written under.
data_dir = "/mnt/home/dubstep.net/"
def retrieve_with_referer(url, referer=base_url):
    """Open ``url`` with a ``Referer`` header and return the response.

    Args:
        url: Address to request.
        referer: Value sent in the ``Referer`` header. Defaults to the
            site's base URL.

    Returns:
        The object returned by ``urllib2.urlopen`` on success, or ``None``
        when the request fails. Failures are reported on stdout/stderr and
        never raised to the caller.
    """
    print(url)
    try:
        req = urllib2.Request(url)
        # Send the referer the caller asked for; previously base_url was
        # always sent and the ``referer`` argument was silently ignored.
        req.add_header('Referer', referer)
        return urllib2.urlopen(req)
    except IOError as e:
        # urllib2's URLError/HTTPError derive from IOError.
        print(e)
        return None
    except Exception:
        traceback.print_exc()
        # Make the "failed" result explicit instead of falling off the end.
        return None
def sizeof_fmt(num):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then rendered with one decimal and the matching unit suffix.
    Anything still too large after the 'GB' step is reported in 'TB'.
    """
    units = ('bytes', 'KB', 'MB', 'GB')
    for unit in units:
        if abs(num) < 1024.0:
            return "%3.1f%s" % (num, unit)
        num = num / 1024.0
    return "%3.1f%s" % (num, 'TB')
def get_filename(header):
    """Extract the attachment filename from a raw response header line.

    Only lines containing the exact text
    ``Content-Disposition: attachment;filename=`` are recognised.

    Args:
        header: One raw header line, e.g.
            ``'Content-Disposition: attachment;filename="a.mp3"\\r\\n'``.

    Returns:
        The filename with surrounding whitespace and an optional pair of
        double quotes removed, or ``None`` when the line is not a matching
        header or carries an empty filename.
    """
    fn_header = "Content-Disposition: attachment;filename="
    start = header.find(fn_header)
    if start == -1:
        return None
    # Slice from where the marker was actually found rather than assuming
    # it sits at the very beginning of the line.
    value = header[start + len(fn_header):].strip()
    # Strip quotes only when they are really there; blindly dropping the
    # first and last character mangles unquoted filenames.
    if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
        value = value[1:-1]
    # An empty name is useless to the caller; report it as "not found".
    return value or None
def fail(track_id):
    """Report on stdout that the track with the given numeric id is missing."""
    message = "Track %d does not exist" % track_id
    print(message)
def download_file(r, targ_file):
    """Stream the response ``r`` to ``targ_file``, printing progress.

    Args:
        r: An open response object providing ``url``, ``info()`` and
            ``read(size)``.
        targ_file: Path of the file to (over)write with the response body.

    Returns:
        ``targ_file``, once the whole body has been written.

    The progress line is redrawn in place with a carriage return. When a
    new line is shorter than the previous one it is padded with spaces so
    no stale characters from the longer line remain on screen.
    """
    print("Downloading: %s to %s" % (r.url, targ_file))

    meta_stuff = r.info()
    # Older message objects expose getheaders(); newer ones expose get_all().
    meta_func = meta_stuff.getheaders if hasattr(meta_stuff, 'getheaders') else meta_stuff.get_all
    meta_length = meta_func("Content-Length")
    # Total size is optional: without Content-Length only the running
    # byte count can be shown.
    file_size = float(meta_length[0]) if meta_length else None

    file_size_dl = 0
    block_sz = 8192
    last_len = 0  # length of the previously printed progress line
    with open(targ_file, 'wb') as f:
        while True:
            chunk = r.read(block_sz)
            if not chunk:
                break
            file_size_dl += len(chunk)
            f.write(chunk)
            if file_size:
                progress_string = "Progress: %s/%s (%.2f%%)" % (
                    sizeof_fmt(file_size_dl),
                    sizeof_fmt(file_size),
                    (file_size_dl / file_size) * 100,
                )
            else:
                progress_string = "Progress: %s/unknown" % sizeof_fmt(file_size_dl)
            cur_len = len(progress_string)
            # Blank out whatever the previous, longer line left behind.
            padding = " " * max(0, last_len - cur_len)
            last_len = cur_len
            stdout.write("\r%s%s" % (progress_string, padding))
            stdout.flush()
    print("\nDownload complete.")
    return targ_file
# Main scraping loop: probe every track id in turn, find the download link
# on the track page and save the file it points to under data_dir.

# Compile the selector once; it is the same for every track page.
download_link_selector = CSSSelector(css_selector)

for i in range(1, 5040):
    targ_url = base_url + path_suburl + (format_string % i)
    print("Checking %s" % targ_url)
    song_req = retrieve_with_referer(targ_url)
    if song_req is None:
        fail(i)
        continue
    print("Page downloaded")
    try:
        # Parsing lives inside the try so that one malformed or empty page
        # cannot abort the whole run.
        page = lxml.html.fromstring(song_req.read())
        download_div = download_link_selector(page)

        # No matching element, or a link that is not labelled "Download":
        # treat both as a missing track instead of raising IndexError.
        if not download_div or download_div[0].text != "Download":
            fail(i)
            continue

        targ_download = download_div[0].attrib["href"]
        # The track page is passed as the referer for the file request.
        download_request = retrieve_with_referer(targ_download, targ_url)
        if download_request is None:
            # retrieve_with_referer has already reported the error itself.
            print("Could not fetch the download for track %d" % i)
            continue

        # Fall back to "<id>.mp3" unless the server suggests a usable name.
        targ_filename = "%d.mp3" % i
        for header in download_request.info().headers:
            header_filename = get_filename(header)
            if header_filename is None:
                continue
            # The name comes from the remote server: keep only its final
            # path component so it cannot escape data_dir.
            safe_name = os.path.basename(header_filename)
            if safe_name not in ("", ".", ".."):
                targ_filename = safe_name
            break

        download_file(download_request, os.path.join(data_dir, targ_filename))
    except Exception:
        # Best effort: log the problem and carry on with the next track.
        traceback.print_exc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment