Created
October 11, 2013 18:28
-
-
Save stelonix/6939677 to your computer and use it in GitHub Desktop.
Dubstep.net site ripper. Works by scraping, which might be illegal in some countries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# All the configuration is done in this file.
# It attempts to download tracks based on id, which I assume is automatically incremented.
# Very naive, but it works.
# Some of the code is based on code found on stackoverflow.com; I'll make it obvious in another revision.
# TODO: Make the progress display better
import os.path
import traceback
import urllib2
from sys import stdout

import lxml.html
from lxml.cssselect import CSSSelector
# Root of the site; also the default Referer sent by retrieve_with_referer.
base_url = "http://www.dubstep.net/"
# Path segment appended to base_url to reach a track page.
path_suburl = "track/"
# printf-style pattern that turns a numeric track id into the URL suffix.
format_string = "%d"
# CSS selector for the link on a track page that is checked for the text
# "Download".
css_selector = ".track_type > a"
# Prefix for saved files. It is joined to the file name by plain string
# concatenation, so the trailing slash is required.
data_dir = "/mnt/home/dubstep.net/"
def retrieve_with_referer(url, referer=base_url):
    """Open *url* with a ``Referer`` header and return the response.

    Args:
        url: Address to request.
        referer: Value sent in the ``Referer`` header. Defaults to the
            module-level ``base_url``.

    Returns:
        The object returned by ``urllib2.urlopen``, or ``None`` when the
        request fails. Failures are printed, never raised.
    """
    print(url)
    try:
        req = urllib2.Request(url)
        # Send the referer the caller asked for rather than a fixed value.
        req.add_header('Referer', referer)
        return urllib2.urlopen(req)
    except IOError as e:
        print(e)
        return None
    except Exception:
        # Anything unexpected: show the traceback but keep the script going.
        traceback.print_exc()
        return None
def sizeof_fmt(num):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024, then
    rendered with one decimal place followed by the unit, e.g. ``1536`` ->
    ``"1.5KB"``. A value still too large after the ``GB`` step is reported
    in ``TB``.

    Args:
        num: Size in bytes (int or float).

    Returns:
        The formatted string, with no space between number and unit.
    """
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if -1024.0 < num < 1024.0:
            return "%3.1f%s" % (num, unit)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')
def get_filename(header):
    """Extract the attachment file name from a raw header line.

    Args:
        header: One header line as text, e.g.
            ``'Content-Disposition: attachment;filename="a.mp3"\\r\\n'``.

    Returns:
        The bare file name, or ``None`` when the line does not contain the
        expected ``Content-Disposition`` prefix or the name is empty.
    """
    fn_header = "Content-Disposition: attachment;filename="
    start = header.find(fn_header)
    if start == -1:
        return None
    # Slice from where the prefix was actually found, then drop the line
    # ending and the optional surrounding quotes.
    value = header[start + len(fn_header):].strip().strip('"')
    # The name comes from the remote server and is later appended to a local
    # directory, so discard any path components it carries.
    name = os.path.basename(value)
    return name or None
def fail(track_id):
    """Print a message saying that the track with *track_id* does not exist."""
    print("Track %d does not exist" % track_id)
def download_file(r, targ_file):
    """Stream the response *r* into *targ_file*, printing progress.

    Args:
        r: Open response object providing ``url``, ``info()`` and
            ``read(size)``.
        targ_file: Path of the file to create or overwrite.

    Returns:
        *targ_file*, once the whole body has been written.
    """
    print("Downloading: %s to %s" % (r.url, targ_file))

    meta_stuff = r.info()
    # The header container exposes either getheaders() or get_all(); use
    # whichever one this object has.
    meta_func = meta_stuff.getheaders if hasattr(meta_stuff, 'getheaders') else meta_stuff.get_all
    meta_length = meta_func("Content-Length")
    # Kept as a float so the percentage below is never an integer division.
    file_size = float(meta_length[0]) if meta_length else None

    file_size_dl = 0
    block_sz = 8192
    prev_len = 0  # length of the progress line currently on screen

    with open(targ_file, 'wb') as f:
        while True:
            chunk = r.read(block_sz)
            if not chunk:
                break
            file_size_dl += len(chunk)
            f.write(chunk)

            if file_size:
                progress_string = "Progress: %s/%s (%.2f%%)" % (
                    sizeof_fmt(file_size_dl),
                    sizeof_fmt(file_size),
                    (file_size_dl / file_size) * 100,
                )
            else:
                progress_string = "Progress: %s/unknown" % sizeof_fmt(file_size_dl)

            # "\r" rewrites the same terminal line; pad with spaces so a
            # shorter line fully covers what the previous one left behind.
            padding = " " * max(0, prev_len - len(progress_string))
            stdout.write("\r%s%s" % (progress_string, padding))
            stdout.flush()
            prev_len = len(progress_string)

    print("\nDownload complete.")
    return targ_file
# Build the selector once; it is the same for every track page.
download_link_selector = CSSSelector(css_selector)

# Walk the track ids in order and download every track whose page offers a
# link with the text "Download".
for i in range(1, 5040):
    targ_url = base_url + path_suburl + (format_string % i)
    print("Checking %s" % targ_url)
    song_req = retrieve_with_referer(targ_url)
    if song_req is None:
        fail(i)
        continue
    print("Page downloaded")
    try:
        try:
            page = lxml.html.fromstring(song_req.read())
        finally:
            song_req.close()
        download_div = download_link_selector(page)
        # No matching link, or a link that is not a download: nothing to fetch.
        if not download_div or download_div[0].text != "Download":
            fail(i)
            continue
        targ_download = download_div[0].attrib["href"]
        # The track page is sent as the referer for its own download link.
        download_request = retrieve_with_referer(targ_download, targ_url)
        if download_request is None:
            print("Could not retrieve the download for track %d" % i)
            continue
        try:
            # Fall back to "<id>.mp3" unless a header supplies a file name.
            targ_filename = "%d.mp3" % i
            for header in download_request.info().headers:
                header_filename = get_filename(header)
                if header_filename is not None:
                    targ_filename = header_filename
                    break
            download_file(download_request, data_dir + targ_filename)
        finally:
            download_request.close()
    except Exception:
        # Best effort: report the problem and move on to the next track id.
        traceback.print_exc()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment