# stelonix/dubstep.net_rip.py

## dubstep.net_rip.py
# All the configuration is done in this file
# It attempts to download tracks by id, which I assume is automatically incremented
# Very naive, but it works
# Some of the code is based on code found on stackoverflow.com; I'll make the attribution obvious in another revision
# TODO: Make the progress display better

import urllib2, lxml.html, traceback
from lxml.cssselect import CSSSelector
from sys import stdout

base_url = "http://www.dubstep.net/"
path_suburl = "track/"
format_string = "%d"
css_selector = ".track_type > a"
data_dir = "/mnt/home/dubstep.net/"

def retrieve_with_referer(url,referer=base_url):
	"""Open *url* with a ``Referer`` header and return the response.

	url     -- address to request; it is echoed to stdout first.
	referer -- value sent as the ``Referer`` header (defaults to base_url).
	           A false value (None or "") sends no Referer header at all.

	Returns the object produced by ``urllib2.urlopen``, or None when the
	request fails.  Failures are printed, never raised, so callers only
	need to check for None.
	"""
	print(url)
	try:
		req = urllib2.Request(url)
		# Send the caller-supplied referer; previously base_url was always
		# sent and the parameter was silently ignored.
		if referer:
			req.add_header('Referer', referer)
		return urllib2.urlopen(req)
	except IOError as e:
		# urllib2.URLError / HTTPError derive from IOError: expected
		# network failures get a one-line report.
		print(e)
		return None
	except Exception:
		# Anything else is unexpected: show the full traceback, but still
		# hand back None so the caller can carry on.
		traceback.print_exc()
		return None
def sizeof_fmt(num):
	"""Format a byte count as a short human-readable string.

	The value is divided by 1024 until its magnitude drops below 1024 and
	is then rendered with one decimal place followed by the matching unit
	('bytes', 'KB', 'MB', 'GB').  Anything still too large after the 'GB'
	step is reported in 'TB'.
	"""
	units = ('bytes', 'KB', 'MB', 'GB')
	scaled = num
	position = 0
	while position < len(units):
		if -1024.0 < scaled < 1024.0:
			return "%3.1f%s" % (scaled, units[position])
		scaled = scaled / 1024.0
		position += 1
	# Ran out of smaller units: whatever is left is expressed in terabytes.
	return "%3.1f%s" % (scaled, 'TB')


def get_filename(header):
	"""Extract the attachment filename from a raw HTTP header line.

	header -- one raw header line, e.g.
	          'Content-Disposition: attachment;filename="song.mp3"\\r\\n'.

	Returns the bare filename, or None when the line does not contain the
	expected Content-Disposition prefix or the filename is empty.
	"""
	fn_header = "Content-Disposition: attachment;filename="
	start = header.find(fn_header)
	if start == -1:
		return None
	# Slice from where the prefix actually ends rather than assuming the
	# prefix sits at the very start of the line.
	value = header[start + len(fn_header):].strip()
	# Surrounding quotes are optional; only strip them when both are
	# present so an unquoted name keeps its first and last character.
	if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
		value = value[1:-1]
	# The name comes from the remote server and is appended to a local
	# directory by the caller, so drop any directory components.
	value = value.replace("\\", "/").rsplit("/", 1)[-1]
	# An empty name is useless to the caller; report it as "no filename".
	return value or None

def fail(track_id):
	"""Print a one-line notice that *track_id* could not be found."""
	message = "Track %d does not exist" % track_id
	print(message)

def download_file(r, targ_file):
	"""Stream the response *r* into *targ_file*, showing progress on stdout.

	r         -- an opened response; this function uses its ``url``
	             attribute and its ``info()`` and ``read(size)`` methods.
	targ_file -- path of the file to create (overwritten if it exists).

	Returns targ_file once the whole body has been written.
	"""
	print("Downloading: %s to %s" % (r.url, targ_file))

	# The header container exposes either getheaders() or get_all(),
	# depending on the library version; use whichever is present.
	meta_stuff = r.info()
	meta_func = meta_stuff.getheaders if hasattr(meta_stuff, 'getheaders') else meta_stuff.get_all
	meta_length = meta_func("Content-Length")
	file_size = None
	if meta_length:
		try:
			file_size = float(meta_length[0])
		except ValueError:
			# Unparseable Content-Length: fall back to "unknown" size
			# instead of aborting the download.
			file_size = None

	file_size_dl = 0
	block_sz = 8192
	# Width of the longest progress line written so far.  Every line is
	# padded to this width so a shorter line fully overwrites a longer one
	# after the carriage return.
	widest = 0
	with open(targ_file, 'wb') as f:
		while True:
			chunk = r.read(block_sz)
			if not chunk:
				break

			file_size_dl += len(chunk)
			f.write(chunk)

			# A missing or zero Content-Length means no percentage.
			if file_size:
				progress_string = "Progress: %s/%s (%.2f%%)" % (sizeof_fmt(file_size_dl), sizeof_fmt(file_size), (file_size_dl/file_size)*100)
			else:
				progress_string = "Progress: %s/unknown" % sizeof_fmt(file_size_dl)

			widest = max(widest, len(progress_string))
			stdout.write("\r%s" % progress_string.ljust(widest))
			stdout.flush()
	print("\nDownload complete.")

	return targ_file

# Walk the sequential track ids, scrape each track page for its download
# link and save the linked file under data_dir.
for i in range(1,5040):
	targ_url = base_url + path_suburl + (format_string % i)
	print("Checking %s" % targ_url)
	song_req = retrieve_with_referer(targ_url)
	if song_req is None:
		fail(i)
		continue
	print("Page downloaded")
	try:
		page = lxml.html.fromstring(song_req.read())
		download_div = CSSSelector(css_selector)(page)
		# No matching anchor, or an anchor that is not the "Download" link,
		# means there is nothing to fetch for this id.
		if not download_div or download_div[0].text != "Download":
			fail(i)
			continue
		targ_download = download_div[0].attrib["href"]
		# The track page is passed as the referer of the download request.
		download_request = retrieve_with_referer(targ_download, targ_url)
		if download_request is None:
			# The helper already printed the error; just move on.
			fail(i)
			continue
		# Fall back to "<id>.mp3" unless a header supplies a filename.
		targ_filename = "%d.mp3" % i
		for header in download_request.info().headers:
			header_filename = get_filename(header)
			if header_filename is not None:
				targ_filename = header_filename
				break
		download_file(download_request, data_dir + targ_filename)
	except Exception:
		# Best effort: report the problem and continue with the next id.
		traceback.print_exc()