Created
May 14, 2012 09:36
-
-
Save aperson/2692960 to your computer and use it in GitHub Desktop.
Rips music from bandcamp that's on Dewtroid's source list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import re | |
import json | |
import os | |
import sys | |
import time | |
import signal | |
import urllib.request | |
# Shared URL opener; every HTTP request in this script (the spreadsheet,
# album pages and MP3 files) goes through opener.open().
opener = urllib.request.build_opener()
def sigint_handler(signal, frame):
    '''Signal callback for ^C: announce the interrupt and exit with status 0.

    Both arguments are supplied by the signal machinery and are unused.
    '''
    farewell = 'Recieved SIGINT! Exiting...'
    print(farewell)
    raise SystemExit(0)
def unescape(s):
    '''Replace the handful of HTML entities seen in track titles.

    Handles ``&lt;``, ``&gt;``, ``&quot;``, the zero-width-space entity
    ``&#8203;`` (turned into a normal space) and ``&amp;``.

    s -- text taken from the page markup.
    Returns the text with those entities replaced.
    '''
    replacements = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#8203;", " "),  # zero-width space entity
        ("\u200b", " "),   # the same character when it arrives already decoded
        # "&amp;" has to be last: otherwise "&amp;lt;" would first become
        # "&lt;" and then be unescaped a second time into "<".
        ("&amp;", "&"),
    )
    for entity, char in replacements:
        s = s.replace(entity, char)
    return s
def get_list():
    '''Return the music list as a dict mapping category -> list of URLs.

    The list is cached in dew_music/music_list.json.  When the cache is
    missing or unreadable, the published spreadsheet is fetched as CSV,
    parsed, written to the cache and returned.

    Category names are lower-cased and stripped of non-word characters.
    Each URL appears at most once per category.
    '''
    import csv  # local import: only this function parses CSV

    cache = os.path.join('dew_music', 'music_list.json')
    os.makedirs('dew_music', exist_ok=True)

    try:
        with open(cache, encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # No cache yet, or the cache is not valid JSON: rebuild it below.
        pass

    with opener.open('https://docs.google.com/spreadsheet/pub?'
                     'key=0AhSz7tnijafVdFJBM1ZRQ2hBU1BtTXBkZWRVak1CelE&output=csv') as w:
        csv_data = w.read().decode('utf-8')

    music = dict()
    for row in csv.reader(csv_data.splitlines()):
        # Blank lines (e.g. a trailing newline) and rows without both
        # columns carry no usable entry; skip them instead of crashing.
        if len(row) < 2:
            continue
        url = row[0].strip()
        if not url:
            continue
        cat = re.sub(r'\W', '', row[1]).lower()
        urls = music.setdefault(cat, [])
        if url not in urls:
            urls.append(url)

    # Write with the same encoding the cache is read with.
    with open(cache, 'w', encoding='utf-8') as f:
        f.write(json.dumps(music, indent=2))
    return music
def download_songs(url, category):
    '''Download the free tracks of the Bandcamp page at url.

    url      -- artist/album page to scrape.
    category -- category name; used as a directory level under dew_music.

    Tracks are saved as dew_music/<category>/<artist>/<album>/NN_<title>.mp3
    with non-word characters in the path components replaced by "_".
    A sauce.txt holding the source URL and the unsanitized track names is
    written next to them; if it already exists the album is skipped.
    Returns None.
    '''
    try:
        with opener.open(url) as w:
            page = w.read().decode('utf-8')
    except urllib.error.HTTPError:
        print("404! Artist/Album may not exist! Skipping.")
        time.sleep(3)
        return

    mp3s = re.findall(r'''"file":"(http:.*?)"''', page)
    titles = re.findall(r'''itemprop="url"><span itemprop="name">(.*?)</span>''', page)
    if not titles:
        # No album-style track list found; try the single-track page markup.
        titles = re.findall(r'''<h2 class="trackTitle" itemprop="name">\s*(.*?)\s*</h2>''', page)
    titles = [unescape(t) for t in titles]
    titles_clean = [re.sub(r'''\W''', '_', t) for t in titles]

    # The page title is expected to look like "<album> | <artist>".
    album_match = re.search(r'''<title>(.*?) \|''', page)
    artist_match = re.search(r'''\| (.*?)</title>''', page)
    if album_match is None or artist_match is None:
        # Without both names there is no directory to store the album in.
        print("Could not find album/artist on {}! Skipping.".format(url))
        return
    album = album_match.group(1)
    artist = artist_match.group(1)
    album_clean = re.sub(r'''\W''', '_', album)
    artist_clean = re.sub(r'''\W''', '_', artist)

    base_path = os.path.join('dew_music', category, artist_clean, album_clean)
    sauce_path = os.path.join(base_path, 'sauce.txt')
    if os.path.isfile(sauce_path):
        print("{} by {} already downloaded! Skipping.".format(album, artist))
        return

    print("Retrieving {} by {}.".format(album, artist))
    os.makedirs(base_path, exist_ok=True)

    for number, (title, title_clean) in enumerate(zip(titles, titles_clean), start=1):
        mp3_path = os.path.join(base_path, "{:02d}_{}.mp3".format(number, title_clean))
        print("Downloading {:02d} {}".format(number, title))
        if not mp3s:
            print("Music not free! Skipping.")
            break
        if number > len(mp3s):
            # Fewer stream URLs than titles: every remaining track is
            # unavailable, so record the warning once and stop.
            print("Not all of this album is free! Making note in folder.")
            with open(os.path.join(base_path, 'WARNING.txt'), 'w', encoding='utf-8') as f:
                f.write("Warning, the track titles may be inaccurate for this album.\n"
                        "One or more songs may have been non-free!")
            break
        try:
            # The URLs come out of embedded JSON with escaped slashes.
            with opener.open(mp3s[number - 1].replace('\\', '')) as w:
                with open(mp3_path, 'wb') as f:
                    f.write(w.read())
        except urllib.error.HTTPError:
            print("404! Failed to download track! Skipping!")
            time.sleep(3)
            break

    # NOTE(review): sauce.txt is written even when a track failed above, so
    # a partially downloaded album is treated as complete on the next run.
    with open(sauce_path, 'w', encoding='utf-8') as f:
        f.write(url + '\n' + '\n'.join("{:02d} {}".format(n, t)
                                       for n, t in enumerate(titles, start=1)))
def main():
    '''Command-line entry point.

    --list              print the available categories.
    --get CAT [CAT...]  download every URL in the named categories; the
                        word 'all' selects every category.
    Anything else prints the usage text.
    '''
    argv = sys.argv[1:]
    command = argv[0] if argv else None
    requested = argv[1:]

    if command == "--list":
        music = get_list()
        print("The list of categories to choose from are:")
        print(", ".join(music))
        print("You can specify multiple categories or 'all'.")
        return

    if command == "--get" and requested:
        music = get_list()
        # 'all' anywhere among the arguments overrides the individual names.
        categories = list(music) if 'all' in requested else requested
        for category in categories:
            if category not in music:
                print("{} is not a recognized category! Skipping.".format(category))
                continue
            for link in music[category]:
                download_songs(link, category)
        return

    usage = ("Use --list to see the categories available and --get to download the music from"
             " said categories.\nWhen specifying categories, you can list multiple or 'all'.\n"
             "Sorry there's no tagging, I hope your media player can use the directory "
             "structure fot figure the tags out. A sauce.txt file is included along with every "
             "'album' which links to the source and includes the non-sanitized track names.\n"
             "This script supports very limited resuming in case of a failed download.")
    print(usage)
if __name__ == "__main__":
    # Install the ^C handler before any work starts, then run the CLI.
    signal.signal(signal.SIGINT, sigint_handler)
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment