Last active
August 29, 2015 14:05
-
-
Save kylemcdonald/5988b56dbf085db9f5ff to your computer and use it in GitHub Desktop.
Scrape a list of Spotify track IDs for JSON and preview URLs. Can handle very large lists of tracks by using subdirectories based on the track ID. Picks up where it left off. Ignores errors during downloading.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import json, urllib | |
import os.path | |
import subprocess | |
import argparse | |
from subprocess import call | |
# Command-line interface: one positional argument, the path of the text file
# that lists the track IDs (read line by line further down).
parser = argparse.ArgumentParser(
    description='Scrape a list of Spotify track IDs for JSON and preview URLs.',
)
parser.add_argument('filename')
args = parser.parse_args()
filename = args.filename
def download(remote, local):
    """Fetch the URL `remote` into the file `local` by shelling out to curl.

    curl is passed `--create-dirs` (so missing parent directories of `local`
    are created) and `--progress-bar`. The exit status of curl is not
    inspected, so a failed transfer does not raise here.
    """
    curl_command = [
        'curl', remote,
        '--output', local,
        '--create-dirs',
        '--progress-bar',
    ]
    call(curl_command)
def safe(char):
    """Return `char` prefixed with '+' when it is uppercase, otherwise unchanged.

    The result is used as a directory name in the output path. Presumably the
    '+' marker keeps e.g. 'a' and 'A' apart on case-insensitive filesystems
    (TODO confirm).
    """
    return '+' + char if char.isupper() else char
def linecount(filename):
    """Return the number of lines in the text file at `filename`.

    The file is opened in a `with` block so the handle is closed as soon as
    counting finishes, rather than being left for the garbage collector.
    """
    with open(filename) as handle:
        return sum(1 for _ in handle)
# Walk the track list once, fetching whatever is still missing for each ID.
# Outputs that already exist on disk are skipped, so an interrupted run can
# simply be restarted.
i = 0  # kept defined even when the input file is empty
n = linecount(filename)  # total lines, used only for the progress percentage
with open(filename) as f:
    for i, line in enumerate(f):
        track = line.strip()
        if not track:
            # Blank lines carry no ID; indexing into them would raise.
            continue
        try:
            jsonUrl = 'https://api.spotify.com/v1/tracks/{}'.format(track)
            # Outputs are sharded into three nested directories named after
            # the first three characters of the ID. Built inside the `try`
            # so an ID shorter than three characters is reported and skipped
            # like any other per-track error instead of aborting the run.
            path = '{}/{}/{}/{}'.format(
                safe(track[0]), safe(track[1]), safe(track[2]), track)
            outMp3 = 'mp3/{}.mp3'.format(path)
            outJson = 'json/{}.json'.format(path)
            needJson = not os.path.isfile(outJson)
            needMp3 = not os.path.isfile(outMp3)
            if needJson or needMp3:
                print('loading {} ({:.2f}%)'.format(track, (100. * i) / n))
                if needJson:
                    download(jsonUrl, outJson)
                else:
                    print('[json downloaded]')
                if needMp3:
                    # The preview URL comes from the track's JSON metadata;
                    # close the file promptly rather than leaking the handle.
                    with open(outJson) as jsonFile:
                        data = json.load(jsonFile)
                    previewUrl = data.get('preview_url')
                    if previewUrl:
                        download(previewUrl, outMp3)
                    else:
                        print('[mp3 not available]')
                else:
                    print('[mp3 downloaded]')
        except KeyboardInterrupt:
            # Ctrl-C stops the whole run; finished files remain on disk.
            break
        except Exception as e:
            # Best effort: report the problem and move on to the next track.
            print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment