Skip to content

Instantly share code, notes, and snippets.

@luk1337
Last active July 31, 2016 14:39
Show Gist options
  • Save luk1337/f5478603bd9e9adcd8dc to your computer and use it in GitHub Desktop.
Save luk1337/f5478603bd9e9adcd8dc to your computer and use it in GitHub Desktop.
god this is awful: Bandcamp thief
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import urllib.request, re, os, sys, lxml.html, requests, mutagen, wgetter
from dateutil.parser import parse
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TPE1, TALB, TIT2, TCON, TYER, TRCK, error
if len(sys.argv) is not 3:
print("Usage: python script.py [bandcamp link] [genre]")
exit()
data = urllib.request.urlopen(sys.argv[1]).read()
regex = b'"mp3-128":"(//(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)"'
regex_cover = b'artFullsizeUrl: "(.*)",'
regex_artist = b'artist: "(.*)",'
regex_date = b'album_release_date: "(.*)",'
regex_album = b'album_title: "(.*)",'
track_num = 0
# html parser
html = lxml.html.fromstring(data.decode('utf-8'))
urls = re.findall(regex, data)
artist = re.findall(regex_artist, data)[0].decode("utf-8")
cover = re.findall(regex_cover, data)[0]
songs = html.xpath('//span[@itemprop="name"]//text()')
year = re.findall(regex_date, data)[0].decode("utf-8")
year = str(parse(year).year)
if len(songs) == 0:
songs = [ html.xpath('//*[@id="name-section"]/h2//text()')[0].strip() ]
if len(re.findall(regex_album, data)) == 1:
album = re.findall(regex_album, data)[0].decode("utf-8")
else:
album = songs[0]
def download(track_num, label, url, out, shouldTag=True):
if hasattr(url, 'decode'):
url = url.decode()
filename = wgetter.download(url, outdir='out')
os.rename(filename, out)
if shouldTag:
tag(out, label, track_num)
def tag(file, title, track_num):
audio = MP3(file, ID3=ID3)
# add ID3 tag if it doesn't exist
try:
audio.add_tags()
except error:
pass
audio.tags.add(TPE1(encoding=3, text=artist))
audio.tags.add(TALB(encoding=3, text=album))
audio.tags.add(TIT2(encoding=3, text=title))
audio.tags.add(TCON(encoding=3, text=sys.argv[2]))
audio.tags.add(TYER(encoding=3, text=year))
audio.tags.add(TRCK(encoding=3, text=str(track_num)))
audio.tags.add(APIC(encoding=3, mime="image/jpeg", type=3, desc="Cover", data=open("out/%s/%s/cover.jpg" % (artist, album), "rb").read()))
audio.save()
# Create out/[album name] folder
if not os.path.exists("out/%s/%s" % (artist, album)):
os.makedirs("out/%s/%s" % (artist, album))
# Download cover
download(None, None, cover, "out/%s/%s/cover.jpg" % (artist, album), False)
# Download songs
for url in urls:
track_num += 1
song = songs[track_num - 1]
song = str(song)
url = "http:%s" % url.decode("utf-8")
out = "out/%s/%s/%d. %s.mp3" % (artist, album, track_num, song.replace("/", ""))
download(track_num, song, url, out)
# Remove cover art once we're done
os.remove("out/%s/%s/cover.jpg" % (artist, album))
#!/usr/bin/env python
"""
Wgetter is another command line download utility written completely in python.
It is based on python-wget (https://bitbucket.org/techtonik/python-wget/src) with some improvements.
It works on python >= 2.6 or python >=3.0 Runs on Windows or Linux or Mac
API Usage:
>>> import wgetter
>>> filename = wgetter.download('https://sites.google.com/site/doctormike/pacman-1.2.tar.gz', outdir='/home/user')
100 % [====================================================>] 19.9KiB / 19.9KiB 100.0KiB/s eta 0:00:01
>>> filename
'/home/user/pacman-1.2.tar.gz'
"""
import sys
import os
import shutil
import tempfile
import hashlib
import datetime
from time import time
PY3K = sys.version_info >= (3, 0)
if PY3K:
import urllib.request as ulib
import urllib.parse as urlparse
else:
import urllib2 as ulib
import urlparse
SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
1024: ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
def approximate_size(size, a_kilobyte_is_1024_bytes=True):
'''
Humansize.py from Dive into Python3
Mark Pilgrim - http://www.diveintopython3.net/
Copyright (c) 2009, Mark Pilgrim, All rights reserved.
Convert a file size to human-readable form.
Keyword arguments:
size -- file size in bytes
a_kilobyte_is_1024_bytes -- if True (default), use multiples of 1024
if False, use multiples of 1000
Returns: string
'''
size = float(size)
if size < 0:
raise ValueError('number must be non-negative')
multiple = 1024 if a_kilobyte_is_1024_bytes else 1000
for suffix in SUFFIXES[multiple]:
size /= multiple
if size < multiple:
return '{0:.1f}{1}'.format(size, suffix)
raise ValueError('number too large')
def get_console_width():
"""Return width of available window area. Autodetection works for
Windows and POSIX platforms. Returns 80 for others
Code from http://bitbucket.org/techtonik/python-pager
"""
if os.name == 'nt':
STD_INPUT_HANDLE = -10
STD_OUTPUT_HANDLE = -11
STD_ERROR_HANDLE = -12
# get console handle
from ctypes import windll, Structure, byref
try:
from ctypes.wintypes import SHORT, WORD, DWORD
except ImportError:
# workaround for missing types in Python 2.5
from ctypes import (
c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)
console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
# CONSOLE_SCREEN_BUFFER_INFO Structure
class COORD(Structure):
_fields_ = [("X", SHORT), ("Y", SHORT)]
class SMALL_RECT(Structure):
_fields_ = [("Left", SHORT), ("Top", SHORT),
("Right", SHORT), ("Bottom", SHORT)]
class CONSOLE_SCREEN_BUFFER_INFO(Structure):
_fields_ = [("dwSize", COORD),
("dwCursorPosition", COORD),
("wAttributes", WORD),
("srWindow", SMALL_RECT),
("dwMaximumWindowSize", DWORD)]
sbi = CONSOLE_SCREEN_BUFFER_INFO()
ret = windll.kernel32.GetConsoleScreenBufferInfo(
console_handle, byref(sbi))
if ret == 0:
return 0
return sbi.srWindow.Right + 1
elif os.name == 'posix':
from fcntl import ioctl
from termios import TIOCGWINSZ
from array import array
winsize = array("H", [0] * 4)
try:
ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize)
except IOError:
pass
return (winsize[1], winsize[0])[0]
return 80
CONSOLE_WIDTH = get_console_width()
# Need 2 spaces more to avoid linefeed on Windows
AVAIL_WIDTH = CONSOLE_WIDTH - 59 if os.name == 'nt' else CONSOLE_WIDTH - 57
def filename_from_url(url):
""":return: detected filename or None"""
fname = os.path.basename(urlparse.urlparse(url).path)
if len(fname.strip(" \n\t.")) == 0:
return None
return fname
def filename_from_headers(headers):
"""Detect filename from Content-Disposition headers if present.
http://greenbytes.de/tech/tc2231/
:param: headers as dict, list or string
:return: filename from content-disposition header or None
"""
if type(headers) == str:
headers = headers.splitlines()
if type(headers) == list:
headers = dict([x.split(':', 1) for x in headers])
cdisp = headers.get("Content-Disposition")
if not cdisp:
return None
cdtype = cdisp.split(';')
if len(cdtype) == 1:
return None
if cdtype[0].strip().lower() not in ('inline', 'attachment'):
return None
# several filename params is illegal, but just in case
fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
if len(fnames) > 1:
return None
name = fnames[0].split('=')[1].strip(' \t"')
name = os.path.basename(name)
if not name:
return None
return name
def filename_fix_existing(filename, dirname):
"""Expands name portion of filename with numeric ' (x)' suffix to
return filename that doesn't exist already.
"""
name, ext = filename.rsplit('.', 1)
names = [x for x in os.listdir(dirname) if x.startswith(name)]
names = [x.rsplit('.', 1)[0] for x in names]
suffixes = [x.replace(name, '') for x in names]
# filter suffixes that match ' (x)' pattern
suffixes = [x[2:-1] for x in suffixes
if x.startswith(' (') and x.endswith(')')]
indexes = [int(x) for x in suffixes
if set(x) <= set('0123456789')]
idx = 1
if indexes:
idx += sorted(indexes)[-1]
return '{0}({1}).{2}'.format(name, idx, ext)
def report_bar(bytes_so_far, total_size, speed, eta):
'''
This callback for the download function is used to print the download bar
'''
percent = int(bytes_so_far * 100 / total_size)
current = approximate_size(bytes_so_far).center(9)
total = approximate_size(total_size).center(9)
shaded = int(float(bytes_so_far) / total_size * AVAIL_WIDTH)
sys.stdout.write(
" {0}% [{1}{2}{3}] {4}/{5} {6} eta{7}".format(str(percent).center(4),
'=' * (shaded - 1),
'>',
' ' * (AVAIL_WIDTH - shaded),
current,
total,
(approximate_size(speed) + '/s').center(11),
eta.center(10)))
sys.stdout.write("\r")
sys.stdout.flush()
def report_unknown(bytes_so_far, total_size, speed, eta):
'''
This callback for the download function is used
when the total size is unknown
'''
sys.stdout.write(
"Downloading: {0} / Unknown - {1}/s ".format(approximate_size(bytes_so_far),
approximate_size(speed)))
sys.stdout.write("\r")
sys.stdout.flush()
def report_onlysize(bytes_so_far, total_size, speed, eta):
'''
This callback for the download function is used when console width
is not enough to print the bar.
It prints only the sizes
'''
percent = int(bytes_so_far * 100 / total_size)
current = approximate_size(bytes_so_far).center(10)
total = approximate_size(total_size).center(10)
sys.stdout.write('D: {0}% -{1}/{2}'.format(percent, current, total) + "eta {0}".format(eta))
sys.stdout.write("\r")
sys.stdout.flush()
def md5sum(filename, blocksize=8192):
'''
Returns the MD5 checksum of a file
'''
with open(filename, 'rb') as fh:
m = hashlib.md5()
while True:
data = fh.read(blocksize)
if not data:
break
m.update(data)
return m.hexdigest()
def download(link, outdir='.', chunk_size=4096):
'''
This is the Main function, which downloads a given link
and saves on outdir (default = current directory)
'''
url = None
fh = None
eta = 'unknown '
bytes_so_far = 0
filename = filename_from_url(link) or "."
# get filename for temp file in current directory
(fd_tmp, tmpfile) = tempfile.mkstemp(
".tmp", prefix=filename + ".", dir=outdir)
os.close(fd_tmp)
os.unlink(tmpfile)
try:
url = ulib.urlopen(link)
fh = open(tmpfile, mode='wb')
headers = url.info()
try:
total_size = int(headers['Content-Length'])
except (ValueError, KeyError, TypeError):
total_size = 'unknown'
try:
md5_header = headers['Content-MD5']
except (ValueError, KeyError, TypeError):
md5_header = None
# Define which callback we're gonna use
if total_size != 'unknown':
if CONSOLE_WIDTH > 57:
reporthook = report_bar
else:
reporthook = report_onlysize
else:
reporthook = report_unknown
# Below are the registers to calculate network transfer rate
time_register = time()
speed = 0.0
speed_list = []
bytes_register = 0.0
eta = 'unknown '
# Loop that reads in chunks, calculates speed and does the callback to
# print the progress
while True:
chunk = url.read(chunk_size)
# Update Download Speed every 1 second
if time() - time_register > 0.5:
speed = (bytes_so_far - bytes_register) / \
(time() - time_register)
speed_list.append(speed)
# Set register properly for future use
time_register = time()
bytes_register = bytes_so_far
# Estimative of remaining download time
if total_size != 'unknown' and len(speed_list) == 3:
speed_mean = sum(speed_list) / 3
eta_sec = int((total_size - bytes_so_far) / speed_mean)
eta = str(datetime.timedelta(seconds=eta_sec))
speed_list = []
bytes_so_far += len(chunk)
if not chunk:
sys.stdout.write('\n')
break
fh.write(chunk)
reporthook(bytes_so_far, total_size, speed, eta)
except KeyboardInterrupt:
print('\n\nCtrl + C: Download aborted by user')
print('Partial downloaded file:\n{0}'.format(os.path.abspath(tmpfile)))
sys.exit(1)
finally:
if url:
url.close()
if fh:
fh.close()
filenamealt = filename_from_headers(headers)
if filenamealt:
filename = filenamealt
# add numeric '(x)' suffix if filename already exists
'''if os.path.exists(os.path.join(outdir, filename)):
filename = filename_fix_existing(filename, outdir)
filename = os.path.join(outdir, filename)'''
shutil.move(tmpfile, filename)
# Check if sizes matches
if total_size != 'unknown' and total_size != bytes_so_far:
print(
'\n\nWARNING!! Downloaded file size mismatches... Probably corrupted...')
# Check md5 if it was in html header
if md5_header:
print('\nValidating MD5 checksum...')
if md5_header == md5sum(filename):
print('MD5 checksum passed!')
else:
print('MD5 checksum do NOT passed!!!')
return filename
if __name__ == '__main__':
if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
print('Usage: {0} <URL>'.format(sys.argv[0]))
args = [str(elem) for elem in sys.argv[1:]]
for link in args:
print('Downloading ' + link)
filename = download(link)
print('\nSaved under {0}'.format(filename))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment