Created
September 25, 2022 14:44
-
-
Save ychalier/c72301e6204e204b86456bd5d3629de5 to your computer and use it in GitHub Desktop.
Download tracks from a YouTube playlist and use an assisted Discogs search to tag them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download tracks from a YouTube playlist and use an assisted Discogs search to tag them. | |
Requires yt-dlp, ffmpeg and a browser automation driver for selenium. | |
""" | |
import io | |
import os | |
import re | |
import glob | |
import json | |
import argparse | |
import datetime | |
import subprocess | |
import urllib.parse | |
import bs4 | |
import selenium.webdriver | |
import mutagen.id3 | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from PIL import Image | |
# Playlist used when no --url option is given on the command line.
STEVEARI_PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLPYMYQXSh49BtzzotE0Bo4UrMw4TpDKda"
# Separator placed between the channel name and the video title in the names
# of downloaded files; build_search_query() splits file names on it.
DELIMITER = "£$@"
# Base URL of the Discogs search page; query parameters are appended to it.
DISCOGS_SEARCH_URL = "https://www.discogs.com/search"
def generate_output_folder(playlist_url):
    """Return the working folder name for a playlist.

    The name is today's date formatted as ``YYYY-MM-DD``, a dash, and the
    first eight characters of the ``list`` query parameter of
    ``playlist_url``.

    Raises:
        KeyError: if ``playlist_url`` has no ``list`` query parameter.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    query_string = urllib.parse.urlparse(playlist_url).query
    playlist_id = urllib.parse.parse_qs(query_string)["list"][0]
    return f"{today}-{playlist_id[:8]}"
def download_playlist(playlist_url, output_folder):
    """Download the audio of every playlist entry as MP3 with ``yt-dlp``.

    Files are written to ``<output_folder>/download`` and named after the
    uploading channel and the video title, joined by ``DELIMITER``. The call
    blocks until ``yt-dlp`` terminates; its exit status is not inspected.
    """
    output_template = os.path.join(
        output_folder,
        "download",
        "%(channel)s" + DELIMITER + "%(title)s.%(ext)s",
    )
    command = [
        "yt-dlp",
        "--extract-audio",
        "--add-metadata",
        "--xattrs",
        "--embed-thumbnail",
        "--audio-format",
        "mp3",
        "-o",
        output_template,
        playlist_url,
    ]
    subprocess.Popen(command).wait()
def squarify_thumbnails(output_folder):
    """Crop the embedded cover of each downloaded MP3 to a centred square.

    Every ``*.mp3`` file in ``<output_folder>/download`` is inspected. Files
    without an ``APIC`` frame described as ``"Album cover"`` are left
    untouched. For the others, the picture is cropped to its shorter side,
    re-encoded as JPEG and stored back as an ``APIC`` frame with an empty
    description (the frame that ``TrackTags.apply`` later looks for).
    """
    cover_key = 'APIC:"Album cover"'
    # Image modes that can be written as JPEG without conversion.
    jpeg_modes = ("L", "RGB", "CMYK")
    for path in glob.glob(os.path.join(output_folder, "download", "*.mp3")):
        audiofile = mutagen.id3.ID3(path)
        try:
            apic = audiofile[cover_key]
        except KeyError:
            continue
        image = Image.open(io.BytesIO(apic.data))
        width, height = image.size
        size = min(width, height)
        # Integer offsets keep the box exactly size x size; fractional
        # coordinates get rounded independently and can yield a non-square
        # crop when the width/height difference is odd.
        left = (width - size) // 2
        top = (height - size) // 2
        cropped = image.crop((left, top, left + size, top + size))
        if cropped.mode not in jpeg_modes:
            # e.g. RGBA or palette images cannot be saved as JPEG directly.
            cropped = cropped.convert("RGB")
        buffer = io.BytesIO()
        cropped.save(buffer, "jpeg")
        del audiofile[cover_key]
        audiofile.add(mutagen.id3.APIC(encoding=3, mime="image/jpeg", data=buffer.getvalue()))
        audiofile.save()
def get_image_mimetype(url):
    """Guess an image MIME type from the suffix of ``url``.

    Returns ``"image/jpeg"`` for ``.jpg``/``.jpeg`` and ``"image/png"`` for
    ``.png``. The comparison is case-sensitive and applies to the whole
    string.

    Raises:
        ValueError: if the URL ends with none of the known suffixes.
    """
    known_suffixes = (
        ((".jpg", ".jpeg"), "image/jpeg"),
        ((".png",), "image/png"),
    )
    for suffixes, mimetype in known_suffixes:
        if url.endswith(suffixes):
            return mimetype
    raise ValueError("Could not guess mime type from URL: %s" % url)
class TrackTags:
    """Tags describing one track, as scraped from a Discogs release page.

    Every attribute may be ``None``, in which case ``apply`` does not write
    the corresponding ID3 frame.
    """

    def __init__(self, title=None, artist=None, album=None, year=None, label=None):
        self.title = title
        self.artist = artist
        self.album = album
        self.year = year
        self.label = label

    def __repr__(self):
        return (
            f"{type(self).__name__}(title={self.title!r}, artist={self.artist!r}, "
            f"album={self.album!r}, year={self.year!r}, label={self.label!r})"
        )

    @classmethod
    def from_html(cls, html, track_position):
        """Build tags from the HTML of a release page.

        Album, year, artists and labels are read from the JSON found in the
        ``<script id="release_schema">`` element; the title is read from the
        track-list row whose ``data-track-position`` equals
        ``track_position``.

        Raises:
            ValueError: if the page has no row for ``track_position`` or the
                row has no title element.
        """
        soup = bs4.BeautifulSoup(html, features="html.parser")
        release_data = json.loads(soup.find("script", {"id": "release_schema"}).text)
        release_of = release_data["releaseOf"]
        album = release_of["name"]
        # Tolerate a missing publication date: apply() skips a None year.
        year = release_of.get("datePublished")
        artist = ", ".join(entry["name"] for entry in release_of["byArtist"])
        label = ", ".join(entry["name"] for entry in release_data["recordLabel"])

        track_row = soup.find("tr", {"data-track-position": track_position})
        if track_row is None:
            raise ValueError("No track at position %r on this page" % track_position)
        # The title span is identified by a class name containing "trackTitle".
        title_span = next(
            (
                span for span in track_row.find_all("span")
                if "trackTitle" in " ".join(span.get("class", []))
            ),
            None,
        )
        if title_span is None:
            raise ValueError("No title found for track %r" % track_position)
        # Collapse line breaks and runs of spaces inside the title.
        title = re.sub(r"[\n ]+", " ", title_span.text.strip())
        return cls(title, artist, album, year, label)

    def apply(self, path):
        """Replace the ID3 tags of the file at ``path`` with these tags.

        All existing frames are removed except the attached pictures, which
        are kept. A file without any picture is tagged without one.
        """
        audiofile = mutagen.id3.ID3(path)
        # Collect every picture frame, whatever its description, so a file
        # without cover art does not abort the tagging session.
        covers = audiofile.getall("APIC")
        audiofile.delete()
        for cover in covers:
            audiofile.add(cover)
        if self.title is not None:
            audiofile.add(mutagen.id3.TIT2(encoding=3, text=str(self.title)))
        if self.album is not None:
            audiofile.add(mutagen.id3.TALB(encoding=3, text=str(self.album)))
        if self.year is not None:
            audiofile.add(mutagen.id3.TDRC(encoding=3, text=str(self.year)))
        if self.artist is not None:
            audiofile.add(mutagen.id3.TPE1(encoding=3, text=str(self.artist)))
        if self.label is not None:
            audiofile.add(mutagen.id3.TPUB(encoding=3, text=str(self.label)))
        audiofile.save()
def build_search_query(path):
    """Build a search query for Discogs search from the name of the audio file.

    The file name (without extension) is expected to look like
    ``<channel><DELIMITER><title>``. The query is the title, prefixed with
    the channel name when the channel ends with ``" - Topic"`` (the suffix is
    removed). A name without the delimiter is used as the query unchanged,
    and only the first delimiter is treated as the separator.
    """
    basename = os.path.splitext(os.path.basename(path))[0]
    # partition() never raises, unlike unpacking the result of split().
    channel, separator, title = basename.partition(DELIMITER)
    if not separator:
        return basename
    topic_suffix = " - Topic"
    if channel.endswith(topic_suffix):
        return channel[:-len(topic_suffix)] + " " + title
    return title
def create_selenium_driver():
    """Start a Firefox WebDriver session and return it.

    The returned driver carries an extra ``first_time_on_discogs`` attribute,
    initially ``True``, which ``assisted_tag_search`` reads and resets after
    dealing with the cookie banner.
    """
    browser = selenium.webdriver.Firefox()
    browser.first_time_on_discogs = True
    return browser
def inject_css(driver, css):
    """Insert a single CSS rule into the page currently loaded in ``driver``.

    A new ``<style>`` element is appended to the document head and ``css`` is
    inserted as its first rule. The rule is handed to the script as an
    argument rather than spliced into the JavaScript source, so quotes or
    backslashes in ``css`` cannot break the script.
    """
    script = (
        "var el = document.createElement('style');"
        " document.head.appendChild(el);"
        " el.sheet.insertRule(arguments[0], 0);"
    )
    driver.execute_script(script, css)
def assisted_tag_search(driver, query):
    """Open a Discogs release search for ``query`` and let the user pick a track.

    The search page is loaded in ``driver``, the top ad and sticky footer are
    hidden, and on the first call the cookie banner is dismissed. The user is
    then prompted on the console for a track position; presumably they are
    expected to browse to the matching release page before answering, since
    the page content is read only after the prompt.

    Returns:
        A ``TrackTags`` built from the page currently displayed, or ``None``
        when the user leaves the prompt empty.
    """
    # Imported locally so the block does not rely on a new file-level import.
    from selenium.common.exceptions import TimeoutException

    url = DISCOGS_SEARCH_URL + "?" + urllib.parse.urlencode({
        "q": query,
        "type": "release"
    })
    driver.get(url)
    inject_css(driver, "#ad_top, div.sticky_footer { display: none; }")
    if driver.first_time_on_discogs:
        try:
            # Wait until the button can actually be clicked, not merely until
            # it is present in the DOM.
            WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-reject-all-handler"))
            ).click()
        except TimeoutException:
            # No cookie banner showed up; carry on rather than abort the run.
            pass
        driver.first_time_on_discogs = False
    print("\n" + query)
    track_position = input("Track position (leave empty to skip)> ").strip()
    if not track_position:
        return None
    html = driver.find_element(By.CSS_SELECTOR, "div#page").get_attribute("innerHTML")
    return TrackTags.from_html(html, track_position)
def _safe_filename(name):
    """Replace characters that are invalid in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|]', "_", name)


def search_for_tags(output_folder):
    """Interactively tag every downloaded MP3 and move it to ``tagged``.

    For each ``*.mp3`` file in ``<output_folder>/download`` an assisted
    Discogs search is run. When tags are returned they are written to the
    file, which is then moved to ``<output_folder>/tagged`` as
    ``<artist> - <title>.mp3``; skipped files stay where they are. The
    browser session is always shut down, even if an error interrupts the
    loop.
    """
    tagged_folder = os.path.join(output_folder, "tagged")
    os.makedirs(tagged_folder, exist_ok=True)
    driver = create_selenium_driver()
    try:
        for path in glob.glob(os.path.join(output_folder, "download", "*.mp3")):
            query = build_search_query(path)
            tags = assisted_tag_search(driver, query)
            if tags is None:
                continue
            tags.apply(path)
            # Artist or title may contain path separators (e.g. "AC/DC").
            filename = _safe_filename(f"{ tags.artist } - { tags.title }.mp3")
            os.rename(path, os.path.join(tagged_folder, filename))
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()
def main():
    """Command-line entry point.

    Options:
        -u / --url: playlist to process (defaults to STEVEARI_PLAYLIST_URL).
        -d / --skip-download: skip downloading and thumbnail cropping and go
            straight to the tagging step.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-u", "--url", type=str, default=STEVEARI_PLAYLIST_URL)
    cli.add_argument("-d", "--skip-download", action="store_true")
    options = cli.parse_args()

    folder = generate_output_folder(options.url)
    should_download = not options.skip_download
    if should_download:
        download_playlist(options.url, folder)
        squarify_thumbnails(folder)
    search_for_tags(folder)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment