Skip to content

Instantly share code, notes, and snippets.

@ychalier
Created September 25, 2022 14:44
Show Gist options
  • Save ychalier/c72301e6204e204b86456bd5d3629de5 to your computer and use it in GitHub Desktop.
Save ychalier/c72301e6204e204b86456bd5d3629de5 to your computer and use it in GitHub Desktop.
Download tracks from a YouTube playlist and use an assisted Discogs search to tag them
"""Download tracks from a YouTube playlist and use an assisted Discogs search to tag them.
Requires yt-dlp, ffmpeg and a browser automation driver for selenium.
"""
import io
import os
import re
import glob
import json
import argparse
import datetime
import subprocess
import urllib.parse
import bs4
import selenium.webdriver
import mutagen.id3
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
# Playlist processed when no --url argument is given on the command line.
STEVEARI_PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLPYMYQXSh49BtzzotE0Bo4UrMw4TpDKda"
# Separator placed between the channel name and the video title in the names
# of downloaded files; build_search_query splits on it to recover both parts.
DELIMITER = "£$@"
# Base URL of the Discogs search page; query parameters are appended to it.
DISCOGS_SEARCH_URL = "https://www.discogs.com/search"
def generate_output_folder(playlist_url):
    """Return the working folder name for a playlist.

    The name is today's date formatted as ``YYYY-MM-DD``, a dash, and the
    first eight characters of the ``list`` query parameter of *playlist_url*.
    Raises ``KeyError`` when the URL has no ``list`` parameter.
    """
    today = datetime.date.today().strftime("%Y-%m-%d")
    query_string = urllib.parse.urlparse(playlist_url).query
    playlist_id = urllib.parse.parse_qs(query_string)["list"][0]
    return f"{today}-{playlist_id[:8]}"
def download_playlist(playlist_url, output_folder):
    """Run yt-dlp to fetch every track of *playlist_url* as an MP3.

    Files are written to ``<output_folder>/download`` and named
    ``<channel><DELIMITER><title>.<ext>``. The call blocks until yt-dlp
    exits; its exit status is not inspected.
    """
    # yt-dlp output template: the %(...)s fields are expanded by yt-dlp.
    filename_template = "%(channel)s" + DELIMITER + "%(title)s.%(ext)s"
    output_template = os.path.join(output_folder, "download", filename_template)
    command = [
        "yt-dlp",
        "--extract-audio",
        "--add-metadata",
        "--xattrs",
        "--embed-thumbnail",
        "--audio-format",
        "mp3",
        "-o",
        output_template,
        playlist_url,
    ]
    subprocess.Popen(command).wait()
def squarify_thumbnails(output_folder):
    """Crop the embedded cover of every downloaded MP3 to a centered square.

    Each ``*.mp3`` file in ``<output_folder>/download`` is inspected for an
    ID3 picture frame stored under the key ``APIC:"Album cover"``. Files
    without that frame are left untouched. Otherwise the picture is cropped
    to its largest centered square, re-encoded as JPEG and stored back as an
    APIC frame with an empty description (key ``APIC:``).
    """
    cover_key = 'APIC:"Album cover"'
    for path in glob.glob(os.path.join(output_folder, "download", "*.mp3")):
        audiofile = mutagen.id3.ID3(path)
        try:
            apic = audiofile[cover_key]
        except KeyError:
            continue
        image_original = Image.open(io.BytesIO(apic.data))
        width, height = image_original.size
        size = min(width, height)
        # Integer offsets guarantee an exactly size x size box; halving with
        # "/" yields .5 values whose rounding can shave a pixel off one side.
        left = (width - size) // 2
        top = (height - size) // 2
        image_cropped = image_original.crop((left, top, left + size, top + size))
        # JPEG cannot store alpha or palette data, so normalise such images.
        if image_cropped.mode not in ("L", "RGB", "CMYK"):
            image_cropped = image_cropped.convert("RGB")
        buffer = io.BytesIO()
        image_cropped.save(buffer, "jpeg")
        del audiofile[cover_key]
        audiofile.add(mutagen.id3.APIC(encoding=3, mime="image/jpeg", data=buffer.getvalue()))
        audiofile.save()
def get_image_mimetype(url):
    """Guess the MIME type of an image from the extension in its URL.

    The extension is matched case-insensitively, and a query string or
    fragment after the file name is ignored.

    Returns ``"image/jpeg"`` for ``.jpg``/``.jpeg`` and ``"image/png"`` for
    ``.png``. Raises ``ValueError`` for any other extension.
    """
    # Check the URL path first (ignores "?query" and "#fragment"), then the
    # raw string so inputs accepted before this change are still accepted.
    candidates = (urllib.parse.urlparse(url).path.lower(), url.lower())
    if any(candidate.endswith((".jpg", ".jpeg")) for candidate in candidates):
        return "image/jpeg"
    if any(candidate.endswith(".png") for candidate in candidates):
        return "image/png"
    raise ValueError("Could not guess mime type from URL: %s" % url)
class TrackTags:
    """Tag values for a single track, as read from a Discogs release page."""

    def __init__(self, title=None, artist=None, album=None, year=None, label=None):
        """Store the tag values; a value of ``None`` means "do not write"."""
        self.title = title
        self.artist = artist
        self.album = album
        self.year = year
        self.label = label

    def __repr__(self):
        return (
            f"{type(self).__name__}(title={self.title!r}, artist={self.artist!r}, "
            f"album={self.album!r}, year={self.year!r}, label={self.label!r})"
        )

    @classmethod
    def from_html(cls, html, track_position):
        """Build tags from the HTML of a release page.

        Album, year, artists and labels come from the JSON document embedded
        in the ``<script id="release_schema">`` element. The title comes from
        the table row whose ``data-track-position`` attribute equals
        *track_position*.

        Raises ``ValueError`` when the schema script, the track row or the
        title element cannot be found, and ``KeyError`` when the schema lacks
        an expected field.
        """
        soup = bs4.BeautifulSoup(html, features="html.parser")
        schema_tag = soup.find("script", {"id": "release_schema"})
        if schema_tag is None:
            raise ValueError("No release_schema script found in the page")
        release_data = json.loads(schema_tag.text)
        release_of = release_data["releaseOf"]
        album = release_of["name"]
        year = release_of["datePublished"]
        artist = ", ".join(entry["name"] for entry in release_of["byArtist"])
        label = ", ".join(entry["name"] for entry in release_data["recordLabel"])
        track_row = soup.find("tr", {"data-track-position": track_position})
        if track_row is None:
            raise ValueError("No track at position %r in the page" % track_position)
        # The title span is recognised by a class name containing "trackTitle".
        title_span = next(
            (
                span
                for span in track_row.find_all("span")
                if "trackTitle" in " ".join(span.get("class", []))
            ),
            None,
        )
        if title_span is None:
            raise ValueError("No title found for track %r" % track_position)
        # Collapse runs of newlines and spaces left over from the markup.
        title = re.sub("[\n ]+", " ", title_span.text.strip())
        return cls(title, artist, album, year, label)

    def apply(self, path):
        """Replace the ID3 tag of the file at *path* with these values.

        The existing tag is deleted and rewritten. Embedded pictures (APIC
        frames) are carried over; a file without any picture is handled as
        well. Attributes set to ``None`` are not written.
        """
        audiofile = mutagen.id3.ID3(path)
        # Collect pictures before wiping the tag so they can be restored.
        pictures = audiofile.getall("APIC")
        audiofile.delete()
        for picture in pictures:
            audiofile.add(picture)
        text_frames = (
            (mutagen.id3.TIT2, self.title),
            (mutagen.id3.TALB, self.album),
            (mutagen.id3.TDRC, self.year),
            (mutagen.id3.TPE1, self.artist),
            (mutagen.id3.TPUB, self.label),
        )
        for frame_class, value in text_frames:
            if value is not None:
                audiofile.add(frame_class(encoding=3, text=str(value)))
        audiofile.save()
def build_search_query(path):
    """Build a search query for Discogs search from the name of the audio file.

    The file name (without directory and extension) is expected to look like
    ``<channel><DELIMITER><title>``. The query is the title, prefixed with
    the channel name when the channel ends with ``" - Topic"`` (that suffix
    is removed). A file name that does not contain DELIMITER is used as the
    query unchanged; only the first DELIMITER separates channel from title.
    """
    basename = os.path.splitext(os.path.basename(path))[0]
    channel, separator, title = basename.partition(DELIMITER)
    if not separator:
        # Not named by the download step: search for the bare file name
        # rather than failing on the missing delimiter.
        return basename
    topic_suffix = " - Topic"
    if channel.endswith(topic_suffix):
        return channel[:-len(topic_suffix)] + " " + title
    return title
def create_selenium_driver():
    """Start a Firefox WebDriver and mark it as new to Discogs.

    The returned driver carries an extra ``first_time_on_discogs`` attribute,
    initially ``True``, which assisted_tag_search reads and resets.
    """
    firefox = selenium.webdriver.Firefox()
    # Ad-hoc attribute stored on the driver object itself.
    firefox.first_time_on_discogs = True
    return firefox
def inject_css(driver, css):
    """Insert a single CSS rule into the page currently loaded in *driver*.

    A new ``<style>`` element is appended to the document head and *css* is
    added to its sheet with ``insertRule``. The rule text is handed to the
    script as an argument instead of being pasted into the JavaScript
    source, so quotes, backslashes or newlines in *css* cannot break it.
    """
    script = (
        "var el = document.createElement('style');"
        " document.head.appendChild(el);"
        " el.sheet.insertRule(arguments[0], 0);"
    )
    driver.execute_script(script, css)
def assisted_tag_search(driver, query):
    """Open a Discogs search for *query* and let the user pick a track.

    The browser is sent to the release search results. The user is expected
    to navigate to the right release page and then type the position of the
    track on the console. The page is then parsed with TrackTags.from_html.

    Returns the resulting TrackTags, or ``None`` when the user enters an
    empty position to skip the track. If the page cannot be parsed for the
    given position, the problem is reported and the user is asked again.
    """
    # Local import: only this function needs the exception class.
    from selenium.common.exceptions import TimeoutException

    url = DISCOGS_SEARCH_URL + "?" + urllib.parse.urlencode({
        "q": query,
        "type": "release",
    })
    driver.get(url)
    inject_css(driver, "#ad_top, div.sticky_footer { display: none; }")
    if driver.first_time_on_discogs:
        try:
            WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-reject-all-handler"))
            ).click()
        except TimeoutException:
            # The consent banner did not show up; carry on without it.
            print("Cookie banner not found, continuing.")
        driver.first_time_on_discogs = False
    print("\n" + query)
    while True:
        track_position = input("Track position (leave empty to skip)> ").strip()
        if track_position == "":
            return None
        html = driver.find_element(By.CSS_SELECTOR, "div#page").get_attribute("innerHTML")
        try:
            return TrackTags.from_html(html, track_position)
        except (AttributeError, KeyError, StopIteration, ValueError) as error:
            # Typically a mistyped position or a page that is not a release.
            print(f"Could not read track {track_position!r} from the current page ({error!r}).")
def search_for_tags(output_folder):
    """Interactively tag every downloaded MP3 and move it to ``tagged``.

    For each ``*.mp3`` in ``<output_folder>/download`` a Discogs search is
    opened in the browser. When the user selects a track, its tags are
    written to the file, which is then moved to ``<output_folder>/tagged``
    as ``<artist> - <title>.mp3``. Skipped files stay where they are.
    """
    tagged_folder = os.path.join(output_folder, "tagged")
    os.makedirs(tagged_folder, exist_ok=True)
    driver = create_selenium_driver()
    try:
        for path in glob.glob(os.path.join(output_folder, "download", "*.mp3")):
            tags = assisted_tag_search(driver, build_search_query(path))
            if tags is None:
                continue
            tags.apply(path)
            # Artist or title may contain characters that are path separators
            # or are not allowed in file names (e.g. "AC/DC").
            filename = re.sub(r'[\\/:*?"<>|]', "_", f"{tags.artist} - {tags.title}.mp3")
            os.rename(path, os.path.join(tagged_folder, filename))
    finally:
        # quit() ends the whole browser session, also when a step above fails.
        driver.quit()
def main():
    """Parse the command line and run download, cover cropping and tagging.

    Options:
      -u / --url            playlist to process (defaults to STEVEARI_PLAYLIST_URL)
      -d / --skip-download  reuse files that were downloaded earlier
      -o / --output-folder  working folder; defaults to the name derived from
                            today's date and the playlist id
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", type=str, default=STEVEARI_PLAYLIST_URL)
    parser.add_argument("-d", "--skip-download", action="store_true")
    # The generated folder name contains today's date, so resuming a download
    # made on another day requires naming the folder explicitly.
    parser.add_argument("-o", "--output-folder", type=str, default=None)
    args = parser.parse_args()
    output_folder = args.output_folder or generate_output_folder(args.url)
    if not args.skip_download:
        download_playlist(args.url, output_folder)
    # Files whose cover was already processed are skipped by this step, so it
    # is safe to run it on a folder from a previous run as well.
    squarify_thumbnails(output_folder)
    search_for_tags(output_folder)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment