Last active
March 18, 2021 15:29
-
-
Save rubenhorn/c71d82adba694c6211f7a1f14dc79bb3 to your computer and use it in GitHub Desktop.
A script to process youtube playlist takeout
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import csv
import html
import os
import re
import sys

import requests
# Banner printed once at start-up so the tool identifies itself in the console.
banner = '''
#==================================================
# A script to process youtube playlist takeout
#==================================================
'''
print(banner)

# Exactly two positional arguments are required: the folder holding the
# exported playlist CSV files and the path of the HTML file to generate.
if len(sys.argv) != 3:
    print('Usage: {} <takeout playlists path> <output file>'.format(sys.argv[0]))
    # sys.exit() rather than the site-provided exit(): the latter is meant for
    # the interactive shell only. A non-zero status signals the misuse.
    sys.exit(1)

# Resolve both paths up front so later code works with absolute paths.
playlists_folder = os.path.abspath(sys.argv[1])
output_filename = os.path.abspath(sys.argv[2])
def get_playlist_files(playlists_folder):
    """Return the paths of all ``.csv`` files directly inside a folder.

    Args:
        playlists_folder: Directory to scan (not searched recursively).

    Returns:
        A list of paths, each formed by joining ``playlists_folder`` with a
        file name ending in ``.csv``. The list is sorted by file name because
        ``os.listdir`` returns entries in arbitrary order, which would
        otherwise make the playlist order differ between runs.

    Raises:
        OSError: If the folder cannot be listed.
    """
    csv_names = sorted(
        name for name in os.listdir(playlists_folder) if name.endswith('.csv')
    )
    return [os.path.join(playlists_folder, name) for name in csv_names]
# Prefix of a video's watch-page URL; a video ID is appended directly to it.
VIDEO_BASE_URL = "https://www.youtube.com/watch?v="
def get_playlist_title(playlist_file):
    """Return the playlist title stored in a playlist CSV file.

    The title is taken from the fifth column of the second CSV row; the first
    row is skipped. (Presumably row one is the column header and row two the
    playlist metadata of the Takeout export -- verify against the export
    format in use.)

    The file is parsed with the ``csv`` module so that a quoted title
    containing commas is returned whole instead of being cut at the first
    comma, and it is read as UTF-8 regardless of the platform default.

    Args:
        playlist_file: Path of the playlist CSV file.

    Returns:
        The title as a string.

    Raises:
        IndexError: If the file has fewer than two rows or the second row has
            fewer than five columns.
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module expects so it can handle embedded
    # line breaks inside quoted fields itself.
    with open(playlist_file, 'r', encoding='utf-8', newline='') as file:
        rows = csv.reader(file)
        next(rows, None)  # skip the first row
        metadata = next(rows, [])
    return metadata[4]
def get_playlist_video_ids(playlist_file):
    """Return the video IDs listed in a playlist CSV file.

    The first five lines of the file are skipped; from every remaining line
    the text before the first comma is taken as the video ID. Surrounding
    whitespace (including the line break) is stripped and lines that yield an
    empty ID are dropped, so blank or trailing lines do not produce bogus IDs.

    Args:
        playlist_file: Path of the playlist CSV file.

    Returns:
        A list of non-empty video ID strings in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(playlist_file, 'r', encoding='utf-8') as file:
        # Skip the five-line preamble that precedes the video rows.
        video_rows = file.readlines()[5:]
    candidates = (row.split(',')[0].strip() for row in video_rows)
    return [video_id for video_id in candidates if video_id]
def get_video_info(video_id, timeout=30):
    """Fetch a video's watch page and extract its title and channel name.

    The page at ``VIDEO_BASE_URL + video_id`` is downloaded and searched for
    the ``<meta name="title">`` tag and the ``"ownerChannelName"`` entry; both
    captured values are HTML-unescaped.

    Args:
        video_id: ID appended to ``VIDEO_BASE_URL`` to form the page URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        A ``(video_id, title, channel_name)`` tuple, or ``None`` when the
        request fails or either value cannot be found in the page.
    """
    # Best effort: a failed download simply means "no info for this video".
    # Only request errors are caught so Ctrl-C still interrupts the script.
    try:
        response = requests.get(VIDEO_BASE_URL + video_id, timeout=timeout)
    except requests.RequestException:
        return None

    page = response.text
    title_match = re.search(r'<meta name="title" content="([^"]*)">', page)
    channel_match = re.search(r'"ownerChannelName":"([^"]*)"', page)
    if title_match is None or channel_match is None:
        return None

    title = html.unescape(title_match.group(1))
    channel_name = html.unescape(channel_match.group(1))
    return (video_id, title, channel_name)
def export_html(playlist_infos, filename):
    """Write all playlists to a single standalone HTML page.

    Every playlist becomes an ``<h3>`` heading followed by a ``<ul>`` with one
    ``<li>`` per video. Each item has a "Play" link that loads the embed URL
    into the page's ``player`` iframe and a link to the watch page that opens
    in a new tab. The inline script makes each heading toggle the visibility
    of the list after it and triggers that toggle once on load.

    Playlist titles, video titles, channel names and video IDs are
    HTML-escaped before being written, because they originate from exported
    and scraped data and could otherwise break or inject markup.

    Args:
        playlist_infos: Mapping of playlist title to an iterable of
            ``(video_id, title, channel_name)`` sequences.
        filename: Path of the HTML file to create; an existing file is
            overwritten. The file is written as UTF-8.

    Raises:
        OSError: If the output file cannot be written.
    """
    document_template = (
        '<!DOCTYPE html>\n'
        '<html>\n'
        '<head>\n'
        '<title>YouTube playlists</title>\n'
        '</head>\n'
        '<body>\n'
        '<span id="top"></span>\n'
        '<a href="#top" style="position: fixed; right: 10px; bottom: 10px">Go to top</a>\n'
        '<iframe name="player" style="position: fixed; right: 10px; top: 10px"></iframe>\n'
        '{playlists}\n'
        '<script>Array.from(document.getElementsByTagName("h3")).forEach(e =>{e.onclick=()=>{e.nextElementSibling.hidden=!e.nextElementSibling.hidden;};e.onclick()})</script>\n'
        '</body>\n'
        '</html>'
    )
    playlist_open_template = '<h3>{title}</h3>\n<ul>'
    playlist_close = '</ul>'
    video_template = (
        '<li>\n'
        '(<a href="https://www.youtube.com/embed/{video_id}?autoplay=1" target="player">\n'
        'Play\n'
        '</a>) <a href="{base_url}{video_id}" target="_blank">\n'
        '{title}\n'
        '</a> - {channel_name}\n'
        '</li>'
    )

    def render_video(video_info):
        # All fields are filled in one pass, so placeholder-like text inside a
        # title or channel name is never substituted a second time.
        return video_template.format(
            base_url=VIDEO_BASE_URL,
            video_id=html.escape(video_info[0]),
            title=html.escape(video_info[1]),
            channel_name=html.escape(video_info[2]),
        )

    # The document template holds JavaScript braces, so it is split around the
    # placeholder instead of being passed through str.format().
    document_head, _, document_tail = document_template.partition('{playlists}')

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(document_head)
        for playlist_title, video_infos in playlist_infos.items():
            file.write(playlist_open_template.format(title=html.escape(playlist_title)))
            file.writelines(render_video(video_info) for video_info in video_infos)
            file.write(playlist_close)
        file.write(document_tail)
# Driver: collect the video details of every exported playlist, then write
# them all to the requested HTML file.
print('STARTING\n')

playlist_infos = dict()
playlist_files = get_playlist_files(playlists_folder)
playlist_count = len(playlist_files)

for position, playlist_file in enumerate(playlist_files, start=1):
    playlist_title = get_playlist_title(playlist_file)
    print('Assembling playlist ({}/{}) :{}'.format(position, playlist_count, playlist_title))

    video_infos = []
    for video_id in get_playlist_video_ids(playlist_file):
        video_info = get_video_info(video_id)
        # Videos whose details could not be retrieved are left out.
        if video_info is None:
            continue
        print(' - {} by {}'.format(video_info[1], video_info[2]))
        video_infos.append(video_info)

    # Playlists are keyed by title; a later playlist with the same title
    # replaces the earlier entry.
    playlist_infos[playlist_title] = video_infos

export_html(playlist_infos, output_filename)
print('\nDONE')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment