Skip to content

Instantly share code, notes, and snippets.

@Yiannis128
Last active May 16, 2021 12:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Yiannis128/4a9c016236edf41493176a59bb0a1be0 to your computer and use it in GitHub Desktop.
Save Yiannis128/4a9c016236edf41493176a59bb0a1be0 to your computer and use it in GitHub Desktop.
This is a Python script that extracts your YouTube subscriptions into a list of RSS feed URLs. Pass the script one argument: the path to a saved HTML copy of https://www.youtube.com/feed/channels. Make sure the whole page was saved (scroll to the bottom first), because the script can only see the channels present in that file.
#!/usr/bin/env python3
# Licence: GPLV3
import requests
from bs4 import BeautifulSoup as soup
from sys import argv as argv
from time import sleep
def get_channel_id(url):
    """Fetch a YouTube channel page and extract its name and channel id.

    Parameters:
        url: URL of the channel's YouTube page.

    Returns:
        A ``(channel_name, channel_id)`` tuple of strings.

    Raises:
        ValueError: if the page has no title or no channelId meta tag.
        requests.HTTPError: if the HTTP request failed.
    """

    def get_channel_name():
        # Title is of the form "NAME - YouTube", so the suffix needs
        # to be stripped off.
        title = parser.title.text
        if title is None:
            raise ValueError("channel page has no <title> text: " + url)
        return title.replace(" - YouTube", "")

    channel_html = requests.get(url)
    # Fail loudly on HTTP errors instead of parsing an error page.
    channel_html.raise_for_status()
    parser = soup(channel_html.text, "html.parser")
    # Get the channel name
    channel_name = get_channel_name()
    # Loop through all the meta elements, one of them contains
    # the channel id as an attribute.
    # The meta tags with channel id are of the form:
    # <meta itemprop="channelId" content="....">
    for meta_element in parser.find_all("meta"):
        # The meta tag whose itemprop attribute is "channelId"
        # carries the channel id in its content attribute.
        if meta_element.attrs.get("itemprop") == "channelId":
            return (channel_name, meta_element.attrs["content"])
    # The original implementation fell off the end here and returned
    # None, which made the caller's tuple unpacking crash with an
    # opaque TypeError; raise a clear error instead.
    raise ValueError("no channelId meta tag found at " + url)
def get_ch_ids_list(parser, max_retries=5):
    """Print an RSS feed URL for every subscribed channel found in the page.

    Parameters:
        parser: BeautifulSoup parse tree of a saved
            https://www.youtube.com/feed/channels page.
        max_retries: how many times to retry a channel whose page could
            not be fetched/parsed before skipping it. The original code
            retried forever, which turned one persistently broken
            channel into an infinite loop.
    """

    def get_channel_url():
        # The element with id "main-link" holds an href attribute
        # pointing at the channel's page.
        main_link_tag = channel_html.find(id="main-link")
        return main_link_tag.attrs["href"]

    # YouTube does not load all channels into one grid container.
    # It loads multiple grid containers, each holding a chunk of the
    # channels, because channels are loaded as you scroll down the
    # page — so every container has to be parsed.
    grid_containers = parser.find_all(id="grid-container")
    for grid_container in grid_containers:
        # Iterate through the channels in this container.
        for channel_html in grid_container.children:
            # Retry transient failures, but give up after max_retries
            # attempts instead of looping forever.
            for _attempt in range(max_retries):
                try:
                    channel_url = get_channel_url()
                    # Resolve the channel's display name and id.
                    channel_name, channel_id = get_channel_id(channel_url)
                    print("#", channel_name)
                    print("https://www.youtube.com/feeds/videos.xml?channel_id=" + channel_id, "\"!YouTube\"")
                    # Sleep 1 second as to not piss off YT.
                    sleep(1)
                    break
                except Exception as error:
                    # If we get an error, then retry.
                    print("!Error: " + str(error), "Retrying...")
if __name__ == "__main__":
    # Require exactly one argument: the path of the saved
    # https://www.youtube.com/feed/channels page. The original code
    # crashed with a raw IndexError when run without arguments.
    if len(argv) < 2:
        raise SystemExit("Usage: " + argv[0] + " <saved channels html file>")
    channels_html_path = argv[1]
    print("Using html file: " + channels_html_path)
    # Explicit encoding so the saved page parses the same regardless
    # of the platform's default locale encoding.
    with open(channels_html_path, "r", encoding="utf-8") as file:
        print("# Youtube Subscriptions")
        parser = soup(file.read(), 'html.parser')
        channel_ids = get_ch_ids_list(parser)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment