from bs4 import BeautifulSoup
import re
import requests
import csv
import time
import json
# version 1.1 added handling of youtube.com/channel/ urls
# in addition to the existing handling of youtube.com/user based channels
# NOTE: GET requests to youtube are throttled to 3 seconds between requests
# this is an ad hoc attempt to look like a human to youtube
# so youtube does not start limiting access
wait_between_requests = 3
""" scrape youtube channel to build table of contents html file and
csv of video information for excel file
note this code has a slow down delay to meet youtube terms of use
"""
# set youtube channel name here
channel_name = 'gjenkinslbcc'
youtube_base = 'https://www.youtube.com/'
# others to try
# gotreehouse howgrowvideo gjenkinslbcc howgrowvideo
# by channel name:
# UCu8YylsPiu9XfaQC74Hr_Gw UCn34N9fj3x92kOGnQQdHkAQ
# UCH4aPBlmmW1Vgs0ykktCMUg
parent_folder = '' # users or channel or empty

def get_soup(url):
    """open url and return BeautifulSoup object,
    or None if site does not exist"""
    result = requests.get(url)
    if result.status_code != 200: return None
    time.sleep(wait_between_requests)  # slow down to human speed
    return BeautifulSoup(result.text, 'html.parser')

def channel_section_links(channel_name):
    """list of
    { 'title': <section title>,
      'link': <url to section play lists> }"""
    global parent_folder
    soup = get_soup(f'{youtube_base}/user/{channel_name}/playlists')
    if soup is None or 'This channel does not exist.' in soup.text:
        url = f'{youtube_base}/channel/{channel_name}/playlists'
        soup = get_soup(url)
        if soup is None or 'This channel does not exist.' in soup.text:
            raise ValueError(
                'The channel does not exist: ' + channel_name)
        parent_folder = 'channel/'
    play_list_atags = \
        soup.find_all('a',
                      {'href': re.compile(f'{channel_name}/playlists')})
    # filter out non user play lists next
    elements = [{'title': x.text.strip(),
                 'link': fix_url(x['href'])} for x in play_list_atags
                if x.span and
                ('shelf_id=0' not in x['href'])]
    # no sections, make up a 'no sections' section with default link
    if len(elements) == 0:
        url = f'{youtube_base}{parent_folder}{channel_name}/playlists'
        elements = [{'title': 'no sections', 'link': url}]
        # e.g. https://youtube.com/gotreehouse/playlists
    return elements

def fix_url(url):  # correct relative urls back to absolute urls
    if url[0] == '/':
        return youtube_base + url
    else:
        return url

def get_playlists(section):
    """returns list of
    { 'title': <playlist title>, 'link': <link to all playlist videos> }"""
    global parent_folder
    print(f" getting playlists for section: {section['title']}")
    soup = get_soup(section['link'])
    if soup is None:  # no playlists, create dummy with default link
        url = f'{youtube_base}{parent_folder}{channel_name}/videos'
        return [{'title': 'No Playlists', 'link': url}]
    atags = soup('a', class_='yt-uix-tile-link')
    playlists = []
    for a in atags:  # find title and link
        title = a.text
        if title != 'Liked videos':  # skip these
            url = fix_url(a['href'])
            playlists.append({'title': title, 'link': url})
    if not playlists:  # no playlists
        url = f'{youtube_base}/{parent_folder}{channel_name}/videos'
        return [{'title': 'No Playlists', 'link': url}]
    return playlists

def parse_video(vurl):
    # return dict of
    # title, link, views, publication_date,
    # description, short_link, likes, dislikes
    d = {'link': vurl, 'views': None, 'short_link': vurl,
         'likes': None, 'dislikes': None}
    # now get video page and pull information from it
    vsoup = get_soup(vurl)
    o = vsoup.find('title')
    vtitle = o.text.strip()
    xending = ' - YouTube'
    d['title'] = vtitle[:-len(xending)] \
        if vtitle.endswith(xending) else vtitle
    print(f" processing video '{d['title']}'")
    # o is used in the code below to
    # catch missing data targets for scraping
    o = vsoup.find('div', class_='watch-view-count')
    if o:
        views = o.text
        d['views'] = ''.join(c for c in views if c in '0123456789')
    o = vsoup.find('strong', class_='watch-time-text')
    d['publication_date'] = \
        o.text[len('Published on ') - 1:] if o else ''
    o = vsoup.find('div', id='watch-description-text')
    d['description'] = o.text if o else ''
    o = vsoup.find('meta', itemprop='videoId')
    if o:
        vid = o['content']
        d['short_link'] = f'https://youtu.be/{vid}'
    o = vsoup.find('button',
                   class_='like-button-renderer-like-button')
    if o:
        o = o.find('span', class_='yt-uix-button-content')
        d['likes'] = o.text if o else ''
    o = vsoup.find('button',
                   class_='like-button-renderer-dislike-button')
    if o:
        o = o.find('span', class_='yt-uix-button-content')
        d['dislikes'] = o.text if o else ''
    return d

def add_videos(playlist):
    """find videos in playlist[link]
    and add their info as playlist[videos] as list"""
    surl = playlist['link']
    soup = get_soup(surl)
    print(f" getting videos for playlist: {playlist['title']}")
    videos = []
    # items are the list of video a links from the playlist page
    items = soup('a', class_='yt-uix-tile-link')
    # note: the first part of the loop gets info from the playlist page item,
    # and the last part opens the video page and gets more details
    if len(items) > 0:
        for i in items:
            d = dict()
            vurl = fix_url(i['href'])
            t = i.find_next('span', {'aria-label': True})
            d['time'] = t.text if t else 'NA'
            d.update(parse_video(vurl))
            videos.append(d)
    else:  # must be only one video
        d = {'time': 'NA'}
        d.update(parse_video(surl))
        videos.append(d)
    # add new key to this playlist of list of video infos
    playlist['videos'] = videos
    print()

def tag(t, c):
    return f'<{t}>{c}</{t}>'  # return html tag with content

def link(text, url):  # return a tag with content and link
    return f'<a href="{url}">{text}</a>'

def html_out(channel, sections):
    """create and write channel_name.html file"""
    title = f'YouTube Channel {channel}'
    f = open(f'{channel}.html', 'w')
    template = ('<!doctype html>\n<html lang="en">\n<head>\n'
                '<meta charset="utf-8">'
                '<title>{}</title>\n</head>\n'
                '<body>\n{}\n</body>\n</html>')
    parts = list()
    parts.append(tag('h1', title))
    for s in sections:
        parts.append(tag('h2', link(s['title'], s['link'])))
        for pl in s['playlists']:
            parts.append(tag('h3', link(pl['title'], pl['link'])))
            if len(pl['videos']) == 0:  # check the video list, not the dict
                parts.append('<p>Empty Playlist</p>')
            else:
                parts.append('<ol>')
                for v in pl['videos']:
                    t = '' if v['time'] == 'NA' else f" ({v['time']})"
                    parts.append(tag('li', link(v['title'],
                                                v['short_link']) + t))
                parts.append('</ol>')
    f.write(template.format(channel, '\n'.join(parts)))
    f.close()

def csv_out(channel, sections):
    """create and output channel_name.csv
    file for import into a spreadsheet or DB"""
    headers = ('channel,section,playlist,video,'
               'link,time,views,publication date,'
               'likes,dislikes,description').split(',')
    with open(f'{channel}.csv', 'w') as csv_file:
        csvf = csv.writer(csv_file, delimiter=',')
        csvf.writerow(headers)
        for section in sections:
            for playlist in section['playlists']:
                for video in playlist['videos']:
                    v = video
                    line = [channel,
                            section['title'],
                            playlist['title'],
                            v['title']]
                    line.extend([v['short_link'],
                                 v['time'], v['views'],
                                 v['publication_date'],
                                 v['likes'], v['dislikes'],
                                 v['description']])
                    csvf.writerow(line)

def process_channel(channel_name):
    sections = channel_section_links(channel_name)
    for section in sections:
        section['playlists'] = get_playlists(section)
        for playlist in section['playlists']:
            add_videos(playlist)
    return sections

if __name__ == '__main__':
    # find channel name by going to the channel
    # and picking the last element from the channel url
    # for example my channel url is:
    # https://www.youtube.com/user/gjenkinslbcc
    # my channel name is gjenkinslbcc in this url
    # this is set near the top of this file
    # if the channel is of the form:
    # https://www.youtube.com/channel/xyz then supply xyz
    print(f'finding sections for youtube.com {channel_name}')
    sections = process_channel(channel_name)
    # save sections structure to json file
    with open(f'{channel_name}.json', 'w') as f:
        f.write(json.dumps(sections, sort_keys=True, indent=4))
    html_out(channel_name, sections)  # create web page of channel links
    # create a csv file of video info for import into spreadsheet
    csv_out(channel_name, sections)
    print(f"Program Complete,\n '{channel_name}.html' and"
          f" '{channel_name}.csv' have been"
          f" written to current directory")
Program to create webpage index of all playlists and videos from any YouTube channel.

Python program that 'screen scrapes' the list of playlists and videos from a specified channel and creates a web page index to them, a CSV file of video information for a spreadsheet, and a JSON file of the same data. I use this to create a nice page that links to all my videos. It works with any channel, although be aware that YouTube limits the number of videos shown per page, which may affect very large playlists.
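The JSON file written at the end of the script is simply the sections structure returned by process_channel: a list of sections, each holding a 'playlists' list, and each playlist holding a 'videos' list of the dicts built by add_videos/parse_video. Here is a minimal sketch of reading that file back in another script; the file name assumes the default channel_name of gjenkinslbcc set near the top of the script:

```python
import json

# load the sections structure written by the script
# (file name assumes the default channel_name = 'gjenkinslbcc')
with open('gjenkinslbcc.json') as f:
    sections = json.load(f)

# walk sections -> playlists -> videos and print a short outline
for section in sections:
    print(section['title'])
    for playlist in section['playlists']:
        print('  ' + playlist['title'])
        for video in playlist['videos']:
            print(f"    {video['title']} ({video['time']}) {video['short_link']}")
```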

See my post: @ blog article
