Finds YouTube videos you're interested in. PyCon US talks finder example.
import re
import requests
import html
API_KEY = ''
'''Google API (YouTube Data API v3) key from https://console.developers.google.com/apis/.'''
# Put the titles you're interested in into the RELEVANT string:
# one title per line. These are regular expressions.
# NOTE: escape (), [], etc.
RELEVANT = '''
'''
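# A minimal illustration of a filled-in RELEVANT block (the patterns below are
# hypothetical examples, not taken from the actual PyCon 2019 schedule):
#
# RELEVANT = '''
# Keynote.*
# Writing async code .*
# Type hints \(an introduction\)
# '''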
CHANNEL = 'UCxs2IIVXaEHHA4BtTiWZ2mQ'
'''YouTube channel ID here.'''
TITLE_POSTFIX = 'PyCon 2019'
'''Postfix to strip from titles.'''
#############################################
_SPACES = re.compile(r'\s+')  # raw string avoids the invalid escape sequence warning
_BASE_URL = (
    'https://www.googleapis.com/youtube/v3/search'
    '?order=date&part=snippet&channelId=%(channel)s&maxResults=50&key=%(key)s%(page)s'
)
def traverse(page=0):
    """Yields (video_id, title) tuples for the channel, following pagination."""

    params = {
        'channel': CHANNEL,
        'key': API_KEY,
        'page': '',
    }

    if page:
        params['page'] = '&pageToken=%s' % page

    url = _BASE_URL % params

    response = requests.get(url)
    json = response.json()

    error = json.get('error')

    if error:
        raise Exception(error['message'])

    next_page = json.get('nextPageToken')

    for item in json['items']:

        if item['id']['kind'] != 'youtube#video':
            continue

        video_id = item['id']['videoId']

        title = html.unescape(item['snippet']['title'])
        title = title.replace(TITLE_POSTFIX, '').strip(' -')

        split = title.split(' - ', 1)
        prefix = split[0]
        prefix = prefix.replace('/', ',')

        if len(prefix.split(' ')) in {2, 3} or (',' in prefix):
            # strip person name
            try:
                title = split[1]
            except IndexError:
                pass

        title = _SPACES.sub(' ', title)
        title = title.strip(' -')

        yield video_id, title

    if next_page:
        yield from traverse(page=next_page)
def find_relevant():
    """Prints all channel videos, flagging those matching RELEVANT, plus a summary."""

    print('Channel: https://www.youtube.com/channel/%s/videos?view=0&sort=dd&flow=list\n' % CHANNEL)

    relevant_lines = []

    for line in RELEVANT.splitlines():
        line = line.strip()

        if line:
            line = _SPACES.sub(' ', line)
            relevant_lines.append(line)

    total_relevant = len(relevant_lines)
    total_traversed = 0
    seen = []

    traversed = list(traverse())[::-1]  # oldest first
    len_traversed = len(str(len(traversed)))

    for video_id, title in traversed:
        total_traversed += 1
        matched = False

        for line in relevant_lines:
            matched = re.match(line, title)

            if matched:
                relevant_lines.remove(line)
                # stop here: the pattern is consumed, and continuing would
                # both overwrite `matched` and skip the next pattern
                break

        url = 'https://youtu.be/%s' % video_id
        seen.append((title, url, bool(matched)))

    for idx, seen_item in enumerate(sorted(seen), 1):
        title, url, matched = seen_item

        print('%s. %s%s' % (str(idx).zfill(len_traversed), '[!] ' if matched else '', title))

        if url:
            print('%s%s' % (' ' * (len_traversed + 2), url))

    total_missing = len(relevant_lines)

    print(
        '\nSummary: among %s found %s of %s, missing %s\n' % (
            total_traversed,
            total_relevant - total_missing,
            total_relevant,
            total_missing
        ))

    print('====' * 20)
    print('Missing:\n')

    for idx, line in enumerate(sorted(relevant_lines), 1):
        print('%s. %s' % (idx, line))


find_relevant()