Last active
May 26, 2019 09:22
-
-
Save idlesign/6554f469dda60f4d52b033d95a5f839a to your computer and use it in GitHub Desktop.
Finds YouTube videos you're interested in. PyCon US talks finder example.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import requests | |
import html | |
API_KEY = ''
'''Google API (YouTube Data API v3) key from https://console.developers.google.com/apis/.'''

# Put the titles you're interested in into the RELEVANT string,
# one title per line. Each line is used as a regular expression.
# NOTE: escape (), [], etc.
RELEVANT = '''
'''

CHANNEL = 'UCxs2IIVXaEHHA4BtTiWZ2mQ'
'''YouTube channel ID here.'''

TITLE_POSTFIX = 'PyCon 2019'
'''Postfix to strip from titles.'''

#############################################

# Matches any run of whitespace; used to collapse such runs into one space.
# Written as a raw string because '\s' in a plain literal is an invalid
# escape sequence.
_SPACES = re.compile(r'\s+')

# URL template for the search request. It is filled in with the channel ID,
# the API key and an optional '&pageToken=...' suffix.
_BASE_URL = 'https://www.googleapis.com/youtube/v3/search?order=date&part=snippet&channelId=%(channel)s&maxResults=50&key=%(key)s%(page)s'
def _clean_title(raw_title):
    """Return a video title without HTML entities, postfix and speaker name."""
    title = html.unescape(raw_title)
    title = title.replace(TITLE_POSTFIX, '').strip(' -')

    prefix, separator, rest = title.partition(' - ')
    prefix = prefix.replace('/', ',')

    # Heuristic: a two- or three-word prefix, or one listing several items
    # separated by ',' or '/', is taken to be the speaker name(s) and dropped.
    # Without a ' - ' separator there is nothing to drop.
    if separator and (len(prefix.split(' ')) in {2, 3} or ',' in prefix):
        title = rest

    title = _SPACES.sub(' ', title)
    return title.strip(' -')


def traverse(page=0):
    """Yield ``(video_id, title)`` pairs for the videos found on CHANNEL.

    Requests the URL built from ``_BASE_URL`` page by page, following
    ``nextPageToken`` until the response no longer contains one. Items whose
    kind is not ``youtube#video`` are skipped; titles are cleaned up before
    being yielded.

    :param page: page token to start from; a falsy value (the default)
        requests the first page.
    :raises RuntimeError: if the response contains an ``error`` object.
    """
    # Pages are walked in a loop rather than recursively, so the number of
    # pages is not limited by the interpreter's recursion depth.
    while True:
        url = _BASE_URL % {
            'channel': CHANNEL,
            'key': API_KEY,
            'page': '&pageToken=%s' % page if page else '',
        }

        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=30)
        payload = response.json()

        error = payload.get('error')
        if error:
            raise RuntimeError(error['message'])

        for item in payload['items']:
            if item['id']['kind'] != 'youtube#video':
                continue
            yield item['id']['videoId'], _clean_title(item['snippet']['title'])

        page = payload.get('nextPageToken')
        if not page:
            return
def find_relevant():
    """Print all channel videos, marking those that match RELEVANT.

    Every non-empty line of ``RELEVANT`` is used as a regular expression and
    matched (anchored at the start, via ``re.match``) against each title
    produced by ``traverse()``. The report lists all videos sorted by title,
    with ``[!]`` in front of matching ones, followed by a summary and the
    patterns that matched no video.
    """
    print('Channel: https://www.youtube.com/channel/%s/videos?view=0&sort=dd&flow=list\n' % CHANNEL)

    # One pattern per non-empty line, with whitespace runs collapsed to a
    # single space.
    relevant_lines = [
        _SPACES.sub(' ', line.strip())
        for line in RELEVANT.splitlines()
        if line.strip()
    ]
    total_relevant = len(relevant_lines)

    traversed = list(traverse())[::-1]  # eldest first
    total_traversed = len(traversed)
    len_traversed = len(str(total_traversed))  # width used to zero-pad indexes

    seen = []
    for video_id, title in traversed:
        # Check every remaining pattern against this title and build a new
        # list of the ones that did not match. This keeps the "matched" flag
        # from being overwritten by a later non-matching pattern and avoids
        # removing items from a list while iterating over it.
        still_missing = [
            line for line in relevant_lines if not re.match(line, title)
        ]
        matched = len(still_missing) < len(relevant_lines)
        relevant_lines = still_missing

        seen.append((title, 'https://youtu.be/%s' % video_id, matched))

    for idx, (title, url, matched) in enumerate(sorted(seen), 1):
        print('%s. %s%s' % (str(idx).zfill(len_traversed), '[!] ' if matched else '', title))
        # Indent the URL so it lines up under the title text.
        print('%s%s' % (' ' * (len_traversed + 2), url))

    total_missing = len(relevant_lines)

    print(
        '\nSummary: among %s found %s of %s, missing %s\n' % (
            total_traversed,
            total_relevant - total_missing,
            total_relevant,
            total_missing
        ))

    print('====' * 20)
    print('Missing:\n')

    for idx, line in enumerate(sorted(relevant_lines), 1):
        print('%s. %s' % (idx, line))
# Script entry point: build and print the report. There is no __main__ guard,
# so this also runs when the module is imported.
find_relevant()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment