@cgarz
Created April 29, 2020 03:43
YouTube Watch Later stats. Gets statistics and video links from your Watch Later playlist using Python 3, requests, BeautifulSoup, and browser_cookie3 for authentication.
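The description names browser_cookie3 as the authentication mechanism. As a minimal sketch of that cookie hand-off in isolation (this assumes a Firefox profile that is already signed in to YouTube; the playlist URL is only an example):

import browser_cookie3
import requests

# Reuse Firefox's youtube.com cookies so the request is treated as the
# logged-in browser session (assumes Firefox is signed in to YouTube).
cookie_jar = browser_cookie3.firefox(domain_name='youtube.com')
response = requests.get('http://www.youtube.com/playlist?list=WL', cookies=cookie_jar)
print(response.status_code)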
#!/usr/bin/env python3
# adapted from https://github.com/bulbipop/ytwlstats/blob/master/cmd.py

from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
from argparse import ArgumentParser
from filecmp import cmp as compare
import requests
import browser_cookie3
import sys
import os

# PARSER = 'html.parser'
PARSER = 'lxml'
YT_PREFIX = 'http://www.youtube.com'
LINK_LIST_OUT = 'YT_WL_links.txt'
AUTO_NAME_PFX = 'YT_WL_stats_'
AUTO_NAME_DIR = 'stats'
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
TIMESTAMP_FMT = '%Y-%m-%d_%H-%M-%S'
S_INDEX = 'Index....: '
S_TITLE = 'Title....: '
S_LINK = 'Link.....: '
S_UPLOADER = 'Uploader.: '
S_DURATION = 'Duration.: '
S_NO_DATA = 'N/A'


def req(url, cookie_jar):
    print('Downloading ' + url)
    return requests.get(url, cookies=cookie_jar)


def calc_durations(length, speeds=(1, 1.5, 2, 3)):
    """Generator to calculate length at different speeds."""
    for speed in speeds:
        new_play_len = length / speed
        ms = new_play_len.microseconds
        new_play_len -= timedelta(microseconds=ms)  # truncate to whole seconds
        yield new_play_len, speed


def scrape_stats():
    cj = browser_cookie3.firefox(domain_name='youtube.com')
    url = YT_PREFIX + '/playlist?list=WL'
    html = req(url, cj).text
    button = bs(html, PARSER).find(class_='load-more-button')

    # The AJAX pages must be parsed separately, so they are collected in html_extra. Appending them to
    # html apparently corrupts the document structure, causing BeautifulSoup to fail to find the
    # pl-video class for every video after the load-more button.
    html_extra = ''
    while button:  # keep paging through the "load more" endpoint until no button remains
        url = YT_PREFIX + button['data-uix-load-more-href']
        ajax = req(url, cj).json()
        html_extra += ajax['content_html']
        button = bs(ajax['load_more_widget_html'], PARSER).button

    soup_main = bs(html, PARSER)
    soup_extra = bs(html_extra, PARSER) if html_extra else ''

    vid_entries = []
    total = timedelta()
    count = 0
    for soup in (soup_main, soup_extra):
        if soup:
            for video in soup.find_all(class_='pl-video'):
                count += 1
                vid_entry = {'index': str(count)}
                for link in video.find_all(class_='pl-video-title-link'):
                    vid_entry['title'] = link.text.strip()
                    vid_entry['link'] = YT_PREFIX + link['href'].split('&')[0]
                    break
                else:
                    print('WARNING: No link with class "pl-video-title-link" in "pl-video". Skipping.')
                    continue
                for owner in video.find_all(class_='pl-video-owner'):
                    vid_entry['uploader'] = owner.a.text.strip()
                    break
                else:
                    vid_entry['uploader'] = S_NO_DATA
                for timestamp in video.find_all(class_='timestamp'):
                    vid_entry['duration'] = timestamp.text.strip()
                    *h, m, s = list(map(int, timestamp.text.strip().split(':')))
                    total += timedelta(hours=next(iter(h), 0), minutes=m, seconds=s)
                    break
                else:
                    vid_entry['duration'] = S_NO_DATA
                vid_entries.append(vid_entry)
    return vid_entries, total


def main():
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    parser = ArgumentParser(description="Scrapes various information from videos in a user's YouTube Watch Later list.")
    parser.add_argument('-f', '--clobber', action='store_true',
                        help='Force overwriting (clobbering) of output files. Default is to abort if the file exists.')
    parser.add_argument('-o', '--out-file', nargs='?', type=str, default=None,
                        help='Output file to print all video stats to. Defaults to stdout.')
    parser.add_argument('-a', '--auto-name', action='store_true',
                        help=(f'Set output filename and path automatically with format: '
                              f'"[SCRIPT_DIR]/{AUTO_NAME_DIR}/{AUTO_NAME_PFX}{TIMESTAMP_FMT.replace("%", "%%")}.txt". '
                              'Will also remove the output file if the last auto-named file is identical.'))
    args = parser.parse_args()

    if args.auto_name:
        if args.out_file:
            parser.error('Cannot specify output file with auto output file naming.')
        auto_name = f'{AUTO_NAME_DIR}{os.path.sep}{AUTO_NAME_PFX}{datetime.strftime(datetime.now(), TIMESTAMP_FMT)}.txt'
        print('Auto name mode. Setting output filename to:', auto_name)
        args.out_file = auto_name
        if not os.path.isdir(AUTO_NAME_DIR):
            os.mkdir(AUTO_NAME_DIR)

    if not args.out_file or args.out_file == '-':
        args.out_file = sys.stdout
    elif not args.clobber and os.path.isfile(args.out_file):
        parser.error('Output file exists. To force overwrite, specify --clobber (-f).')
    else:
        args.out_file = open(args.out_file, 'w')
    print('Beginning WL scrape.')
    vid_entries, total_playtime = scrape_stats()
    print()

    playtime_stats = [f'{len(vid_entries)} videos with a total duration of:']
    for new_play_len, speed in calc_durations(total_playtime):
        playtime_stats.append(f'{new_play_len} at {speed}x')

    # Always write a plain list of video URLs alongside the stats output.
    print('Writing video links to:', LINK_LIST_OUT)
    with open(LINK_LIST_OUT, 'w') as f:
        f.write('\n'.join([vid['link'] for vid in vid_entries]) + '\n')
    print('Done.')
    print()

    if args.out_file != sys.stdout:
        print('Writing video stats to:', args.out_file.name)
    for vid_entry in vid_entries:
        args.out_file.write(S_INDEX + vid_entry['index'] + '\n')
        args.out_file.write(S_TITLE + vid_entry['title'] + '\n')
        args.out_file.write(S_LINK + vid_entry['link'] + '\n')
        args.out_file.write(S_UPLOADER + vid_entry['uploader'] + '\n')
        args.out_file.write(S_DURATION + vid_entry['duration'] + '\n')
        args.out_file.write('\n')
    args.out_file.write('\n'.join(playtime_stats) + '\n')
    if args.out_file != sys.stdout:
        print('Done.')
        args.out_file.close()
        print()
    if args.auto_name:
        stat_files = [f for f in os.listdir(AUTO_NAME_DIR) if f.startswith(AUTO_NAME_PFX)
                      and f.endswith('.txt') and not f == os.path.basename(auto_name)]
        stat_files.sort()
        if stat_files:  # skip the comparison on the first run, when no previous auto-named file exists
            last_stat_file = os.path.join(AUTO_NAME_DIR, stat_files[-1])
            if compare(last_stat_file, auto_name, shallow=False):
                print('Auto named file identical to previous.', '\nPrevious:', last_stat_file)
                print('Removing:', auto_name)
                os.remove(auto_name)
                print()

    if args.out_file != sys.stdout:
        print(' '.join(playtime_stats[:2]))


if __name__ == '__main__':
    main()