@cgarz
Created April 29, 2020 03:43
YouTube Watch Later stats. Gets statistics and video links from your Watch Later playlist using Python 3, requests, BeautifulSoup, and browser_cookie3 for authentication.
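The description names browser_cookie3 as the authentication mechanism. As a minimal sketch of that cookie hand-off in isolation (this assumes a Firefox profile that is already signed in to YouTube; the playlist URL is only an example):

import browser_cookie3
import requests

# Reuse Firefox's youtube.com cookies so the request is treated as the
# logged-in browser session (assumes Firefox is signed in to YouTube).
cookie_jar = browser_cookie3.firefox(domain_name='youtube.com')
response = requests.get('http://www.youtube.com/playlist?list=WL', cookies=cookie_jar)
print(response.status_code)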
#!/usr/bin/env python3
# adapted from https://github.com/bulbipop/ytwlstats/blob/master/cmd.py

from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
from argparse import ArgumentParser
from filecmp import cmp as compare
import requests
import browser_cookie3
import sys
import os

# PARSER = 'html.parser'
PARSER = 'lxml'
YT_PREFIX = 'http://www.youtube.com'
LINK_LIST_OUT = 'YT_WL_links.txt'
AUTO_NAME_PFX = 'YT_WL_stats_'
AUTO_NAME_DIR = 'stats'
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
TIMESTAMP_FMT = '%Y-%m-%d_%H-%M-%S'
S_INDEX = 'Index....: '
S_TITLE = 'Title....: '
S_LINK = 'Link.....: '
S_UPLOADER = 'Uploader.: '
S_DURATION = 'Duration.: '
S_NO_DATA = 'N/A'


def req(url, cookie_jar):
    print('Downloading ' + url)
    return requests.get(url, cookies=cookie_jar)


def calc_durations(length, speeds=(1, 1.5, 2, 3)):
    """Generator to calculate length at different speeds."""
    for speed in speeds:
        new_play_len = length / speed
        ms = new_play_len.microseconds
        new_play_len -= timedelta(microseconds=ms)  # truncate to whole seconds
        yield new_play_len, speed


def scrape_stats():
    cj = browser_cookie3.firefox(domain_name='youtube.com')
    url = YT_PREFIX + '/playlist?list=WL'
    html = req(url, cj).text
    button = bs(html, PARSER).find(class_='load-more-button')

    # The AJAX pages must be parsed separately, so they are collected in html_extra. Appending them to
    # html apparently corrupts the document structure, causing BeautifulSoup to fail to find the
    # pl-video class for every video after the load-more button.
    html_extra = ''
    while button:  # keep paging through the "load more" endpoint until no button remains
        url = YT_PREFIX + button['data-uix-load-more-href']
        ajax = req(url, cj).json()
        html_extra += ajax['content_html']
        button = bs(ajax['load_more_widget_html'], PARSER).button

    soup_main = bs(html, PARSER)
    soup_extra = bs(html_extra, PARSER) if html_extra else ''

    vid_entries = []
    total = timedelta()
    count = 0
    for soup in (soup_main, soup_extra):
        if soup:
            for video in soup.find_all(class_='pl-video'):
                count += 1
                vid_entry = {'index': str(count)}
                for link in video.find_all(class_='pl-video-title-link'):
                    vid_entry['title'] = link.text.strip()
                    vid_entry['link'] = YT_PREFIX + link['href'].split('&')[0]
                    break
                else:
                    print('WARNING: No link with class "pl-video-title-link" in "pl-video". Skipping.')
                    continue
                for owner in video.find_all(class_='pl-video-owner'):
                    vid_entry['uploader'] = owner.a.text.strip()
                    break
                else:
                    vid_entry['uploader'] = S_NO_DATA
                for timestamp in video.find_all(class_='timestamp'):
                    vid_entry['duration'] = timestamp.text.strip()
                    *h, m, s = list(map(int, timestamp.text.strip().split(':')))
                    total += timedelta(hours=next(iter(h), 0), minutes=m, seconds=s)
                    break
                else:
                    vid_entry['duration'] = S_NO_DATA
                vid_entries.append(vid_entry)
    return vid_entries, total


def main():
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    parser = ArgumentParser(description="Scrapes various information from videos in a user's YouTube Watch Later list.")
    parser.add_argument('-f', '--clobber', action='store_true',
                        help='Force overwriting (clobbering) of output files. Default is to abort if the file exists.')
    parser.add_argument('-o', '--out-file', nargs='?', type=str, default=None,
                        help='Output file to print all video stats to. Defaults to stdout.')
    parser.add_argument('-a', '--auto-name', action='store_true',
                        help=(f'Set output filename and path automatically with format: '
                              f'"[SCRIPT_DIR]/{AUTO_NAME_DIR}/{AUTO_NAME_PFX}{TIMESTAMP_FMT.replace("%", "%%")}.txt". '
                              'Will also remove the output file if the last auto-named file is identical.'))
    args = parser.parse_args()

    if args.auto_name:
        if args.out_file:
            parser.error('Cannot specify output file with auto output file naming.')
        auto_name = f'{AUTO_NAME_DIR}{os.path.sep}{AUTO_NAME_PFX}{datetime.strftime(datetime.now(), TIMESTAMP_FMT)}.txt'
        print('Auto name mode. Setting output filename to:', auto_name)
        args.out_file = auto_name
        if not os.path.isdir(AUTO_NAME_DIR):
            os.mkdir(AUTO_NAME_DIR)

    if not args.out_file or args.out_file == '-':
        args.out_file = sys.stdout
    elif not args.clobber and os.path.isfile(args.out_file):
        parser.error('Output file exists. To force overwrite, specify --clobber (-f).')
    else:
        args.out_file = open(args.out_file, 'w')
    print('Beginning WL scrape.')
    vid_entries, total_playtime = scrape_stats()
    print()

    playtime_stats = [f'{len(vid_entries)} videos with a total duration of:']
    for new_play_len, speed in calc_durations(total_playtime):
        playtime_stats.append(f'{new_play_len} at {speed}x')

    # Always write a plain list of video URLs alongside the stats output.
    print('Writing video links to:', LINK_LIST_OUT)
    with open(LINK_LIST_OUT, 'w') as f:
        f.write('\n'.join([vid['link'] for vid in vid_entries]) + '\n')
    print('Done.')
    print()

    if args.out_file != sys.stdout:
        print('Writing video stats to:', args.out_file.name)
    for vid_entry in vid_entries:
        args.out_file.write(S_INDEX + vid_entry['index'] + '\n')
        args.out_file.write(S_TITLE + vid_entry['title'] + '\n')
        args.out_file.write(S_LINK + vid_entry['link'] + '\n')
        args.out_file.write(S_UPLOADER + vid_entry['uploader'] + '\n')
        args.out_file.write(S_DURATION + vid_entry['duration'] + '\n')
        args.out_file.write('\n')
    args.out_file.write('\n'.join(playtime_stats) + '\n')
    if args.out_file != sys.stdout:
        print('Done.')
        args.out_file.close()
        print()
    if args.auto_name:
        stat_files = [f for f in os.listdir(AUTO_NAME_DIR) if f.startswith(AUTO_NAME_PFX)
                      and f.endswith('.txt') and not f == os.path.basename(auto_name)]
        stat_files.sort()
        if stat_files:  # skip the comparison on the first run, when no previous auto-named file exists
            last_stat_file = os.path.join(AUTO_NAME_DIR, stat_files[-1])
            if compare(last_stat_file, auto_name, shallow=False):
                print('Auto named file identical to previous.', '\nPrevious:', last_stat_file)
                print('Removing:', auto_name)
                os.remove(auto_name)
                print()

    if args.out_file != sys.stdout:
        print(' '.join(playtime_stats[:2]))


if __name__ == '__main__':
    main()