Last active
September 30, 2022 23:13
-
-
Save subtleGradient/76d71f9bc8d5d947e1494c2f113d8e46 to your computer and use it in GitHub Desktop.
Command-line Python script to search YouTube
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json | |
import os | |
def install():
    '''Install pyppeteer, pyppeteer_stealth and the pinned Chromium build.

    Sets ``PYPPETEER_CHROMIUM_REVISION`` in this process's environment (child
    processes inherit it), then runs pip followed by ``pyppeteer-install``.
    Installation is best-effort: the commands' exit codes are ignored.
    '''
    import subprocess
    import sys

    os.environ['PYPPETEER_CHROMIUM_REVISION'] = '1052357'
    # Run pip through the interpreter executing this script so the packages
    # land where this script imports them from, not in whichever ``python3``
    # happens to be first on PATH.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q',
         'pyppeteer', 'pyppeteer_stealth'],
        check=False,
    )
    # Kept as a shell command: a missing ``pyppeteer-install`` executable then
    # only yields a non-zero status instead of raising.
    os.system('pyppeteer-install')
async def get_videos():
    '''Scrape the title, URL and metadata of every search result on the page.

    Evaluates a script in the module-level ``page`` and returns its result:
    one object per ``ytd-video-renderer`` element, or a single
    ``{error: message}`` entry when the script throws.
    '''
    scrape_results_js = '''() => {
        try {
            return Array.from(document.querySelectorAll("ytd-video-renderer")).map(video => ({
                title: video.querySelector("#video-title")?.innerText.trim(),
                url: video.querySelector("#video-title")?.href.trim(),
                length: video.querySelector("ytd-thumbnail-overlay-time-status-renderer")?.innerText.trim(),
                views: video.querySelector("#metadata-line span:nth-child(1)")?.innerText.trim(),
                date: video.querySelector("#metadata-line span:nth-child(2)")?.innerText.trim(),
                channel_name: video.querySelector("ytd-channel-name a")?.textContent.trim(),
                channel_url: video.querySelector("ytd-channel-name a")?.href.trim(),
            }))
        } catch (error) {
            return [{ error: error.message }]
        }
    }'''
    # ``page`` is only read here, so no ``global`` declaration is needed.
    return await page.evaluate(scrape_results_js)
def url_for_youtube_search(search_query, over_20_min=True):
    '''Build the YouTube results URL for ``search_query``.

    When ``over_20_min`` is truthy the ``sp`` filter value ``EgIYAg%3D%3D``
    is appended; it is percent-encoded again by ``urlencode``, so it appears
    in the URL as ``EgIYAg%253D%253D``.
    '''
    import urllib.parse

    params = {'search_query': search_query}
    if over_20_min:
        params['sp'] = 'EgIYAg%3D%3D'
    encoded_query = urllib.parse.urlencode(params, doseq=True)
    return urllib.parse.urlunsplit(
        ('https', 'www.youtube.com', '/results', encoded_query, ''))
# Import-time self-checks pinning the exact URLs the builder must produce,
# with and without the ``sp`` filter.
assert (
    url_for_youtube_search('jason statham interview')
    == 'https://www.youtube.com/results?search_query=jason+statham+interview&sp=EgIYAg%253D%253D'
)
assert (
    url_for_youtube_search('jason statham interview', over_20_min=False)
    == 'https://www.youtube.com/results?search_query=jason+statham+interview'
)
# search YouTube for a single query and return the scraped video metadata
async def search_youtube(search_string="jason statham interview"):
    '''Search YouTube for ``search_string`` and return the scraped results.

    Launches headless Chromium through pyppeteer, applies pyppeteer_stealth
    to a new page, opens the URL from ``url_for_youtube_search`` and returns
    whatever ``get_videos`` extracts from it.

    The browser and page are stored in module globals because ``get_videos``
    reads ``page``. The browser is closed before this function returns, so
    afterwards those globals refer to a closed browser.
    '''
    global browser, page
    import pyppeteer
    from pyppeteer import launch
    from pyppeteer_stealth import stealth

    # Swap pyppeteer's own revision string in the executable path for the
    # revision this script pins.
    chromium_path = pyppeteer.executablePath().replace(
        pyppeteer.__chromium_revision__, '1052357')
    browser = await launch(
        {'headless': True, 'executablePath': chromium_path})
    try:
        page = await browser.newPage()
        await stealth(page)
        await page.goto(url_for_youtube_search(search_string))
        return await get_videos()
    finally:
        # Always shut Chromium down so repeated calls (or a failed
        # navigation) do not leave browser processes behind.
        await browser.close()
# run the script | |
if __name__ == "__main__":
    # Command-line entry point: parse --skip-install, -s/--search and
    # -f/--format, run the search, then print the results.
    import argparse
    import asyncio
    import csv
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--skip-install', action='store_true',
                        help='skip installing dependencies')
    parser.add_argument('-s', '--search',
                        help='the query to search youtube with', required=True)
    # format json, jsonl, csv, or youtube-dl compatible text
    parser.add_argument('-f', '--format', choices=['json', 'jsonl', 'youtube-dl', 'csv'],
                        help='the format to print the results in', default='youtube-dl')
    args = parser.parse_args()

    if not args.skip_install:
        install()

    # asyncio.get_event_loop() with no current loop is deprecated and raises
    # on newer Python versions, so create and register the loop explicitly.
    # NOTE(review): the loop is left open on purpose in case pyppeteer's
    # exit-time cleanup still needs it — confirm before switching to
    # asyncio.run().
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    videos = loop.run_until_complete(search_youtube(args.search))

    if args.format == 'json':
        print(json.dumps(videos, indent=4))
    elif args.format == 'jsonl':
        for video in videos:
            print(json.dumps(video))
    elif args.format == 'youtube-dl':
        print(f'# Search results for "{args.search}"')
        for video in videos:
            print('# ' + json.dumps(video))
            # The scraper's error entry ({'error': ...}) has no 'url', and a
            # result may lack one too; emit only real URLs so the output
            # stays a valid URL list.
            url = video.get('url')
            if url:
                print(url)
        print('# end\n')
    elif args.format == 'csv':
        fields = ['title', 'url', 'length', 'views',
                  'date', 'channel_name', 'channel_url']
        writer = csv.writer(sys.stdout)
        writer.writerow(fields)
        for video in videos:
            # Missing fields become empty cells (csv writes None as '')
            # instead of raising KeyError.
            writer.writerow([video.get(field) for field in fields])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The code above does the following, explained in English: