Skip to content

Instantly share code, notes, and snippets.

@hiromu
Created May 25, 2023 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hiromu/4ffd45dba8ce18c8334f6c340267dae6 to your computer and use it in GitHub Desktop.
Save hiromu/4ffd45dba8ce18c8334f6c340267dae6 to your computer and use it in GitHub Desktop.
A Selenium-based script that downloads the videos listed in a tab-separated file.
#!/usr/bin/env python
import csv
import json
import pathlib
import sys
import time
import tempfile
import trio
from m3u8downloader.main import M3u8Downloader
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def _find_playlist_url(log_entries):
    """Return the URL of the first logged request for a playlist, if any.

    Args:
        log_entries: Entries as returned by ``driver.get_log('performance')``;
            each one carries a JSON-encoded event under its ``'message'`` key.

    Returns:
        The URL of the first ``Network.requestWillBeSent`` event whose request
        URL contains ``'playlist.m3u8'``, or ``None`` if this batch has none.
    """
    for raw_entry in log_entries:
        event = json.loads(raw_entry['message'])['message']
        if event['method'] != 'Network.requestWillBeSent':
            continue
        request_url = event['params']['request']['url']
        if 'playlist.m3u8' in request_url:
            return request_url
    return None


async def accessor(list_tsv, channel):
    """Open every page listed in *list_tsv* and send its playlist URL on *channel*.

    For each ``url<TAB>title`` row the page is loaded in Chrome, its ``<video>``
    elements are paused, and the browser's performance log is polled once per
    second until a request for a ``playlist.m3u8`` URL shows up.

    Args:
        list_tsv: Path of a tab-separated file with two columns, URL and title.
            Blank rows are skipped.
        channel: Send side of a Trio memory channel. For every row it receives
            ``(playlist_url, title)`` followed by ``(None, None)``. It is closed
            when this coroutine finishes so that the receiving side can stop.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
        selenium.common.exceptions.TimeoutException: If no ``<video>`` element
            appears within 10 seconds of loading a page.
    """
    options = Options()
    options.binary_location = '/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta'
    options.add_argument('--disable-headless-mode')
    options.add_argument(f'--user-data-dir={pathlib.Path(__file__).parent / "profile"}')
    # Request the performance log through the options object instead of
    # mutating the shared DesiredCapabilities.CHROME dict and passing it via
    # the deprecated ``desired_capabilities`` argument.
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    # Closing the send side on exit ends the receiver's ``async for`` loop;
    # without it the program never terminates after the last row.
    async with channel:
        driver = webdriver.Chrome(options=options)
        try:
            # newline='' is what the csv module expects from its input file.
            with open(list_tsv, newline='') as list_fp:
                for row in csv.reader(list_fp, delimiter='\t'):
                    if not row:
                        # A blank line yields an empty row; nothing to process.
                        continue
                    url, title = row
                    print('==> Start processing:', title)

                    driver.get(url)
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'video')))
                    driver.execute_script('Array.from(document.getElementsByTagName("video")).forEach(e => e.pause())')

                    # NOTE: there is no upper bound on this wait; a page that
                    # never requests a playlist keeps the loop polling forever.
                    while True:
                        # Yield to the Trio scheduler instead of blocking the
                        # whole event loop with time.sleep().
                        await trio.sleep(1)
                        playlist_url = _find_playlist_url(driver.get_log('performance'))
                        if playlist_url is not None:
                            break

                    await channel.send((playlist_url, title))
                    # The receiver ignores entries whose title is None.
                    # NOTE(review): presumably sent so this task waits for the
                    # receiver before loading the next page -- confirm.
                    await channel.send((None, None))
        finally:
            # Always shut the browser down, even when a page fails.
            driver.quit()
async def downloader(channel):
    """Download every playlist received from *channel*.

    Args:
        channel: Async-iterable receive side yielding ``(url, title)`` pairs.
            Pairs whose title is ``None`` carry no work and are ignored.

    Each remaining pair is handed to ``M3u8Downloader`` together with
    ``title + '.mp4'`` (presumably the output file name -- confirm against the
    library) and a fresh temporary directory that is removed afterwards.
    """
    async for playlist_url, title in channel:
        if title is not None:
            print('==> Start downloading:', title)
            # The scratch directory only lives for the duration of one download.
            with tempfile.TemporaryDirectory() as scratch_dir:
                job = M3u8Downloader(playlist_url, title + '.mp4', tempdir=scratch_dir, poolsize=2)
                job.start()
async def main(list_tsv):
    """Run the page accessor and the downloader side by side.

    Args:
        list_tsv: Path of the tab-separated list, passed through to ``accessor``.

    The two tasks are connected by a Trio memory channel with a buffer of one
    item: ``accessor`` gets the send side and ``downloader`` the receive side.
    The nursery block returns once both tasks have finished.
    """
    producer_end, consumer_end = trio.open_memory_channel(1)
    async with trio.open_nursery() as tasks:
        tasks.start_soon(accessor, list_tsv, producer_end)
        tasks.start_soon(downloader, consumer_end)
if __name__ == '__main__':
    # The path of the tab-separated list is the only, mandatory, argument.
    if len(sys.argv) < 2:
        # Usage text belongs on stderr so it never mixes with the progress
        # messages the script prints on stdout.
        print(f'{sys.argv[0]} list_tsv', file=sys.stderr)
        sys.exit(-1)
    trio.run(main, sys.argv[1])
async-generator==1.10
attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
m3u8downloader==0.10.1
outcome==1.2.0
PySocks==1.7.1
requests==2.31.0
selenium==4.9.1
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.0
trio-websocket==0.10.2
urllib3==2.0.2
wells==1.5.0
wsproto==1.2.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment