Use you-get to download a YouTube playlist
"""
Extract the urls from YouTube playlist page html file.
usage:
$ python extract_urls.py PLAYLIST.html OUTPUT.txt
"""
import re
import sys
input_html = sys.argv[1]
output = sys.argv[2]
with open(input_html) as f:
html_text = f.read()
pattern = "\"url\":\"(/watch.*?)\""
def process_url(url):
url_trimed = url.split("\\u")[0]
base_url = "https://www.youtube.com"
return base_url + url_trimed
with open(output, 'w') as fo:
for match in re.finditer(pattern, html_text):
url = match.groups()[0]
url = process_url(url)
fo.write("{}\n".format(url))
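
As a quick sanity check of the regex and the trimming in process_url, the short sketch below runs the same logic on a hand-made fragment; the fragment is an assumption about how watch links appear in the saved page source, not text copied from a real playlist.

import re

# Hypothetical sample fragment -- an assumed example of an embedded watch
# link, not taken from a real YouTube page.
sample = '{"url":"/watch?v=abc123XYZ_-\\u0026list=PL0000000000\\u0026index=1"}'
pattern = r'"url":"(/watch.*?)"'

for m in re.finditer(pattern, sample):
    raw = m.group(1)               # "/watch?v=abc123XYZ_-\u0026list=..."
    trimmed = raw.split("\\u")[0]  # "/watch?v=abc123XYZ_-"
    print("https://www.youtube.com" + trimmed)
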
"""
Download urls with you-get.
Use asyncio and subprocess implement unblock download.
usage:
$ python youget_urls.py URL_LIST.txt
"""
MAX_CONTINUES = 5
import os
import sys
import subprocess
import asyncio
from queue import Queue
def load_urls(path):
with open(path) as f:
urls = [line.strip() for line in f]
urls_queue = Queue()
for url in urls:
urls_queue.put(url)
return urls_queue
COOKIE = "./cookie.json"
async def download_url(url):
env = os.environ.copy()
cmd_args = ["you-get", "-c", COOKIE, "{}".format(url)]
process = await asyncio.create_subprocess_exec(
*cmd_args,
stdout=asyncio.subprocess.PIPE,
env=env,
)
print(" ".join(cmd_args))
stdout, stderr = await process.communicate()
out = stdout.decode().strip()
res = (process.returncode, out, None)
return res
async def download_url_until_all_success(url_queue):
if url_queue.empty():
# queue is empty
return
else:
url = url_queue.get()
ret_code, out, err = await download_url(url)
if ret_code == 0:
# success
await download_url_until_all_success(url_queue)
else:
# failed
url_queue.put(url)
await download_url_until_all_success(url_queue)
if __name__ == "__main__":
input_file = sys.argv[1]
url_queue = load_urls(input_file)
loop = asyncio.get_event_loop()
tasks = [download_url_until_all_success(url_queue) for _ in range(MAX_CONTINUES)]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
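
Putting the two steps together, the commands below just follow the usage lines in the docstrings; urls.txt is an arbitrary name for the intermediate URL list, and cookie.json must be present in the working directory (see COOKIE above).

    $ python extract_urls.py PLAYLIST.html urls.txt
    $ python youget_urls.py urls.txt

Because all workers run in a single thread and only hand off URLs between awaits, a plain queue.Queue is sufficient here; the program exits once every URL in the list has been downloaded successfully.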