Use you-get to download a YouTube playlist
"""
Extract the urls from YouTube playlist page html file.
usage:
$ python extract_urls.py PLAYLIST.html OUTPUT.txt
"""
import re
import sys
input_html = sys.argv[1]
output = sys.argv[2]
with open(input_html) as f:
html_text = f.read()
pattern = "\"url\":\"(/watch.*?)\""
def process_url(url):
url_trimed = url.split("\\u")[0]
base_url = "https://www.youtube.com"
return base_url + url_trimed
with open(output, 'w') as fo:
for match in re.finditer(pattern, html_text):
url = match.groups()[0]
url = process_url(url)
fo.write("{}\n".format(url))
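
As a quick sanity check of the regex and the trimming in process_url, the short sketch below runs the same logic on a hand-made fragment; the fragment is an assumption about how watch links appear in the saved page source, not text copied from a real playlist.

import re

# Hypothetical sample fragment -- an assumed example of an embedded watch
# link, not taken from a real YouTube page.
sample = '{"url":"/watch?v=abc123XYZ_-\\u0026list=PL0000000000\\u0026index=1"}'
pattern = r'"url":"(/watch.*?)"'

for m in re.finditer(pattern, sample):
    raw = m.group(1)               # "/watch?v=abc123XYZ_-\u0026list=..."
    trimmed = raw.split("\\u")[0]  # "/watch?v=abc123XYZ_-"
    print("https://www.youtube.com" + trimmed)
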
"""
Download urls with you-get.
Use asyncio and subprocess implement unblock download.
usage:
$ python youget_urls.py URL_LIST.txt
"""
MAX_CONTINUES = 5
import os
import sys
import subprocess
import asyncio
from queue import Queue
def load_urls(path):
with open(path) as f:
urls = [line.strip() for line in f]
urls_queue = Queue()
for url in urls:
urls_queue.put(url)
return urls_queue
COOKIE = "./cookie.json"
async def download_url(url):
env = os.environ.copy()
cmd_args = ["you-get", "-c", COOKIE, "{}".format(url)]
process = await asyncio.create_subprocess_exec(
*cmd_args,
stdout=asyncio.subprocess.PIPE,
env=env,
)
print(" ".join(cmd_args))
stdout, stderr = await process.communicate()
out = stdout.decode().strip()
res = (process.returncode, out, None)
return res
async def download_url_until_all_success(url_queue):
if url_queue.empty():
# queue is empty
return
else:
url = url_queue.get()
ret_code, out, err = await download_url(url)
if ret_code == 0:
# success
await download_url_until_all_success(url_queue)
else:
# failed
url_queue.put(url)
await download_url_until_all_success(url_queue)
if __name__ == "__main__":
input_file = sys.argv[1]
url_queue = load_urls(input_file)
loop = asyncio.get_event_loop()
tasks = [download_url_until_all_success(url_queue) for _ in range(MAX_CONTINUES)]
tasks = asyncio.gather(*tasks)
loop.run_until_complete(tasks)
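
Putting the two steps together, the commands below just follow the usage lines in the docstrings; urls.txt is an arbitrary name for the intermediate URL list, and cookie.json must be present in the working directory (see COOKIE above).

    $ python extract_urls.py PLAYLIST.html urls.txt
    $ python youget_urls.py urls.txt

Because all workers run in a single thread and only hand off URLs between awaits, a plain queue.Queue is sufficient here; the program exits once every URL in the list has been downloaded successfully.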