knzm/get_timedtext.py

## get_timedtext.py
import re
import operator
import json

import requests
import lxml.etree
import lxml.html


def collect_global_vars(doc):
    """Collect ``window["NAME"] = <JSON>`` assignments from inline scripts.

    Every ``<script>`` under ``<body>`` whose stripped text starts with
    ``window["`` is scanned for assignments of that form, and each assigned
    value is decoded as JSON.

    Args:
        doc: Parsed HTML document exposing ``cssselect()``; the selected
            elements must have a ``text`` attribute.

    Returns:
        dict mapping each assigned name to its decoded value.  When a name
        is assigned more than once, the last assignment wins.

    Raises:
        ValueError: If anything other than an optional ``;`` and whitespace
            separates a decoded value from the next assignment in the same
            script, or (raised by the JSON decoder) if an assigned value is
            not valid JSON.
    """
    # Loop-invariant helpers, built once instead of once per assignment.
    assignment_re = re.compile(r'window\["(.*?)"\]\s*=\s*')
    separator_re = re.compile(r'^;?\s*')
    decoder = json.JSONDecoder()
    namespace = {}

    for script in doc.cssselect('body script'):
        if script.text is None:
            continue
        src = script.text.strip()
        if not src.startswith('window["'):
            continue

        matches = list(assignment_re.finditer(src))
        # An assignment's text extends to the start of the next assignment,
        # or to the end of the script for the last one.
        ends = [match.start() for match in matches[1:]] + [len(src)]

        chunks = []
        for match, end in zip(matches, ends):
            value_text = src[match.end():end]
            # raw_decode() stops at the end of the JSON value instead of
            # rejecting the text that follows it.
            obj, pos = decoder.raw_decode(value_text)
            rest = separator_re.sub('', value_text[pos:])
            chunks.append((match.group(1), obj, rest))

        # Only the last assignment may be followed by other code; leftover
        # text anywhere else means the script does not have the expected
        # shape.
        for name, _, rest in chunks[:-1]:
            if rest:
                raise ValueError(
                    'unexpected text after window[{!r}] value: {!r}'.format(
                        name, rest[:40]))

        for name, obj, _ in chunks:
            namespace[name] = obj

    return namespace


def find_track(tracks, lang):
    """Pick the caption track for ``lang``, preferring non-generated ones.

    A track counts as generated when its ``kind`` entry equals ``'asr'``.
    Among equally preferred tracks the earliest one in ``tracks`` is chosen.

    Args:
        tracks: Iterable of track dicts, each with a ``languageCode`` key.
        lang: Language code that ``languageCode`` must equal.

    Returns:
        The selected track dict, or ``None`` when no track matches ``lang``.
    """
    matching = [entry for entry in tracks if entry['languageCode'] == lang]
    if not matching:
        return None

    def generated(entry):
        return entry.get('kind', '') == 'asr'

    # False orders before True, and min() returns the first minimal element,
    # so the earliest non-generated track wins when there is one.
    return min(matching, key=generated)


def main(video_id, lang='ja'):
    """Print the caption text of a YouTube video, one element per line.

    Fetches the watch page for ``video_id``, reads the caption track list
    from the page's ``ytInitialPlayerResponse`` variable, downloads the
    track selected by ``find_track()`` and prints the text of every
    ``/transcript/text`` element.  Elements without text are skipped.

    Args:
        video_id: Value sent as the ``v`` query parameter of the watch URL.
        lang: Language code of the caption track to print.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        KeyError: If the page exposes no caption track list.
        LookupError: If no caption track exists for ``lang``.
    """
    response = requests.get(
        'https://www.youtube.com/watch', params={'v': video_id})
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content.decode('utf-8'))

    namespace = collect_global_vars(doc)
    captions = namespace['ytInitialPlayerResponse']['captions']
    tracks = captions['playerCaptionsTracklistRenderer']['captionTracks']

    track = find_track(tracks, lang)
    if track is None:
        # Fail with a clear message instead of a TypeError on track[...].
        raise LookupError(
            'no caption track for language {!r}'.format(lang))

    response = requests.get(track['baseUrl'])
    response.raise_for_status()
    transcript = lxml.etree.fromstring(response.content)
    for node in transcript.xpath('/transcript/text'):
        # An element with no text has text None; do not print "None".
        if node.text is not None:
            print(node.text)


if __name__ == '__main__':
    import sys

    # main() takes a video id plus an optional language code; reject any
    # other argument count with a usage message instead of a TypeError.
    if not 2 <= len(sys.argv) <= 3:
        sys.exit('usage: {} VIDEO_ID [LANG]'.format(sys.argv[0]))
    main(*sys.argv[1:])
	import re
	import operator
	import json

	import requests
	import lxml.etree
	import lxml.html


	def collect_global_vars(doc):
		"""Collect ``window["NAME"] = <JSON>`` assignments from inline scripts.

		Every ``<script>`` under ``<body>`` whose stripped text starts with
		``window["`` is scanned for assignments of that form, and each
		assigned value is decoded as JSON.

		Args:
			doc: Parsed HTML document exposing ``cssselect()``; the selected
				elements must have a ``text`` attribute.

		Returns:
			dict mapping each assigned name to its decoded value.  When a
			name is assigned more than once, the last assignment wins.

		Raises:
			ValueError: If anything other than an optional ``;`` and
				whitespace separates a decoded value from the next
				assignment in the same script, or (raised by the JSON
				decoder) if an assigned value is not valid JSON.
		"""
		# The name must be matched lazily with ``.*?`` and the ``=`` may be
		# surrounded by any amount of whitespace (``\s*``).
		assignment_re = re.compile(r'window\["(.*?)"\]\s*=\s*')
		separator_re = re.compile(r'^;?\s*')
		decoder = json.JSONDecoder()
		namespace = {}

		for script in doc.cssselect('body script'):
			if script.text is None:
				continue
			src = script.text.strip()
			if not src.startswith('window["'):
				continue

			matches = list(assignment_re.finditer(src))
			# An assignment's text extends to the start of the next
			# assignment, or to the end of the script for the last one.
			ends = [match.start() for match in matches[1:]] + [len(src)]

			chunks = []
			for match, end in zip(matches, ends):
				value_text = src[match.end():end]
				# raw_decode() stops at the end of the JSON value instead
				# of rejecting the text that follows it.
				obj, pos = decoder.raw_decode(value_text)
				rest = separator_re.sub('', value_text[pos:])
				chunks.append((match.group(1), obj, rest))

			# Only the last assignment may be followed by other code.
			for name, _, rest in chunks[:-1]:
				if rest:
					raise ValueError(
						'unexpected text after window[{!r}] value: {!r}'.format(
							name, rest[:40]))

			for name, obj, _ in chunks:
				namespace[name] = obj

		return namespace


	def find_track(tracks, lang):
		"""Pick the caption track for ``lang``, preferring non-generated ones.

		A track counts as generated when its ``kind`` entry equals ``'asr'``.
		Among equally preferred tracks the earliest one in ``tracks`` wins.

		Args:
			tracks: Iterable of track dicts, each with a ``languageCode`` key.
			lang: Language code that ``languageCode`` must equal.

		Returns:
			The selected track dict, or ``None`` when no track matches.
		"""
		candidates = []
		for track in tracks:
			if track['languageCode'] != lang:
				continue
			is_generated = track.get('kind', '') == 'asr'
			candidates.append((is_generated, track))
		if not candidates:
			return None
		# The sort is stable and keyed on the flag only, so the earliest
		# non-generated track comes first when there is one.
		return sorted(candidates, key=operator.itemgetter(0))[0][1]


	def main(video_id, lang='ja'):
		"""Print the caption text of a YouTube video, one element per line.

		Fetches the watch page for ``video_id``, reads the caption track
		list from the page's ``ytInitialPlayerResponse`` variable, downloads
		the track selected by ``find_track()`` and prints the text of every
		``/transcript/text`` element.  Elements without text are skipped.

		Args:
			video_id: Value sent as the ``v`` query parameter of the watch URL.
			lang: Language code of the caption track to print.

		Raises:
			requests.HTTPError: If either HTTP request returns an error status.
			KeyError: If the page exposes no caption track list.
			LookupError: If no caption track exists for ``lang``.
		"""
		response = requests.get(
			'https://www.youtube.com/watch', params={'v': video_id})
		response.raise_for_status()
		doc = lxml.html.fromstring(response.content.decode('utf-8'))

		namespace = collect_global_vars(doc)
		captions = namespace['ytInitialPlayerResponse']['captions']
		tracks = captions['playerCaptionsTracklistRenderer']['captionTracks']

		track = find_track(tracks, lang)
		if track is None:
			# Fail with a clear message instead of a TypeError on track[...].
			raise LookupError(
				'no caption track for language {!r}'.format(lang))

		response = requests.get(track['baseUrl'])
		response.raise_for_status()
		transcript = lxml.etree.fromstring(response.content)
		for node in transcript.xpath('/transcript/text'):
			# An element with no text has text None; do not print "None".
			if node.text is not None:
				print(node.text)


	if __name__ == '__main__':
		import sys

		# main() takes a video id plus an optional language code; reject
		# any other argument count with a usage message.
		if not 2 <= len(sys.argv) <= 3:
			sys.exit('usage: {} VIDEO_ID [LANG]'.format(sys.argv[0]))
		main(*sys.argv[1:])