Skip to content

Instantly share code, notes, and snippets.

@knzm
Last active November 26, 2020 00:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save knzm/b219fdcff46669211b8e9266e2210bc6 to your computer and use it in GitHub Desktop.
Save knzm/b219fdcff46669211b8e9266e2210bc6 to your computer and use it in GitHub Desktop.
import re
import operator
import json
import requests
import lxml.etree
import lxml.html
def collect_global_vars(doc):
    """Collect ``window["NAME"] = <JSON>`` assignments from inline scripts.

    Every element returned by ``doc.cssselect('body script')`` whose
    stripped text starts with ``window["`` is scanned.  Starting at the
    first assignment found in that text, consecutive assignments
    (optionally separated by ``;`` and whitespace) are decoded with the
    JSON decoder.  Anything left after the last assignment of a script is
    ignored.

    Args:
        doc: Object providing ``cssselect(selector)``; the returned
            elements must expose a ``text`` attribute (``None`` or ``str``).

    Returns:
        dict mapping each assigned NAME to its decoded JSON value.  When a
        name is assigned more than once, the last value wins.

    Raises:
        ValueError: If non-assignment code sits between two assignments of
            the same script, or (as ``json.JSONDecodeError``, a subclass
            of ``ValueError``) if an assigned value is not valid JSON.
    """
    assign_re = re.compile(r'window\["([^"]*)"\]\s*=\s*')
    separator_re = re.compile(r';?\s*')
    decoder = json.JSONDecoder()

    namespace = {}
    for script in doc.cssselect('body script'):
        if script.text is None:
            continue
        src = script.text.strip()
        if not src.startswith('window["'):
            continue

        m = assign_re.search(src)
        while m is not None:
            # Decode the value *before* looking for the next assignment, so
            # text inside a JSON value is never mistaken for an assignment.
            obj, pos = decoder.raw_decode(src, m.end())
            namespace[m.group(1)] = obj

            # The separator pattern can match the empty string, so this
            # match always succeeds.
            pos = separator_re.match(src, pos).end()
            m = assign_re.match(src, pos)
            if m is None and assign_re.search(src, pos) is not None:
                raise ValueError(
                    'unexpected code between window[...] assignments: %r'
                    % src[pos:pos + 40])
    return namespace
def find_track(tracks, lang):
    """Pick the caption track for *lang*, preferring non-generated ones.

    A track counts as generated when its ``'kind'`` value is ``'asr'``.

    Args:
        tracks: Iterable of dict-like track descriptions.  Tracks without
            a ``'languageCode'`` entry are skipped.
        lang: Language code to compare against ``track['languageCode']``.

    Returns:
        The first matching track whose kind is not ``'asr'``; otherwise
        the first matching ``'asr'`` track; ``None`` when no track matches.
    """
    fallback = None
    for track in tracks:
        if track.get('languageCode') != lang:
            continue
        if track.get('kind', '') != 'asr':
            # First non-generated match wins outright.
            return track
        if fallback is None:
            fallback = track
    return fallback
def main(video_id, lang='ja'):
    """Print the text of each caption node of a YouTube video.

    Fetches the watch page for *video_id*, reads the caption track list
    from the page's ``ytInitialPlayerResponse`` global, downloads the track
    chosen by ``find_track`` for *lang*, and prints the text of every
    ``/transcript/text`` node of the returned XML, one per line.

    Args:
        video_id: Value sent as the ``v`` query parameter of the watch URL.
        lang: Language code of the caption track to print.

    Raises:
        KeyError: If the page does not expose any caption track list.
        LookupError: If no caption track exists for *lang*.
        requests.HTTPError: If either HTTP request returns an error status.
    """
    # Seconds. Without an explicit timeout, requests can wait indefinitely.
    timeout = 30

    r = requests.get('https://www.youtube.com/watch',
                     params={'v': video_id}, timeout=timeout)
    r.raise_for_status()
    doc = lxml.html.fromstring(r.content.decode('utf-8'))
    namespace = collect_global_vars(doc)

    try:
        captions = namespace['ytInitialPlayerResponse']['captions']
        tracks = captions['playerCaptionsTracklistRenderer']['captionTracks']
    except KeyError:
        raise KeyError('no caption tracks found for video %r' % (video_id,))

    track = find_track(tracks, lang)
    if track is None:
        raise LookupError(
            'no %r caption track for video %r' % (lang, video_id))

    r = requests.get(track['baseUrl'], timeout=timeout)
    r.raise_for_status()
    doc = lxml.etree.fromstring(r.content)
    for node in doc.xpath('/transcript/text'):
        # Nodes without text would otherwise be printed as "None".
        if node.text is not None:
            print(node.text)
# Command-line entry point: VIDEO_ID is required, LANG is optional.
if __name__ == '__main__':
    import sys

    # Reject a wrong argument count with a usage message instead of letting
    # main() fail with a TypeError traceback.
    if not 2 <= len(sys.argv) <= 3:
        sys.exit('usage: %s VIDEO_ID [LANG]' % sys.argv[0])
    main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment