Skip to content

Instantly share code, notes, and snippets.

@rachmadaniHaryono
Created January 16, 2019 06:10
Show Gist options
  • Save rachmadaniHaryono/96446c5578215c54ab9073399f2f567c to your computer and use it in GitHub Desktop.
Save rachmadaniHaryono/96446c5578215c54ab9073399f2f567c to your computer and use it in GitHub Desktop.
Parse a YouTube "Watch later" playlist from saved HTML or JSON
#!/usr/bin/env python
import re
import json
from bs4 import BeautifulSoup
def parse(html_file):
    """Parse the saved HTML source of a YouTube "Watch later" page.

    The page carries its data as a JavaScript assignment of the form
    ``window["ytInitialData"] = {...};`` inside a ``<script>`` element.
    This function finds that assignment, decodes the JSON object that
    follows it and extracts the playlist statistics and entries.

    Example::

        data = parse("youtube_wl.html")
        data['video_ids']

    Args:
        html_file: Path to the saved HTML file.

    Returns:
        A dict with the keys

        * ``json_data`` -- the decoded ``ytInitialData`` object,
        * ``stats`` -- the ``stats`` list of the sidebar's primary info
          renderer, e.g.
          ``[{'simpleText': '145 videos'}, {'simpleText': 'Updated today'}]``,
        * ``items`` -- the ``contents`` list of the playlist video list
          renderer,
        * ``video_ids`` -- the ``videoId`` of every entry, in page order.

    Raises:
        ValueError: If no script contains the ``ytInitialData`` assignment,
            or the text following it is not valid JSON.
        KeyError, IndexError: If the decoded data does not have the layout
            this function navigates.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which differs between systems. NOTE(review): assumes the page was
    # saved as UTF-8 -- confirm for files saved by unusual tools.
    with open(html_file, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    assignment = re.compile(r'window\["ytInitialData"\]\s*=\s*')
    decoder = json.JSONDecoder()
    for script in soup.select('script'):
        # Only scripts whose first child mentions 'simpleText' are candidates.
        if not script.contents or 'simpleText' not in script.contents[0]:
            continue
        text = str(script.contents[0])
        match = assignment.search(text)
        if match is None:
            continue
        # raw_decode stops at the end of the JSON value, so the trailing ';'
        # and any code after it on the same line cannot corrupt the result.
        js_d, _ = decoder.raw_decode(text, match.end())
        break
    else:
        raise ValueError(
            'no window["ytInitialData"] assignment found in %r' % (html_file,))

    items_stats = (
        js_d['sidebar']['playlistSidebarRenderer']['items'][0]
        ['playlistSidebarPrimaryInfoRenderer']['stats']
    )
    items = (
        js_d['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]
        ['tabRenderer']['content']['sectionListRenderer']['contents'][0]
        ['itemSectionRenderer']['contents'][0]
        ['playlistVideoListRenderer']['contents']
    )
    video_ids = [x['playlistVideoRenderer']['videoId'] for x in items]
    return {
        'json_data': js_d,
        'stats': items_stats,
        'items': items,
        'video_ids': video_ids,
    }
def parse_json(json_file):
    """Parse a saved YouTube "Watch later" continuation response (JSON).

    Example::

        data = parse_json("youtube_wl.json")
        data['video_ids']

    Example of a URL that returns such JSON:
    https://www.youtube.com/browse_ajax?ctoken=<str>&continuation=<str>&itct=<str>

    Args:
        json_file: Path to the saved JSON file. The top-level value must be
            a sequence whose second element holds the ``response`` object.

    Returns:
        A dict with the keys

        * ``json_data`` -- the complete decoded JSON document,
        * ``items`` -- the ``contents`` list of the playlist video list
          continuation,
        * ``video_ids`` -- the ``videoId`` of every entry, in file order.

    Raises:
        ValueError: If the file is not valid JSON.
        KeyError, IndexError: If the decoded data does not have the layout
            this function navigates.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(json_file, encoding='utf-8') as f:
        js_d = json.load(f)
    items = (
        js_d[1]['response']['continuationContents']
        ['playlistVideoListContinuation']['contents']
    )
    video_ids = [x['playlistVideoRenderer']['videoId'] for x in items]
    return {
        'json_data': js_d,
        'items': items,
        'video_ids': video_ids,
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment