Created
April 5, 2018 02:30
-
-
Save tamr/515d6f6414cfc0dbaa572073ff0d623a to your computer and use it in GitHub Desktop.
YouTube comments scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import os | |
import sys | |
import time | |
import json | |
import requests | |
import argparse | |
import lxml.html | |
from lxml.cssselect import CSSSelector | |
# Page listing the comments of one video; `{youtube_id}` is filled in with
# str.format before the page is fetched.
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'

# Endpoint that receives the POST requests for further comment pages and for
# reply threads.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'

# Browser-like User-Agent header value sent with every request of the session.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.116 Safari/537.36'
)
def find_value(html, key, num_chars=2):
    """Return the double-quote-terminated value that follows ``key`` in ``html``.

    The value starts ``num_chars`` characters after the end of the first
    occurrence of ``key`` (skipping separators such as ``="``) and runs up
    to, but not including, the next ``"`` character.

    Args:
        html: Text to search.
        key: Marker string that precedes the value.
        num_chars: Number of characters to skip between the end of ``key``
            and the first character of the value.

    Returns:
        The extracted value. An empty string is returned when ``key`` does
        not occur in ``html``; when no closing ``"`` follows the value, the
        remainder of ``html`` is returned.
    """
    key_pos = html.find(key)
    if key_pos < 0:
        # str.find reports "not found" as -1; using that in the arithmetic
        # below would slice an unrelated part of the document and hand the
        # caller a truthy garbage token.
        return ''

    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end < 0:
        # No closing quote: a -1 end index would silently drop the last
        # character, so take everything that is left instead.
        return html[pos_begin:]
    return html[pos_begin:pos_end]
def extract_comments(html):
    """Generate one dict per ``.comment-item`` element found in ``html``.

    Every yielded dict has four keys: ``cid`` (the item's ``data-cid``
    attribute), ``text``, ``time`` (with surrounding whitespace stripped)
    and ``author``. The last three are the text content of the first element
    inside the item matching ``.comment-text-content``, ``.time`` and
    ``.user-name`` respectively.
    """
    document = lxml.html.fromstring(html)
    find_items = CSSSelector('.comment-item')
    find_text = CSSSelector('.comment-text-content')
    find_time = CSSSelector('.time')
    find_author = CSSSelector('.user-name')

    for node in find_items(document):
        comment_id = node.get('data-cid')
        body = find_text(node)[0].text_content()
        posted = find_time(node)[0].text_content().strip()
        author = find_author(node)[0].text_content()
        yield {'cid': comment_id,
               'text': body,
               'time': posted,
               'author': author}
def extract_reply_cids(html):
    """Return the ``data-cid`` attribute of every "load replies" element.

    The elements are those matching
    ``.comment-replies-header > .load-comments`` in ``html``; the result is a
    list in document order (an entry is ``None`` if the attribute is absent).
    """
    document = lxml.html.fromstring(html)
    find_reply_links = CSSSelector('.comment-replies-header > .load-comments')

    cids = []
    for link in find_reply_links(document):
        cids.append(link.get('data-cid'))
    return cids
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to ``url`` and return the decoded comment payload, retrying on failure.

    Args:
        session: Object with a ``post(url, params=..., data=...)`` method
            whose result exposes ``status_code`` and ``text`` (in this file,
            a ``requests.Session``).
        url: Address to POST to.
        params: Query-string parameters for the request.
        data: Form fields for the request body.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two attempts.

    Returns:
        A ``(page_token, html_content)`` tuple taken from the JSON body of
        the first response with status 200; ``page_token`` is ``None`` when
        the body has no such key. ``None`` is returned if no attempt
        produced a 200 response.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        KeyError: If the JSON body lacks ``html_content``.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token'), response_dict['html_content']
        # Only wait when another attempt will actually follow; sleeping after
        # the final failure just delays reporting it to the caller.
        if attempt < retries - 1:
            time.sleep(sleep)
    return None
def download_comments(youtube_id, sleep=1):
    """Yield the comments of a video, followed by their replies.

    The comments page is fetched first and its comments are yielded. Further
    pages are then requested from the AJAX endpoint for as long as a page
    token is available, and finally the reply thread of every collected
    reply id is requested. Comments whose ``cid`` was already yielded are
    skipped in the AJAX phases.

    Args:
        youtube_id: ID of the video whose comments are downloaded.
        sleep: Seconds to pause after each AJAX request.

    Yields:
        Comment dicts as produced by ``extract_comments``.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # A set keeps the "already yielded?" check constant-time; it is only ever
    # used for membership tests, so ordering does not matter.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request asks for the ordering menu instead of sending a
        # page token; every later request sends the token from the
        # previous response.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            # All retries failed: stop paging.
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            # All retries failed: give up on the remaining reply threads.
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
def main(argv):
    """Command-line entry point: download a video's comments to a file.

    ``argv`` is the list of command-line arguments without the program name.
    Each downloaded comment is written to the output file as one JSON object
    per line while a running count is shown on stdout. Any exception is
    reported as ``Error: ...`` on stdout and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS,
                        help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y',
                        help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o',
                        help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--limit', '-l', type=int,
                        help='Limit the number of comments')

    try:
        options = parser.parse_args(argv)
        video_id = options.youtubeid
        out_path = options.output
        max_comments = options.limit

        # Both the video ID and the output file are mandatory.
        if not (video_id and out_path):
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', video_id)
        with open(out_path, 'w') as out_file:
            for done, comment in enumerate(download_comments(video_id), 1):
                print(json.dumps(comment), file=out_file)
                # '\r' keeps the progress counter on a single console line.
                sys.stdout.write('Downloaded %d comment(s)\r' % done)
                sys.stdout.flush()
                if max_comments and done >= max_comments:
                    break
        print('\nDone!')

    except Exception as err:
        print('Error:', str(err))
        sys.exit(1)
# Run the command-line interface only when executed as a script; the program
# name in sys.argv[0] is not passed on to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment