Last active
May 22, 2018 22:05
-
-
Save aadibajpai/c524da5a938f86a15cf57a9327598dd7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import time | |
import json | |
import requests | |
import argparse | |
import lxml.html | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from lxml.cssselect import CSSSelector | |
import re | |
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for *w*.

    Returns the bound ``search`` method of a compiled pattern, so
    ``findWholeWord('cat')(text)`` gives a match object only when *w*
    occurs as a complete word in *text*, else ``None``.
    """
    pattern = re.compile(r'\b(' + w + r')\b', flags=re.IGNORECASE)
    return pattern.search
# Page listing all comments for a video (pre-Polymer YouTube layout).
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# AJAX endpoint behind the 'Show more' / 'View all replies' buttons.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
# Desktop-browser UA string so YouTube serves the classic HTML pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    """Extract the double-quoted value that follows *key* in *html*.

    *num_chars* is the number of characters (e.g. ``="``) between the
    end of *key* and the first character of the value.  Returns the
    substring up to, but not including, the next double quote.
    """
    start = html.find(key) + len(key) + num_chars
    end = html.find('"', start)
    return html[start:end]
def extract_comments(html):
    """Yield one dict per comment found in the given HTML fragment.

    Each dict carries the comment id (``cid``), the comment text, the
    relative timestamp string, and the author's display name.
    """
    root = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    field_sels = {
        'text': CSSSelector('.comment-text-content'),
        'time': CSSSelector('.time'),
        'author': CSSSelector('.user-name'),
    }
    for node in item_sel(root):
        yield {
            'cid': node.get('data-cid'),
            'text': field_sels['text'](node)[0].text_content(),
            'time': field_sels['time'](node)[0].text_content().strip(),
            'author': field_sels['author'](node)[0].text_content(),
        }
def extract_reply_cids(html):
    """Return the ids of comments that have a collapsed 'load replies' link."""
    root = lxml.html.fromstring(html)
    link_sel = CSSSelector('.comment-replies-header > .load-comments')
    cids = []
    for link in link_sel(root):
        cids.append(link.get('data-cid'))
    return cids
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to the comment AJAX endpoint, retrying on non-200 responses.

    Returns a ``(page_token, html_content)`` tuple on success, where
    ``page_token`` may be ``None`` when no further pages exist.  Returns
    ``None`` once all *retries* attempts have failed; callers treat any
    falsy result as failure.
    """
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            # Use requests' built-in JSON decoding rather than
            # json.loads(response.text): it honours the declared
            # response encoding.
            response_dict = response.json()
            return response_dict.get('page_token', None), response_dict['html_content']
        # Back off before retrying.
        time.sleep(sleep)
    # All retries exhausted: make the implicit None explicit.
    return None
def download_comments(youtube_id, sleep=1):
    """Yield every comment dict (top-level, then replies) for *youtube_id*.

    Scrapes the pre-Polymer YouTube pages: the initial ``all_comments``
    page, then the 'Show more' AJAX pages, then each 'View all X
    replies' thread.  *sleep* (seconds) throttles successive requests.
    Comments are de-duplicated by cid across all pages.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # A set (the original used a list) makes the per-comment membership
    # test O(1) instead of O(n); yield order and content are unchanged.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True
    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}
        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        page_token, html = response
        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}
        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        _, html = response
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
def main(argv):
    """Parse CLI args, stream comments, and count whole-word letter hits.

    Counts how many downloaded comments contain each of the words
    'A'..'E' (case-insensitive, whole-word) and prints the five totals.
    Exits with status 1 on any error.

    NOTE(review): ``output`` is required and validated but never
    written to — the original never saved comments to the file; verify
    against the gist's intent before relying on the -o flag.
    """
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')

    try:
        args = parser.parse_args(argv)
        youtube_id = args.youtubeid
        output = args.output
        limit = args.limit
        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        letters = ('A', 'B', 'C', 'D', 'E')
        # Compile each whole-word searcher once, outside the download
        # loop: the original rebuilt all five regexes per comment.
        searchers = {letter: findWholeWord(letter) for letter in letters}
        counts = dict.fromkeys(letters, 0)
        count = 0
        for comment in download_comments(youtube_id):
            # Strip non-ASCII characters before matching (as original).
            text = comment['text'].encode('ascii', 'ignore').decode('utf-8')
            for letter in letters:
                if searchers[letter](text) is not None:
                    counts[letter] += 1
            count += 1
            sys.stdout.write('Downloaded %d comment(s)\r' % count)
            sys.stdout.flush()
            if limit and count >= limit:
                break
        print('\nDone!\n')
        # Same space-separated five-number line as the original A..E prints.
        print(*(counts[letter] for letter in letters))  # 4351 7137 7692 2574 6425 when I did it
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)
if __name__ == "__main__": | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment