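"""Download the comments of a YouTube video without using the YouTube API,
write them out as line-delimited JSON, and count how many comments contain
each of the whole words 'A' through 'E'.

Example invocation (the script name and video ID below are placeholders):

    python youtube_comments.py --youtubeid dQw4w9WgXcQ --output comments.json --limit 1000
"""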
import sys
import time
import json
import re
import argparse

import requests
import lxml.html
from lxml.cssselect import CSSSelector


def findWholeWord(w):
    # Build a case-insensitive searcher that matches `w` only as a whole word.
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
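# For example, findWholeWord('cat')('The cat sat.') returns a match, while
# findWholeWord('cat')('concatenate') returns None, because \b anchors the
# pattern to word boundaries.
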
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    # Extract the quoted value that follows `key` in the raw page HTML.
    # `num_chars` skips the characters between the key and the opening quote.
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin:pos_end]
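# For example, find_value('... "XSRF_TOKEN": "abc123" ...', 'XSRF_TOKEN', 4)
# returns 'abc123': the offset of 4 skips the '": "' between key and value.
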
def extract_comments(html):
    # Parse a block of comment HTML and yield one dict per comment.
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')

    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}
def extract_reply_cids(html):
    # Collect the ids of comments that have a 'View all X replies' link.
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]
def ajax_request(session, url, params, data, retries=10, sleep=20):
    # POST to the comments AJAX endpoint, retrying on non-200 responses.
    # Returns (page_token, html_content), or None once retries are exhausted.
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)
def download_comments(youtube_id, sleep=1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response
        reply_cids += extract_reply_cids(html)

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        time.sleep(sleep)
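# A minimal sketch of consuming the generator directly (the video ID below is
# a placeholder); main() below drives it the same way:
#
#     for i, comment in enumerate(download_comments('dQw4w9WgXcQ')):
#         print(comment['author'], '->', comment['text'][:60])
#         if i >= 9:  # stop after ten comments
#             break
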
def main(argv):
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--limit', '-l', type=int, help='Limit the number of comments')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output
        limit = args.limit

        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        A = B = C = D = E = 0

        with open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                # Drop non-ASCII characters so the whole-word regexes see plain text.
                text = comment['text'].encode('ascii', 'ignore').decode('ascii')

                if findWholeWord('A')(text) is not None:
                    A += 1
                if findWholeWord('B')(text) is not None:
                    B += 1
                if findWholeWord('C')(text) is not None:
                    C += 1
                if findWholeWord('D')(text) is not None:
                    D += 1
                if findWholeWord('E')(text) is not None:
                    E += 1

                # One JSON object per line, as the --output help text describes.
                fp.write(json.dumps(comment) + '\n')

                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break

        print('\nDone!\n')
        print(A, B, C, D, E)  # 4351 7137 7692 2574 6425 when I did it

    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)


if __name__ == "__main__":
    main(sys.argv[1:])