YouTube caption scraper: finds trending US channels whose videos have English captions
import httplib2
import sys
import urllib2
# 1. To set this up, you need Python 2.7 and pip.
# 2. Run "pip install --upgrade google-api-python-client" to get the dependencies.
# 3. Go to console.cloud.google.com and create a new project.
# 4. Go to "APIs & Services" and generate an OAuth client ID. Download the JSON
#    it gives you and place it in the same folder as this script, named
#    "client_secrets.json" (a sketch of its expected layout follows these steps).
# 5. Go to https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest and save the page as a JSON file.
# 6. Rename that file "youtube-v3-api-captions.json" and put it in the same folder as this script.
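# For reference, the client_secrets.json that the Cloud Console hands out for
# an installed application should look roughly like the sketch below; the
# values here are placeholders, not real credentials.
#
# {
#   "installed": {
#     "client_id": "YOUR_CLIENT_ID.apps.googleusercontent.com",
#     "client_secret": "YOUR_CLIENT_SECRET",
#     "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#     "token_uri": "https://accounts.google.com/o/oauth2/token",
#     "redirect_uris": ["urn:ietf:wg:oauth:2.0:oob", "http://localhost"]
#   }
# }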
from apiclient.discovery import build_from_document
from apiclient.errors import HttpError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import argparser, run_flow

CLIENT_SECRETS_FILE = "client_secrets.json"
YOUTUBE_READ_WRITE_SSL_SCOPE = "https://www.googleapis.com/auth/youtube.force-ssl"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
def get_authenticated_service(args):
    # Run the OAuth flow (or reuse cached credentials) and build an API client
    # from the locally saved discovery document.
    flow = flow_from_clientsecrets(
        CLIENT_SECRETS_FILE, scope=YOUTUBE_READ_WRITE_SSL_SCOPE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run_flow(flow, storage, args)
    with open("youtube-v3-api-captions.json", "r") as f:
        doc = f.read()
    return build_from_document(
        doc, http=credentials.authorize(httplib2.Http()))
def remove_empty_kwargs(**kwargs):
    # Drop falsy values so optional API parameters aren't sent as empty strings.
    return {key: value for key, value in kwargs.iteritems() if value}
def video_categories_list(client, **kwargs):
    # Return (id, title) pairs for every video category in the region.
    kwargs = remove_empty_kwargs(**kwargs)
    response = client.videoCategories().list(**kwargs).execute()
    return [(i["id"], i["snippet"]["title"]) for i in response["items"]]
def videos_list_most_popular(client, **kwargs):
    # Return (videoId, channelId) pairs from the most-popular chart.
    kwargs = remove_empty_kwargs(**kwargs)
    response = client.videos().list(**kwargs).execute()
    return [(i["id"], i["snippet"]["channelId"]) for i in response["items"]]
def search(client, **kwargs):
    # Return the video ids from a search, skipping channel and playlist hits.
    kwargs = remove_empty_kwargs(**kwargs)
    response = client.search().list(**kwargs).execute()
    return [i["id"]["videoId"] for i in response["items"]
            if i["id"]["kind"] == "youtube#video"]
def check_captions(vidId):
    # The timedtext endpoint returns an XML transcript when English captions
    # exist and an empty body when they don't, so a non-empty response means
    # the video is captioned.
    site = "http://video.google.com/timedtext?lang=en&v="
    response = urllib2.urlopen(site + vidId)
    return len(response.read()) > 0
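# A quick sanity check of the endpoint above; the video id is a hypothetical
# placeholder, so swap in a real one before uncommenting.
#
#     print check_captions("SOME_VIDEO_ID")  # True iff English captions exist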
def get_trending_vids(client, thoroughness=10):
    # For each US video category (plus the uncategorized chart), pull the top
    # videos and record the channels of the ones that have English captions.
    categories = video_categories_list(client, part='snippet', regionCode='US')
    results = []
    for cat in categories + [('', "no category")]:
        try:
            top_results = videos_list_most_popular(
                client,
                part='snippet',
                chart='mostPopular',
                regionCode='US',
                videoCategoryId=cat[0],
                maxResults=thoroughness)
        except HttpError:
            print "Couldn't get videos in category:", cat[1]
            continue
        hits = 0
        for vid in top_results:
            if check_captions(vid[0]):
                results.append(vid[1])
                hits += 1
        print hits, '/', len(top_results), cat[1]
    with open("potential.txt", 'w') as f:
        for i in results:
            f.write(i + '\n')
def filter_channels(client, potential, thoroughness=10, threshold=.5):
    # Keep only the channels whose recent uploads have captions at least
    # `threshold` of the time.
    good_channels = []
    for channel in potential:
        vids = search(
            client,
            part='snippet',
            maxResults=thoroughness,
            channelId=channel.strip(),
            order="date")
        if not vids:
            continue
        hits = [check_captions(i) for i in vids].count(True)
        print hits, '/', len(vids)
        if float(hits) / len(vids) >= threshold:
            good_channels.append(channel)
    print len(good_channels), "of those had a good amount of captions"
    with open("results.txt", 'a') as f:
        for i in good_channels:
            f.write(i)
youtube = get_authenticated_service(argparser.parse_args())
get_trending_vids(youtube, 10)
with open("potential.txt") as f:
    potential = set(f.readlines())
print "found", len(potential), "potential channels..."
filter_channels(youtube, potential, 10, .5)
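# Usage sketch: running the script kicks off the oauth2client flow, which
# opens a browser window the first time and caches credentials next to the
# script; the filename below is just whatever you saved this gist as.
#
#     python scraper.py
#
# Channels whose trending videos had captions land in potential.txt, and the
# filtered survivors are appended to results.txt.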