Skip to content

Instantly share code, notes, and snippets.

@P1n3appl3
Last active November 11, 2017 05:58
Show Gist options
  • Save P1n3appl3/f44ec566ccb43f3e5d4808d9ad841e4b to your computer and use it in GitHub Desktop.
Youtube comment scraper
import httplib2
import sys
import urllib2
import xml.etree.ElementTree as ET
# 1. To get all of this set up, you need python 2.7 and pip
# 2. Run "pip install --upgrade google-api-python-client" to get the dependencies
# 3. Go to console.cloud.google.com and create a new project
# 4. Go to API's and Services, and generate an OAuth client ID. Download the json
# it gives you and place it in the same folder as this script. Make sure to name
# it "client_secrets.json"
# 5. Go to https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest and save the page as a json file
# 6. Rename the file "youtube-v3-api-captions.json" and put it in the same folder as this script
from apiclient.discovery import build_from_document
from apiclient.errors import HttpError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import argparser, run_flow
# OAuth client-secrets file downloaded from the Google Cloud console (step 4 above).
CLIENT_SECRETS_FILE = "client_secrets.json"
# Scope granting read/write access to the authenticated user's YouTube account.
YOUTUBE_READ_WRITE_SSL_SCOPE = "https://www.googleapis.com/auth/youtube.force-ssl"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
def get_authenticated_service(args):
    """Authenticate via OAuth and build a YouTube API client.

    Reuses cached credentials from a local "<script>-oauth2.json" file when
    they are still valid; otherwise runs the interactive OAuth flow. The
    client is built from the locally saved API discovery document
    (see setup steps 5-6 above).
    """
    flow = flow_from_clientsecrets(
        CLIENT_SECRETS_FILE, scope=YOUTUBE_READ_WRITE_SSL_SCOPE)
    # Credentials cache lives next to the script, named after it.
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run_flow(flow, storage, args)
    with open("youtube-v3-api-captions.json", "r") as doc_file:
        discovery_doc = doc_file.read()
    authorized_http = credentials.authorize(httplib2.Http())
    return build_from_document(discovery_doc, http=authorized_http)
def remove_empty_kwargs(**kwargs):
    """Return a copy of kwargs with falsy values (None, '', 0) removed.

    The YouTube API client rejects parameters with empty values, so they
    are stripped before kwargs are forwarded to a list() call.
    """
    # **kwargs is always a dict, so the original "is not None" guard was dead
    # code. .items() behaves the same as the Python-2-only .iteritems() here
    # and keeps the function portable.
    return {key: value for key, value in kwargs.items() if value}
def video_categories_list(client, **kwargs):
    """List YouTube video categories; return [(category_id, title), ...]."""
    # The API rejects empty parameters, so drop falsy kwargs before the call.
    params = {k: v for k, v in kwargs.items() if v}
    data = client.videoCategories().list(**params).execute()
    return [(item["id"], item["snippet"]["title"]) for item in data["items"]]
def videos_list_most_popular(client, **kwargs):
    """Query the videos chart; return [(video_id, channel_id), ...]."""
    # The API rejects empty parameters, so drop falsy kwargs before the call.
    params = {k: v for k, v in kwargs.items() if v}
    data = client.videos().list(**params).execute()
    return [(item["id"], item["snippet"]["channelId"])
            for item in data["items"]]
def search(client, **kwargs):
    """Run a search query; return the videoIds of results that are videos.

    Search results can also be channels or playlists, so non-video hits
    are filtered out by kind.
    """
    # The API rejects empty parameters, so drop falsy kwargs before the call.
    params = {k: v for k, v in kwargs.items() if v}
    data = client.search().list(**params).execute()
    return [item["id"]["videoId"]
            for item in data["items"]
            if item["id"]["kind"] == "youtube#video"]
def check_captions(vidId):
    """Return True if the video has an English timed-text (caption) track.

    Queries Google's public timedtext endpoint for the video; an empty
    response body means no English caption track exists.
    """
    site = "http://video.google.com/timedtext?lang=en&v="
    response = urllib2.urlopen(site + vidId)
    try:
        # Non-empty body => a caption track was returned.
        return len(response.read()) > 0
    finally:
        # urlopen handles are not closed automatically; avoid leaking the
        # socket across the many calls this script makes.
        response.close()
def get_trending_vids(client, thoroughness=10):
categories = video_categories_list(client, part='snippet', regionCode='US')
results = []
for cat in categories + [('', "no category")]:
top_results = []
try:
top_results = videos_list_most_popular(
client,
part='snippet',
chart='mostPopular',
regionCode='US',
videoCategoryId=cat[0],
maxResults=thoroughness)
except:
print "Couldn't get videos in category:", cat[1]
continue
hits = 0
for vid in top_results:
if check_captions(vid[0]):
results.append(vid[1])
hits += 1
print hits, '/', len(top_results), cat[1]
f = open("potential.txt", 'w')
for i in results:
f.write(i + '\n')
f.close()
def filter_channels(client, potential, thoroughness=10, threshold=.5):
good_channels = []
for channel in potential:
vids = search(
youtube,
part='snippet',
maxResults=thoroughness,
channelId=channel.strip(),
order="date")
hits = [check_captions(i) for i in vids].count(True)
print hits, '/', len(vids)
if float(hits) / len(vids) >= threshold:
good_channels.append(channel)
print len(good_channels), "of those had a good ammount of captions"
f = open("results.txt", 'a')
for i in good_channels:
f.write(i)
f.close()
youtube = get_authenticated_service(argparser.parse_args())
get_trending_vids(youtube, 10)
f = open("potential.txt")
potential = set(f.readlines())
f.close()
print "found", len(potential), "potential channels..."
filter_channels(youtube, potential, 10, .5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment