Skip to content

Instantly share code, notes, and snippets.

@robert8138
Last active June 10, 2020 18:50
Show Gist options
  • Save robert8138/aa0e08bd56b6597f278d2622397a2505 to your computer and use it in GitHub Desktop.
Save robert8138/aa0e08bd56b6597f278d2622397a2505 to your computer and use it in GitHub Desktop.
Use the YouTube API to get view trends of popular MOOC courses
import os
import pandas as pd
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import matplotlib
# OAuth scope requested from the user: read-only access to YouTube data.
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

# Name and version of the Google API to build a client for.
api_service_name = "youtube"
api_version = "v3"

# Path to the OAuth client secrets JSON downloaded from the Google API console.
client_secrets_file = "YOUR_CLIENT_SECRET_FILE.json"

# Get credentials and create an API client.
flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)

# `run_console()` depended on the out-of-band (copy/paste) OAuth flow, which
# Google has retired and which google-auth-oauthlib 1.0 removed. Use the
# loopback flow instead; `port=0` lets the OS pick any free local port for the
# temporary redirect server.
credentials = flow.run_local_server(port=0)

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, credentials=credentials)
# Maps a human-readable course name to the id of its YouTube playlist.
# The id is what gets passed as `playlistId` when listing playlist items.
course_dict = {
    # Stanford
    "Stanford CS 230": "PLoROMvodv4rOABXSygHTsbvUz4G_YQhOb",
    "Stanford CS 224N": "PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z",
    "Stanford CS 231N": "PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk",
    "Stanford CS 229": "PLa-Bt050gYuhEeLRG8YBmFxwLvTJ5FqPS",
    # Other courses
    "fast.ai Course": "PLCdvEQLhYkYmKTKWTrH7bHtQ1CsKZaQBl",
    "MIT Linear Algebra": "PL49CF3715CB9EF31D",
    "Caltech Learning from Data": "PLD63A284B7615313A",
    "How to Start a Startup": "PL11qn6zM2Y3bMZdChxEqHKaCaKUjwItGL",
    # Any other YouTube playlist id can be added here.
}
def get_view_ts(course_name, api_client, *, playlist_id=None):
    """Return per-video view counts for one course playlist.

    Lists every video in the playlist (following pagination), fetches the
    view count of each video, and expresses each count relative to the first
    video in the playlist.

    Args:
        course_name: Label written into the ``course_name`` column. When
            ``playlist_id`` is not given it is also the key looked up in the
            module-level ``course_dict`` to find the playlist id.
        api_client: YouTube Data API client exposing ``playlistItems()`` and
            ``videos()`` resources (e.g. the object returned by
            ``googleapiclient.discovery.build("youtube", "v3", ...)``).
        playlist_id: Optional playlist id that overrides the ``course_dict``
            lookup, so any playlist can be queried without editing the dict.

    Returns:
        pandas.DataFrame with columns ``course_name``, ``title``,
        ``video_id``, ``views`` (float) and ``view_ratio``
        (``views`` divided by the first row's ``views``), one row per video
        in playlist order. Videos for which the API returns no view count
        (e.g. private or deleted entries) are left out. The frame is empty,
        with the same columns, when the playlist yields no usable video.

    Raises:
        ValueError: If no playlist id is given and ``course_name`` is not a
            key of ``course_dict``.
    """
    if playlist_id is None:
        playlist_id = course_dict.get(course_name)
    if not playlist_id:
        raise ValueError(
            "No playlist id known for course {!r}".format(course_name)
        )

    # Step 1: collect (title, video_id) for the whole playlist. The API
    # returns at most 50 items per request, so keep asking for the next page
    # until no continuation token comes back.
    videos = []
    page_token = None
    while True:
        params = {
            'part': "snippet",
            'maxResults': 50,
            'playlistId': playlist_id,
        }
        if page_token:
            params['pageToken'] = page_token
        response = api_client.playlistItems().list(**params).execute()
        for item in response.get('items', []):
            snippet = item.get('snippet') or {}
            video_id = (snippet.get('resourceId') or {}).get('videoId')
            if video_id is not None:
                videos.append((snippet.get('title'), video_id))
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    # Step 2: fetch statistics in batches. `videos.list` accepts a
    # comma-separated id list, so 50 ids cost one request instead of 50.
    # Results are keyed by id because unavailable videos are simply absent
    # from the response.
    video_ids = [video_id for _, video_id in videos]
    views_by_id = {}
    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]
        response = api_client.videos().list(
            part="statistics",
            id=",".join(batch),
        ).execute()
        for item in response.get('items', []):
            view_count = (item.get('statistics') or {}).get('viewCount')
            if view_count is not None:
                views_by_id[item.get('id')] = float(view_count)

    # Step 3: assemble the frame in playlist order, skipping videos that
    # came back without a view count.
    rows = [
        {
            'course_name': course_name,
            'title': title,
            'video_id': video_id,
            'views': views_by_id[video_id],
        }
        for title, video_id in videos
        if video_id in views_by_id
    ]
    df = pd.DataFrame(
        rows, columns=['course_name', 'title', 'video_id', 'views']
    )
    df['views'] = df['views'].astype(float)

    # Normalise by the first video. Use positional access and guard the empty
    # case so an empty playlist yields an empty frame rather than an error.
    first_views = df['views'].iloc[0] if not df.empty else float('nan')
    df['view_ratio'] = df['views'] / first_views
    return df
# Fetch every course and stack the per-course frames. Each frame keeps its own
# 0..n-1 index, so after concatenation the index is the row's position within
# its course -- which is what the pivot below uses as the x axis.
course_frames = []
for course_name in course_dict:
    df_per_course = get_view_ts(course_name, youtube)
    # Select columns with a list; a bare tuple key would be looked up as a
    # single (non-existent) column label.
    course_frames.append(
        df_per_course[["course_name", "title", "views", "view_ratio"]]
    )
df_result = pd.concat(course_frames)

# One column per course, one row per position in the playlist. The values are
# the views relative to each course's first video, matching the y-axis label
# and the percent formatting applied below.
df_pivot = (
    df_result
    .loc[:, ['course_name', 'view_ratio']]
    .pivot(columns='course_name', values='view_ratio')
)

ax = df_pivot.plot(
    title='View Trends Overtime',
    kind='line',
    figsize=[15, 10],
)
ax.set_xlabel("Video # in the YoutTube Playlist", size=14)
ax.set_ylabel("% Views relative to First Video", size=14)

# Pin the tick positions before relabelling them so the percent labels stay
# attached to the ticks they were computed from.
vals = ax.get_yticks()
ax.set_yticks(vals)
_ = ax.set_yticklabels(['{:,.1%}'.format(x) for x in vals])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment