Skip to content

Instantly share code, notes, and snippets.

@shushiej
Last active December 5, 2020 04:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shushiej/ee0bcea780adda7ccd9118fd0d3c5e5c to your computer and use it in GitHub Desktop.
Save shushiej/ee0bcea780adda7ccd9118fd0d3c5e5c to your computer and use it in GitHub Desktop.
Extracting JRE YouTube data
import pandas as pd
import pafy
# Method to get the total minutes from the Duration Column
def convert_to_min(x):
    """Return the whole minutes contained in a colon-separated duration.

    ``x`` is converted with ``str`` and split on ":".  A three-field value
    is read as ``H:M:S`` and yields ``H * 60 + M``.  Any other value yields
    its first field converted to ``int``.  The last field of a three-field
    value is never read, so the result is truncated, not rounded.

    Raises:
        ValueError: if a field that is read is not an integer literal.
    """
    splits = str(x).split(":")
    if len(splits) == 3:
        # H:M:S -> hours expressed in minutes plus the minutes field.
        return int(splits[0]) * 60 + int(splits[1])
    # Fewer (or more) fields: the leading field is taken as the minute count.
    return int(splits[0])
# Use string manipulation to get the Guest Name out of the Title
def get_guest(x):
    """Extract the guest name from an episode title.

    The title (``str(x)``) is split on the first separator that occurs,
    tried in this order, and the second piece is returned stripped:

    1. a dash with whitespace on both sides (``"#100 - Jean-Claude X"``),
       so hyphens inside a name do not cut it short;
    2. any dash at all (``"#100-Guest"``);
    3. the whole word ``with`` (``"#5 with Guest"``), case-sensitive.

    If no separator is present, ``x`` itself is returned unchanged.
    """
    import re  # local import: only this helper needs it

    title = str(x)
    for separator in (r"\s-\s", r"-", r"\bwith\b"):
        pieces = re.split(separator, title)
        if len(pieces) > 1:
            # The piece right after the first separator is the guest.
            return pieces[1].strip()
    return x
# Register the YouTube Data API key with pafy ("YOUR_API_KEY" is a placeholder).
pafy.set_api_key("YOUR_API_KEY")
# Playlist URLs whose items are collected below, one per line.
all_jre_playlists = [
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
]
all_jre_info = []
# Fetch each playlist and gather all of their items into one flat list.
for plurl in all_jre_playlists:
    playlist = pafy.get_playlist(plurl)
    all_jre_info.extend(playlist['items'])
len(all_jre_info)
# 1325 (count noted by the original author)
yt_jre = pd.DataFrame.from_dict(all_jre_info)
# Expand each 'playlist_meta' dict into its own columns; the frame is
# replaced by that expansion, so the other item fields are dropped.
yt_jre = yt_jre['playlist_meta'].apply(pd.Series)
# 'time_created' is interpreted as seconds since the Unix epoch.
yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')
# Split the creation timestamp into year, month and day columns.
yt_jre['year'] = yt_jre['timestamp_created'].dt.year
yt_jre['month'] = yt_jre['timestamp_created'].dt.month
yt_jre['day'] = yt_jre['timestamp_created'].dt.day
# Episode number = the digits following '#' in the title. Raw string avoids
# the invalid '\d' escape; '\d+' leaves NaN (not '') when no number follows.
yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d+)', expand=False)
yt_jre['guest'] = yt_jre['title'].apply(get_guest)
# Convert the duration into an integer number of minutes.
yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)
# Strip the thousands separators (literal match, no regex) and cast to int.
yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False)
yt_jre['views_raw'] = yt_jre['views_raw'].astype(int)
# Some basic stats: the mean of four columns. These are bare expressions, so
# they only display a value in an interactive session or notebook; the number
# under each line is the result noted by the original author.
yt_jre['views_raw'].mean()
# 1323460.4403323263
yt_jre['duration_minutes'].mean()
# 150.0725075528701
yt_jre['likes'].mean()
# 16713.877643504533
yt_jre['dislikes'].mean()
# 1566.5430513595165
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment