# shushiej/pafy_jre_yt.py

## pafy_jre_yt.py
import pandas as pd
import pafy

# Method to get the total minutes from the Duration Column
def convert_to_min(x):
    """Return the whole-minute count encoded in a colon-separated duration.

    The value is stringified and split on ":".  When there are exactly
    three fields, the first two are read as hours and minutes and the
    result is ``hours * 60 + minutes``; the third field is ignored.  For
    any other number of fields only the first field is converted and
    returned unchanged.

    Raises:
        ValueError: propagated from ``int()`` when a field that is read
            is not an integer literal.
    """
    fields = str(x).split(":")
    if len(fields) != 3:
        # NOTE(review): presumably "MM:SS", so the first field is minutes -- confirm
        return int(fields[0])

    hours, minutes = int(fields[0]), int(fields[1])
    return hours * 60 + minutes

# Use string manipulation to get the guest name out of the title
def get_guest(x):
    """Extract the guest portion of an episode title.

    The title is stringified and the first matching rule wins:

    1. It contains the spaced separator " - ": return the text between
       the first and second " - " (or up to the end of the title).
       Splitting on the spaced form keeps hyphenated names such as
       "Jean-Claude" intact.
    2. It contains a bare "-": return the text between the first and
       second "-".
    3. It contains "with": return the text between the first and second
       "with".
    4. Otherwise return ``x`` itself, unchanged (not stringified).

    The extracted text is stripped of surrounding whitespace.
    """
    title = str(x)

    if " - " in title:
        return title.split(" - ")[1].strip()
    if "-" in title:
        # Fallback for titles that use a hyphen without surrounding spaces.
        return title.split("-")[1].strip()
    if "with" in title:
        return title.split("with")[1].strip()
    return x

# Placeholder key: replace with a real API key before running.
pafy.set_api_key("YOUR_API_KEY")

all_jre_playlists = [
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
]

all_jre_info = []
# Collect the items of every playlist into one flat list
for plurl in all_jre_playlists:
    playlist = pafy.get_playlist(plurl)
    all_jre_info.extend(playlist['items'])

# NOTE: bare expressions like the one below only display a value in an
# interactive session; the trailing comments are the values noted by the
# original author.
len(all_jre_info)
# 1325

yt_jre = pd.DataFrame.from_dict(all_jre_info)

# Expand each item's 'playlist_meta' mapping into its own set of columns
yt_jre = yt_jre['playlist_meta'].apply(pd.Series)

# Interpret 'time_created' as seconds since the Unix epoch
yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')

# Extract the year, month and day of the timestamp into their own columns
created = yt_jre['timestamp_created'].dt
yt_jre['year'] = created.year
yt_jre['month'] = created.month
yt_jre['day'] = created.day

# Extract the digits following '#' in the title as the episode number.
# The pattern is a raw string so that \d is not parsed as a string escape.
yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d*)', expand=True)
yt_jre['guest'] = yt_jre['title'].apply(get_guest)

# Convert the duration into an integer number of minutes
yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)

# Drop the thousands separators from the view counts, then convert to int
yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False)
yt_jre['views_raw'] = yt_jre['views_raw'].astype(int)

# Some basic stats
yt_jre['views_raw'].mean()
# 1323460.4403323263
yt_jre['duration_minutes'].mean()
# 150.0725075528701
yt_jre['likes'].mean()
# 16713.877643504533
yt_jre['dislikes'].mean()
# 1566.5430513595165
	import pandas as pd
	import pafy

	# Method to get the total minutes from the Duration Column
	def convert_to_min(x):
	    """Return the whole-minute count encoded in a colon-separated duration.

	    The value is stringified and split on ":".  When there are exactly
	    three fields, the first two are read as hours and minutes and the
	    result is ``hours * 60 + minutes``; the third field is ignored.  For
	    any other number of fields only the first field is converted and
	    returned unchanged.

	    Raises:
	        ValueError: propagated from ``int()`` when a field that is read
	            is not an integer literal.
	    """
	    splits = str(x).split(":")
	    if len(splits) == 3:
	        hr = int(splits[0]) * 60
	        total = hr + int(splits[1])
	    else:
	        # NOTE(review): presumably "MM:SS", so the first field is minutes -- confirm
	        total = int(splits[0])

	    return total

	# Use string manipulation to get the Guest Name out of the Title
	def get_guest(x):
	    """Extract the guest portion of an episode title.

	    The title is stringified and the first matching rule wins:

	    1. It contains the spaced separator " - ": return the text between
	       the first and second " - " (or up to the end of the title).
	       Splitting on the spaced form keeps hyphenated names such as
	       "Jean-Claude" intact.
	    2. It contains a bare "-": return the text between the first and
	       second "-".
	    3. It contains "with": return the text between the first and second
	       "with".
	    4. Otherwise return ``x`` itself, unchanged (not stringified).

	    The extracted text is stripped of surrounding whitespace.
	    """
	    title = str(x)

	    if " - " in title:
	        return title.split(" - ")[1].strip()
	    if "-" in title:
	        # Fallback for titles that use a hyphen without surrounding spaces.
	        return title.split("-")[1].strip()
	    if "with" in title:
	        return title.split("with")[1].strip()
	    return x

	# Placeholder key: replace with a real API key before running.
	pafy.set_api_key("YOUR_API_KEY")

	all_jre_playlists = [
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kt_vFRd3OyzqQ78Go4c1fyn",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuQyLE4RjEOdJ_-0epbcBb4",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvXucAFMo5Tc5p8e_mcc-5g",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtYIPnFjpI19BCz2unzWYlJ",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Kvv8T6ZESpJ2nvEHT9xBhlb",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KvtMA4mCQSnzGsZe8qsTdzV",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33Ku0Oa3t8MQjV7D_G_PBi8g1",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuU_aJDvMPPAy_SoxXTt_ub",
	    "https://www.youtube.com/playlist?list=PLk1Sqn_f33KtVQWWnE_V6-sypm5zUMkU6",
	]

	all_jre_info = []
	# Collect the items of every playlist into one flat list
	for plurl in all_jre_playlists:
	    playlist = pafy.get_playlist(plurl)
	    all_jre_info.extend(playlist['items'])

	# NOTE: bare expressions like the one below only display a value in an
	# interactive session; the trailing comments are the values noted by the
	# original author.
	len(all_jre_info)
	# 1325

	yt_jre = pd.DataFrame.from_dict(all_jre_info)

	# Expand each item's 'playlist_meta' mapping into its own set of columns
	yt_jre = yt_jre['playlist_meta'].apply(pd.Series)

	# Interpret 'time_created' as seconds since the Unix epoch
	yt_jre['timestamp_created'] = pd.to_datetime(yt_jre['time_created'], unit='s')

	# Extract the year, month and day of the timestamp into their own columns
	created = yt_jre['timestamp_created'].dt
	yt_jre['year'] = created.year
	yt_jre['month'] = created.month
	yt_jre['day'] = created.day

	# Extract the digits following '#' in the title as the episode number.
	# The pattern is a raw string so that \d is not parsed as a string escape.
	yt_jre['Episode'] = yt_jre['title'].str.extract(r'#(\d*)', expand=True)
	yt_jre['guest'] = yt_jre['title'].apply(get_guest)

	# Convert the duration into an integer number of minutes
	yt_jre['duration_minutes'] = yt_jre['duration'].apply(convert_to_min)

	# Drop the thousands separators from the view counts, then convert to int
	yt_jre['views_raw'] = yt_jre['views'].str.replace(",", "", regex=False)
	yt_jre['views_raw'] = yt_jre['views_raw'].astype(int)

	# Some basic stats
	yt_jre['views_raw'].mean()
	# 1323460.4403323263
	yt_jre['duration_minutes'].mean()
	# 150.0725075528701
	yt_jre['likes'].mean()
	# 16713.877643504533
	yt_jre['dislikes'].mean()
	# 1566.5430513595165