# Created October 5, 2020 03:48
def category_extractor(data: dict) -> dict:
    """Build a lookup from integer category id to category title.

    Args:
        data: Mapping with an ``'items'`` sequence. Each item must provide an
            ``'id'`` convertible with ``int()`` and a nested
            ``['snippet']['title']`` value.

    Returns:
        A dict mapping ``int(item['id'])`` to ``item['snippet']['title']``.
        If the same id appears more than once, the last title wins.

    Raises:
        KeyError: If ``'items'``, ``'id'``, ``'snippet'`` or ``'title'`` is missing.
        ValueError: If an id cannot be converted to ``int``.
    """
    # One pass over the items; ids are cast to int so the resulting keys can
    # be matched against numeric category ids.
    return {
        int(item['id']): item['snippet']['title']
        for item in data['items']
    }
#add a category_title column to each dataframe by looking up every category_id
#in the id -> title mapping built from the matching category data
for frame, raw_categories in ((df1, data1), (df2, data2), (df3, data3)):
    frame['category_title'] = frame['category_id'].map(category_extractor(raw_categories))
#stack the three dataframes with a fresh index, then drop rows that repeat a video_id
df = pd.concat([df1, df2, df3], ignore_index=True).drop_duplicates(subset='video_id')
#keep only the titles of entertainment videos, as a plain list
#feel free to use any category of video that you want
entertainment = df.loc[df['category_title'] == 'Entertainment', 'title'].tolist()
#remove punctuation, convert text to lowercase and drop non-ASCII characters
def clean_text(text: str) -> str:
    """Return *text* without ASCII punctuation, lowercased and ASCII-only.

    Args:
        text: The string to normalise.

    Returns:
        The text with every character in ``string.punctuation`` removed,
        converted to lowercase, and with all non-ASCII characters dropped.
        Whitespace is left untouched.
    """
    # Strip punctuation in a single C-level pass instead of filtering
    # character by character in Python.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    # Lowercase before dropping non-ASCII characters so that any ASCII
    # characters produced by lowercasing are kept.
    lowered = without_punctuation.lower()
    # Encoding with errors='ignore' silently discards anything outside ASCII.
    return lowered.encode('ascii', 'ignore').decode('ascii')
#normalise every entertainment title to build the text corpus
corpus = list(map(clean_text, entertainment))
