def category_extractor(data):
    """Build a lookup from numeric category id to category title.

    Args:
        data: Mapping with an ``'items'`` sequence. Each item must provide
            an ``'id'`` convertible with ``int()`` and a nested
            ``['snippet']['title']`` value.

    Returns:
        dict: ``{int(id): title}`` for every item. If the same id occurs
        more than once, the last item's title is kept.

    Raises:
        KeyError: If ``'items'``, ``'id'``, ``'snippet'`` or ``'title'`` is
            missing.
        ValueError: If an id cannot be converted to ``int``.
    """
    # Ids are converted to int so the keys can be matched against an
    # integer-typed id column when the result is passed to Series.map().
    return {
        int(item['id']): item['snippet']['title']
        for item in data['items']
    }
# Give each frame a readable 'category_title' column by translating its
# 'category_id' values through the lookup built from that frame's own
# category data.
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

# Stack the three frames under a fresh index, then keep a single row per
# video_id so a video present in more than one frame is counted once.
df = pd.concat([df1, df2, df3], ignore_index=True).drop_duplicates(subset='video_id')

# Extract the titles of Entertainment videos as a plain Python list.
# Swap 'Entertainment' for any other category_title to build a different set.
entertainment = df.loc[df['category_title'] == 'Entertainment', 'title'].tolist()
def clean_text(text: str) -> str:
    """Normalize a title: strip ASCII punctuation, lowercase, drop non-ASCII.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a
        character may leave consecutive spaces behind.
    """
    # One translate() pass deletes every character in string.punctuation.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    # Lowercase before dropping non-ASCII: some non-ASCII letters lowercase
    # to an ASCII letter (plus combining marks), which should be kept.
    lowered = stripped.lower()
    # Encoding straight to ASCII with errors='ignore' discards every
    # non-ASCII character without a UTF-8 round trip.
    return lowered.encode('ascii', 'ignore').decode('ascii')
# Run every collected title through clean_text to form the corpus.
corpus = list(map(clean_text, entertainment))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment