Last active
January 12, 2022 13:45
-
-
Save egpbos/19b8d337cb1922cd639441b72db7624a to your computer and use it in GitHub Desktop.
Use tf-idf to give weights to Google API video labels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Weight the Google API video labels of every video with tf-idf and save the
# resulting (video x label) matrix to csv.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Video whose labels are used below to sanity-check the tf-idf matrix.
CHECK_VIDEO_ID = 'v100_15'

data = pd.read_csv('google_api_frame_dec6.csv')

# We can look at unique labels for each video, or take into account that
# labels may occur multiple times in a single video (in different frames
# for instance). In this dataset, there are no separate frames, the labels
# are already unique, so this will work. In case there is per frame data
# for each video, groupby+unique should be replaced by something like
# [data[data.VideoID == vid].Entity.values for vid in data.VideoID.unique()]
unique_labels_per_video = data.groupby('VideoID').Entity.unique()

# Build one "document" per video: its labels joined by spaces. Spaces inside
# a label are replaced by underscores so a multi-word label stays one token.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
corpus = {
    video_id: ' '.join(label.replace(' ', '_') for label in labels)
    for video_id, labels in unique_labels_per_video.items()
}

# NOTE(review): with its default settings TfidfVectorizer lowercases the text
# and only keeps tokens of two or more word characters, so labels containing
# upper-case letters, punctuation such as '-', or a single character would
# not map one-to-one onto columns -- confirm the labels in the csv are safe.
vectorizer = TfidfVectorizer()
tfidf_values = vectorizer.fit_transform(corpus.values())

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that predate get_feature_names_out() (added in 1.0).
try:
    tfidf_feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = vectorizer.get_feature_names()

# Rows follow the insertion order of `corpus`, which is also the order in
# which the documents were handed to fit_transform.
df_tfidf = pd.DataFrame(
    data=tfidf_values.toarray(),
    columns=tfidf_feature_names,
    index=list(corpus),
)

# Check whether indeed the same labels are present for each video (for the
# data DF we just take all the label (Entity) values (applying the space to
# underscore transformation we did to the other labels) for the video and for
# the tf-idf weighted DF we look at all the labels with larger than 0 values):
data_labels = [
    label.replace(' ', '_')
    for label in data[data.VideoID == CHECK_VIDEO_ID].Entity.values
]
check_row = df_tfidf.loc[CHECK_VIDEO_ID]
assert sorted(data_labels) == sorted(check_row[check_row > 0].index.values)

# Save to csv
df_tfidf.to_csv("google_api_frame_dec6_tfidf.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment