Skip to content

Instantly share code, notes, and snippets.

@katipogluMustafa
Created November 15, 2020 22:54
Show Gist options
  • Save katipogluMustafa/4bd74bb920bab749082bcfd9a1846d2e to your computer and use it in GitHub Desktop.
Save katipogluMustafa/4bd74bb920bab749082bcfd9a1846d2e to your computer and use it in GitHub Desktop.
class NetflixDataset(Dataset):
    """Dataset that loads Netflix-format ratings into a MovieLens-like table.

    The raw file is read as three columns (``self.ratings_column_names``).
    Rows whose ``rating`` is missing are treated as movie headers: their
    first field holds the movie id followed by one trailing character
    (presumably ``':'``), and every row up to the next header is a rating
    for that movie.
    """

    def load_ratings(self):
        """Read, restructure, filter and sort the ratings file.

        Returns:
            DataFrame with ``user_id``, ``item_id``, ``rating`` and
            ``timestamp`` columns, restricted to high-activity users.

        Raises:
            InvalidDatasetInputFilePath: if ``self.ratings_file_path`` is
                rejected by ``Dataset.is_valid_input_file``.
        """
        if not Dataset.is_valid_input_file(self.ratings_file_path):
            raise InvalidDatasetInputFilePath
        unstructured_ratings = self.__read_ratings_data_from_file()
        ratings = NetflixDataset.__structure_ratings_dataframe(unstructured_ratings)
        ratings = NetflixDataset.__reduce_dataset_size_by_removing_low_active_user_data(ratings)
        # NOTE(review): the return value is ignored, so this presumably sorts
        # in place — confirm against Dataset.sort_ratings_by_timestamp.
        Dataset.sort_ratings_by_timestamp(ratings)
        return ratings

    def __read_ratings_data_from_file(self):
        """Read the first three columns of the header-less ratings file."""
        return pd.read_csv(
            self.ratings_file_path,
            header=None,
            names=self.ratings_column_names,
            usecols=[0, 1, 2],
        )

    @staticmethod
    def __structure_ratings_dataframe(ratings_raw):
        """Turn the raw frame into one row per rating with typed columns.

        ``ratings_raw`` has its ``rating`` column converted in place; the
        returned frame is a new object.
        """
        NetflixDataset.__convert_ratings_rating_column_format_to_float(ratings_raw)
        ratings = NetflixDataset.__convert_netflix_ratings_format_to_standard_movielens_like_format(
            ratings_raw)
        # reindex() returns a new frame, so the result has to be rebound.
        ratings = NetflixDataset.__convert_ratings_column_order_similar_to_movielens(ratings)
        NetflixDataset.__convert_ratings_timestamp_format_to_datetime(ratings)
        NetflixDataset.__convert_ratings_user_id_format_to_int(ratings)
        return ratings

    @staticmethod
    def __convert_ratings_rating_column_format_to_float(ratings):
        """Cast the ``rating`` column to float in place."""
        ratings['rating'] = ratings['rating'].astype(float)

    @staticmethod
    def __convert_netflix_ratings_format_to_standard_movielens_like_format(ratings_raw):
        """Attach an ``item_id`` column to every rating row and drop header rows.

        Args:
            ratings_raw: frame with ``user_id`` and ``rating`` columns in which
                movie header rows have a missing ``rating``.

        Returns:
            Concatenation of the per-movie rating rows (original index kept),
            each carrying the integer id of the header that precedes it. Rows
            before the first header are not included.

        Raises:
            ValueError: from ``pd.concat`` when there is no header row at all.
        """
        is_movie_header = ratings_raw['rating'].isna().values
        # Work with positions rather than index labels so slicing does not
        # depend on the frame having a default 0..n-1 index.
        header_positions = is_movie_header.nonzero()[0]
        header_labels = ratings_raw['user_id'].values[header_positions]
        # Strip the single trailing character after the movie id.
        movie_ids = [int(label[:-1]) for label in header_labels]
        # A movie's ratings end where the next header starts; the last
        # movie's ratings run to the end of the frame.
        block_ends = list(header_positions[1:]) + [len(ratings_raw)]

        per_movie_frames = []
        for header_position, block_end, movie_id in zip(header_positions, block_ends, movie_ids):
            # The slice end is exclusive, so ending at the next header keeps
            # the final rating of every movie.
            movie_ratings = ratings_raw.iloc[header_position + 1:block_end].copy()
            movie_ratings['item_id'] = movie_id
            per_movie_frames.append(movie_ratings)
        return pd.concat(per_movie_frames)

    @staticmethod
    def __convert_ratings_user_id_format_to_int(ratings):
        """Cast the ``user_id`` column to int in place."""
        ratings['user_id'] = ratings['user_id'].astype(int)

    @staticmethod
    def __convert_ratings_timestamp_format_to_datetime(ratings):
        """Parse the ``timestamp`` column to datetimes in place."""
        # NOTE(review): infer_datetime_format is deprecated since pandas 2.0
        # (it has no effect there); kept for older pandas versions.
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], infer_datetime_format=True)

    @staticmethod
    def __convert_ratings_column_order_similar_to_movielens(ratings):
        """Return a new frame with columns ordered user, item, rating, timestamp."""
        return ratings.reindex(columns=['user_id', 'item_id', 'rating', 'timestamp'])

    @staticmethod
    def __reduce_dataset_size_by_removing_low_active_user_data(ratings, min_ratings_count=40):
        """Keep only the ratings of users selected as high-activity.

        Args:
            ratings: structured ratings frame.
            min_ratings_count: a user must have strictly more ratings than
                this to be considered.
        """
        netflix_users = NetflixDataset.__get_high_active_user_list(ratings, min_ratings_count)
        return NetflixDataset.__drop_low_active_user_ratings_only_keep_high_ones(
            netflix_users, ratings)

    @staticmethod
    def __get_high_active_user_list(ratings, min_ratings_count_for_being_high_active):
        """Return the ids of users with more than the given number of ratings.

        Users are ordered by descending rating count. Among users that share
        the same mean rating, and then among those that share the same rating
        count, only the first one is kept.

        Returns:
            Array of ``user_id`` values.
        """
        # groupby only reads the frame, so no defensive copy is needed.
        per_user = ratings.groupby('user_id')['rating'].agg(
            mean_rating='mean', No_of_ratings='count')
        per_user = per_user.sort_values(by=['No_of_ratings'], ascending=False)
        is_high_active = per_user['No_of_ratings'] > min_ratings_count_for_being_high_active
        # NOTE(review): the two drop_duplicates calls discard users that merely
        # share a mean or a count with another user; presumably a deliberate
        # way to shrink the dataset further — confirm.
        return (per_user.loc[is_high_active]
                .drop_duplicates('mean_rating')
                .drop_duplicates('No_of_ratings')
                .index.values)

    @staticmethod
    def __drop_low_active_user_ratings_only_keep_high_ones(netflix_users, ratings):
        """Return the rows of ``ratings`` whose ``user_id`` is in ``netflix_users``."""
        return ratings.loc[ratings['user_id'].isin(netflix_users)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment