Created
November 15, 2020 22:54
-
-
Save katipogluMustafa/4bd74bb920bab749082bcfd9a1846d2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class NetflixDataset(Dataset):
    """Load a Netflix-style ratings file into a MovieLens-like ratings table.

    The raw file is read as three columns named by ``self.ratings_column_names``
    (the code relies on ``user_id``, ``rating`` and ``timestamp`` being among
    them). Rows whose ``rating`` is missing are treated as movie headers: their
    ``user_id`` field holds the movie id followed by one trailing character
    (presumably ``:`` -- confirm against the data file), and every rating row
    up to the next header belongs to that movie.
    """

    def load_ratings(self):
        """Read, restructure, filter and sort the ratings.

        Returns:
            DataFrame with the columns ``user_id``, ``item_id``, ``rating`` and
            ``timestamp``, restricted to highly active users.

        Raises:
            InvalidDatasetInputFilePath: if ``Dataset.is_valid_input_file``
                rejects ``self.ratings_file_path``.
        """
        if not Dataset.is_valid_input_file(self.ratings_file_path):
            raise InvalidDatasetInputFilePath

        unstructured_ratings = self.__read_ratings_data_from_file()
        ratings = NetflixDataset.__structure_ratings_dataframe(unstructured_ratings)
        ratings = NetflixDataset.__reduce_dataset_size_by_removing_low_active_user_data(ratings)
        # NOTE(review): the return value is ignored, so this relies on the
        # helper sorting in place -- confirm against Dataset.
        Dataset.sort_ratings_by_timestamp(ratings)
        return ratings

    def __read_ratings_data_from_file(self):
        """Read the first three columns of the header-less ratings file."""
        return pd.read_csv(
            self.ratings_file_path,
            header=None,
            names=self.ratings_column_names,
            usecols=[0, 1, 2],
        )

    @staticmethod
    def __structure_ratings_dataframe(ratings_raw):
        """Turn the raw frame into typed ``user_id/item_id/rating/timestamp`` rows.

        ``ratings_raw`` has its ``rating`` column converted in place; the
        returned frame is a new object.
        """
        NetflixDataset.__convert_ratings_rating_column_format_to_float(ratings_raw)
        ratings = NetflixDataset.__convert_netflix_ratings_format_to_standard_movielens_like_format(ratings_raw)
        # reindex() returns a new frame, so the result has to be kept.
        ratings = NetflixDataset.__convert_ratings_column_order_similar_to_movielens(ratings)
        NetflixDataset.__convert_ratings_timestamp_format_to_datetime(ratings)
        NetflixDataset.__convert_ratings_user_id_format_to_int(ratings)
        return ratings

    @staticmethod
    def __convert_ratings_rating_column_format_to_float(ratings):
        """Cast the ``rating`` column to float, in place."""
        ratings['rating'] = ratings['rating'].astype(float)

    @staticmethod
    def __convert_netflix_ratings_format_to_standard_movielens_like_format(ratings_raw):
        """Attach an ``item_id`` column and drop the movie header rows.

        Each header row (missing ``rating``) starts a section; the rows between
        it and the next header (or the end of the frame) are copied and tagged
        with that header's movie id. Rows before the first header are dropped.

        Slicing is positional, so the index of ``ratings_raw`` is assumed to be
        the default 0..n-1 range produced by ``read_csv``.

        Raises:
            ValueError: from ``pd.concat`` when the frame has no header rows.
        """
        headers = ratings_raw.loc[ratings_raw['rating'].isna(), 'user_id']
        header_rows = list(headers.index)
        # The header text is the movie id plus one trailing character.
        movie_ids = [int(label[:-1]) for label in headers]
        # A section ends where the next header starts; the last one runs to
        # the end of the frame.
        section_ends = header_rows[1:] + [len(ratings_raw)]

        sections = []
        for header_row, section_end, movie_id in zip(header_rows, section_ends, movie_ids):
            # The slice end is exclusive, so stopping at the next header row
            # keeps the final rating of the section.
            section = ratings_raw[header_row + 1:section_end].copy()
            section['item_id'] = movie_id
            sections.append(section)
        return pd.concat(sections)

    @staticmethod
    def __convert_ratings_user_id_format_to_int(ratings):
        """Cast the ``user_id`` column to int, in place."""
        ratings['user_id'] = ratings['user_id'].astype(int)

    @staticmethod
    def __convert_ratings_timestamp_format_to_datetime(ratings):
        """Parse the ``timestamp`` column into datetimes, in place."""
        # infer_datetime_format is deprecated in pandas 2.x and has no effect
        # there, so it is not passed.
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

    @staticmethod
    def __convert_ratings_column_order_similar_to_movielens(ratings):
        """Return ``ratings`` with columns ordered user_id, item_id, rating, timestamp.

        The input frame is not modified; callers must use the returned frame.
        """
        return ratings.reindex(columns=['user_id', 'item_id', 'rating', 'timestamp'])

    @staticmethod
    def __reduce_dataset_size_by_removing_low_active_user_data(ratings, min_ratings_count=40):
        """Keep only the ratings of users selected as highly active.

        Args:
            ratings: ratings frame with ``user_id`` and ``rating`` columns.
            min_ratings_count: a user needs strictly more than this many
                ratings to be considered highly active.
        """
        netflix_users = NetflixDataset.__get_high_active_user_list(ratings, min_ratings_count)
        return NetflixDataset.__drop_low_active_user_ratings_only_keep_high_ones(netflix_users, ratings)

    @staticmethod
    def __get_high_active_user_list(ratings, min_ratings_count_for_being_high_active):
        """Return the ids of users with more than the given number of ratings.

        Returns:
            Array of ``user_id`` values, ordered by descending rating count.
        """
        # groupby does not mutate its input, so no defensive copy is needed.
        ratings_by_user = ratings.groupby('user_id')['rating']
        users = pd.DataFrame({
            'mean_rating': ratings_by_user.mean(),
            'No_of_ratings': ratings_by_user.count(),
        })
        users = users.sort_values(by=['No_of_ratings'], ascending=False)
        active_users = users.loc[users['No_of_ratings'] > min_ratings_count_for_being_high_active]
        # NOTE(review): these two calls also discard active users who merely
        # share a mean rating or a rating count with an earlier user. Kept
        # as-is because it may be intentional down-sampling -- confirm.
        return active_users.drop_duplicates('mean_rating').drop_duplicates('No_of_ratings').index.values

    @staticmethod
    def __drop_low_active_user_ratings_only_keep_high_ones(netflix_users, ratings):
        """Return the rows of ``ratings`` whose ``user_id`` is in ``netflix_users``."""
        return ratings.loc[ratings['user_id'].isin(netflix_users)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment