Created
November 15, 2020 22:48
-
-
Save katipogluMustafa/7a2d86b02ae339cce20730fee6b1a662 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class NetflixDataset(Dataset):
    """Load a Netflix-style ratings file into a flat ratings table.

    The file is expected to interleave two kinds of rows:

    * movie header rows, whose first field is the movie id followed by one
      trailing character (presumably ``:``) and whose rating field is empty;
    * rating rows of the form ``user_id,rating,timestamp`` that belong to
      the most recent header row above them.
    """

    @staticmethod
    def load_ratings(ratings_path, ratings_col_names, filter_users=True):
        """Read the ratings file and return one row per rating.

        Args:
            ratings_path: Path of the ratings file.
            ratings_col_names: Only checked for being non-empty; the columns
                are always named ``user_id``, ``rating`` and ``timestamp``
                because the rest of this method relies on those names.
            filter_users: When true (the default), keep only the ratings of
                the users selected by ``get_filtered_netflix_users`` to cut
                memory and run time. Pass ``False`` to keep every rating.

        Returns:
            A DataFrame with the columns ``user_id`` (int), ``item_id`` (int),
            ``rating`` (float) and ``timestamp`` (datetime), indexed by the
            row position in the raw file, or ``None`` when ``ratings_path``
            is not a file or ``ratings_col_names`` is empty.

        Raises:
            ValueError: If the file contains no movie header row.
        """
        if not os.path.isfile(ratings_path) or not ratings_col_names:
            return None

        ratings_raw = pd.read_csv(
            ratings_path,
            header=None,
            names=['user_id', 'rating', 'timestamp'],
            usecols=[0, 1, 2],
        )
        ratings_raw['rating'] = ratings_raw['rating'].astype(float)

        # Rows without a rating are the movie header rows.
        is_header = ratings_raw['rating'].isna()
        if not is_header.any():
            raise ValueError(
                'No movie header rows found in {!r}'.format(ratings_path)
            )

        # Strip the trailing character of each header label to get the id.
        header_labels = ratings_raw.loc[is_header, 'user_id']
        movie_ids = pd.Series(
            [int(label[:-1]) for label in header_labels],
            index=header_labels.index,
        )

        # Propagate each movie id down to the rating rows that follow its
        # header. Unlike slicing between consecutive header positions, this
        # cannot lose the last rating of a movie to an off-by-one endpoint.
        item_ids = movie_ids.reindex(ratings_raw.index).ffill()

        # Keep rating rows only; rows before the first header have no movie.
        is_rating = ~is_header & item_ids.notna()
        ratings = ratings_raw.loc[is_rating].copy()
        ratings['item_id'] = item_ids[is_rating].astype(int)

        # Release the large intermediates before the conversions below.
        del ratings_raw, item_ids, is_header, is_rating, header_labels, movie_ids

        # Use the same column order as the MovieLens dataset for ease of use.
        ratings = ratings.reindex(
            columns=['user_id', 'item_id', 'rating', 'timestamp']
        )
        # Convert the string timestamps into datetime type.
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])
        # Convert the string user ids into int.
        ratings['user_id'] = ratings['user_id'].astype(int)

        if filter_users:
            netflix_users = NetflixDataset.get_filtered_netflix_users(ratings)
            ratings = ratings.loc[ratings['user_id'].isin(netflix_users)]
        return ratings

    @staticmethod
    def get_filtered_netflix_users(ratings, min_ratings=40):
        """Return the ids of a reduced set of active users.

        Users are ranked by their number of ratings (descending). Only users
        with strictly more than ``min_ratings`` ratings are kept, and then
        only the first user for each distinct mean rating and for each
        distinct rating count survives.

        NOTE(review): the two ``drop_duplicates`` steps look like a deliberate
        down-sampling heuristic for memory/time limits rather than a
        statistical filter -- confirm before relying on the selection.

        Args:
            ratings: DataFrame with at least ``user_id`` and ``rating``.
            min_ratings: Exclusive lower bound on a user's rating count.

        Returns:
            A NumPy array of the selected ``user_id`` values.
        """
        active_users = ratings.groupby('user_id')['rating'].agg(
            mean_rating='mean',
            No_of_ratings='count',
        )
        active_users = active_users.sort_values(
            by=['No_of_ratings'], ascending=False
        )
        frequent_users = active_users.loc[
            active_users['No_of_ratings'] > min_ratings
        ]
        return (
            frequent_users
            .drop_duplicates('mean_rating')
            .drop_duplicates('No_of_ratings')
            .index.values
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment