Skip to content

Instantly share code, notes, and snippets.

@katipogluMustafa
Created November 15, 2020 22:48
Show Gist options
  • Save katipogluMustafa/7a2d86b02ae339cce20730fee6b1a662 to your computer and use it in GitHub Desktop.
Save katipogluMustafa/7a2d86b02ae339cce20730fee6b1a662 to your computer and use it in GitHub Desktop.
class NetflixDataset(Dataset):
    """Load a Netflix-style ratings file into a flat ratings DataFrame.

    The raw file is read as three columns (``user_id``, ``rating``,
    ``timestamp``). Rows whose ``rating`` field is empty are treated as
    movie headers: the first field of such a row, minus its last character,
    is the integer movie id that applies to every following row up to the
    next header.
    """

    @staticmethod
    def load_ratings(ratings_path, ratings_col_names):
        """Read the ratings file and return one row per (user, movie) rating.

        Args:
            ratings_path: Path of the raw ratings file.
            ratings_col_names: Only checked for truthiness; the column names
                actually used are fixed to ``user_id``, ``rating`` and
                ``timestamp``.

        Returns:
            ``None`` if ``ratings_path`` is not an existing file or
            ``ratings_col_names`` is empty. Otherwise a DataFrame with the
            columns ``user_id`` (int), ``item_id`` (int), ``rating`` (float)
            and ``timestamp`` (datetime), restricted to the users selected by
            ``get_filtered_netflix_users``.

        Raises:
            ValueError: If the file contains no movie header row.
        """
        if not os.path.isfile(ratings_path) or not ratings_col_names:
            return None

        ratings_raw = pd.read_csv(
            ratings_path,
            header=None,
            names=['user_id', 'rating', 'timestamp'],
            usecols=[0, 1, 2],
        )
        ratings_raw['rating'] = ratings_raw['rating'].astype(float)

        # Header rows have no rating. read_csv produces a default 0..n-1
        # index, so the index labels of these rows are also their positions.
        header_rows = ratings_raw.loc[ratings_raw['rating'].isna(), 'user_id']
        header_positions = header_rows.index.tolist()
        if not header_positions:
            raise ValueError(
                'No movie header rows found in {!r}'.format(ratings_path)
            )
        # Strip the trailing separator character from the header to get the id.
        movie_ids = [int(header[:-1]) for header in header_rows]

        # Each movie's ratings run from the row after its header up to (but
        # not including) the next header; the last movie runs to end of file.
        stops = header_positions[1:] + [len(ratings_raw)]

        per_movie = []
        for start, stop, movie_id in zip(header_positions, stops, movie_ids):
            # iloc slicing excludes `stop`, so the next header row is skipped
            # while the movie's final rating row is kept.
            movie_ratings = ratings_raw.iloc[start + 1:stop].copy()
            movie_ratings['item_id'] = movie_id
            per_movie.append(movie_ratings)

        # Combine all per-movie frames
        ratings = pd.concat(per_movie)
        # Release the intermediate copies before the conversions below.
        del per_movie, ratings_raw, header_rows

        # Use the same column order as the MovieLens dataset for ease of use
        ratings = ratings.reindex(
            columns=['user_id', 'item_id', 'rating', 'timestamp']
        )
        # Convert the string timestamps into datetime type
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])
        # Convert the string user ids into int
        ratings['user_id'] = ratings['user_id'].astype(int)

        #### !!! Drop these two lines if no memory and time constraints are set.
        netflix_users = NetflixDataset.get_filtered_netflix_users(ratings)
        ratings = ratings.loc[ratings['user_id'].isin(netflix_users)]
        return ratings

    @staticmethod
    def get_filtered_netflix_users(ratings, min_ratings=40):
        """Return the ids of a thinned-out set of active users.

        Users are ranked by their number of ratings (descending). Only users
        with more than ``min_ratings`` ratings are kept, and of those only
        the first user for each distinct mean rating, then the first user
        for each distinct rating count. The de-duplication deliberately
        shrinks the user set; ``load_ratings`` uses it to limit memory and
        run time.

        Args:
            ratings: DataFrame with at least ``user_id`` and ``rating``
                columns.
            min_ratings: A user must have strictly more ratings than this
                to be kept.

        Returns:
            Array of the selected ``user_id`` values.
        """
        per_user = ratings.groupby('user_id')['rating'].agg(['mean', 'count'])
        per_user.columns = ['mean_rating', 'No_of_ratings']
        per_user = per_user.sort_values(by=['No_of_ratings'], ascending=False)

        active_users = per_user.loc[per_user['No_of_ratings'] > min_ratings]
        thinned = (
            active_users
            .drop_duplicates('mean_rating')
            .drop_duplicates('No_of_ratings')
        )
        return thinned.index.values
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment