katipogluMustafa/load_ratings.py

## load_ratings.py
class NetflixDataset(Dataset):
    """Loader for ratings files whose movie header lines precede their ratings.

    The file is expected to contain header lines of the form ``<movie_id>:``,
    each followed by the ``user_id,rating,date`` lines belonging to that movie.
    """

    @staticmethod
    def load_ratings(ratings_path, ratings_col_names):
        """Read a ratings file and return it as one flat ratings table.

        Args:
            ratings_path: Path of the ratings file to read.
            ratings_col_names: Only checked for truthiness; an empty value
                makes the function return ``None``. The column names that are
                actually used are fixed (see Returns).

        Returns:
            ``None`` when ``ratings_path`` is not an existing file or
            ``ratings_col_names`` is empty. Otherwise a DataFrame with the
            columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``
            (in that order), restricted to the users selected by
            ``get_filtered_netflix_users``. Rows keep the index labels that
            ``read_csv`` assigned to them.
        """
        if not os.path.isfile(ratings_path) or not ratings_col_names:
            return None

        ratings_raw = pd.read_csv(
            ratings_path,
            header=None,
            names=['user_id', 'rating', 'timestamp'],
            usecols=[0, 1, 2],
        )
        ratings_raw['rating'] = ratings_raw['rating'].astype(float)

        # A header line ("<movie_id>:") has no rating field, so it is read as
        # a row whose rating is NaN and whose first field holds the label.
        is_header = ratings_raw['rating'].isna()
        movie_ids = ratings_raw.loc[is_header, 'user_id'].map(
            lambda label: int(str(label).rstrip(':'))
        )

        # Carry every header's movie id forward over the rating rows below it.
        # Unlike slicing between consecutive header positions, this cannot
        # lose the last rating row of a movie.
        item_ids = movie_ids.reindex(ratings_raw.index).ffill()

        # Drop the header rows themselves and any row that appears before the
        # first header (such rows belong to no movie).
        keep = ~is_header & item_ids.notna()
        ratings = ratings_raw.loc[keep].copy()
        ratings['item_id'] = item_ids[keep].astype(int)

        # Release the raw frame before the remaining column conversions to
        # keep peak memory down.
        del ratings_raw, item_ids

        # Use the same column order as the MovieLens dataset for ease of use.
        ratings = ratings.reindex(columns=['user_id', 'item_id', 'rating', 'timestamp'])

        # Convert the string timestamps into datetime type. The deprecated
        # ``infer_datetime_format`` flag is intentionally not passed.
        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

        # Convert the string user ids into int.
        ratings['user_id'] = ratings['user_id'].astype(int)

        # Down-sample the users to bound memory and run time. Remove these two
        # lines if there is no memory or time constraint.
        netflix_users = NetflixDataset.get_filtered_netflix_users(ratings)
        ratings = ratings.loc[ratings['user_id'].isin(netflix_users)]

        return ratings

    @staticmethod
    def get_filtered_netflix_users(ratings, min_ratings=40):
        """Select a reduced set of active user ids from a ratings table.

        Users are ordered by their number of ratings (descending). Those with
        more than ``min_ratings`` ratings are kept, and of those only the
        first user per distinct mean rating and then the first user per
        distinct rating count survive.

        Args:
            ratings: DataFrame with at least ``user_id`` and ``rating`` columns.
            min_ratings: A user must have strictly more ratings than this to
                be considered.

        Returns:
            Array of the selected ``user_id`` values.
        """
        per_user = ratings.groupby('user_id')['rating'].agg(['mean', 'count'])
        per_user.columns = ['mean_rating', 'No_of_ratings']
        per_user = per_user.sort_values(by=['No_of_ratings'], ascending=False)

        active = per_user.loc[per_user['No_of_ratings'] > min_ratings]
        sampled = active.drop_duplicates('mean_rating').drop_duplicates('No_of_ratings')
        return sampled.index.values
	class NetflixDataset(Dataset):
		"""Loader for ratings files whose movie header lines precede their ratings.

		The file is expected to contain header lines of the form ``<movie_id>:``,
		each followed by the ``user_id,rating,date`` lines belonging to that movie.
		"""

		# NOTE(review): a class with this same name is also defined earlier in
		# this file; confirm which of the two definitions is meant to be kept.

		@staticmethod
		def load_ratings(ratings_path, ratings_col_names):
			"""Read a ratings file and return it as one flat ratings table.

			Args:
				ratings_path: Path of the ratings file to read.
				ratings_col_names: Only checked for truthiness; an empty value
					makes the function return ``None``. The column names that
					are actually used are fixed (see Returns).

			Returns:
				``None`` when ``ratings_path`` is not an existing file or
				``ratings_col_names`` is empty. Otherwise a DataFrame with the
				columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``
				(in that order), restricted to the users selected by
				``get_filtered_netflix_users``. Rows keep the index labels that
				``read_csv`` assigned to them.
			"""
			if not os.path.isfile(ratings_path) or not ratings_col_names:
				return None

			ratings_raw = pd.read_csv(
				ratings_path,
				header=None,
				names=['user_id', 'rating', 'timestamp'],
				usecols=[0, 1, 2],
			)
			ratings_raw['rating'] = ratings_raw['rating'].astype(float)

			# A header line ("<movie_id>:") has no rating field, so it is read
			# as a row whose rating is NaN and whose first field holds the label.
			is_header = ratings_raw['rating'].isna()
			movie_ids = ratings_raw.loc[is_header, 'user_id'].map(
				lambda label: int(str(label).rstrip(':'))
			)

			# Carry every header's movie id forward over the rating rows below
			# it. Unlike slicing between consecutive header positions, this
			# cannot lose the last rating row of a movie.
			item_ids = movie_ids.reindex(ratings_raw.index).ffill()

			# Drop the header rows themselves and any row that appears before
			# the first header (such rows belong to no movie).
			keep = ~is_header & item_ids.notna()
			ratings = ratings_raw.loc[keep].copy()
			ratings['item_id'] = item_ids[keep].astype(int)

			# Release the raw frame before the remaining column conversions to
			# keep peak memory down.
			del ratings_raw, item_ids

			# Use the same column order as the MovieLens dataset for ease of use.
			ratings = ratings.reindex(columns=['user_id', 'item_id', 'rating', 'timestamp'])

			# Convert the string timestamps into datetime type. The deprecated
			# ``infer_datetime_format`` flag is intentionally not passed.
			ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

			# Convert the string user ids into int.
			ratings['user_id'] = ratings['user_id'].astype(int)

			# Down-sample the users to bound memory and run time. Remove these
			# two lines if there is no memory or time constraint.
			netflix_users = NetflixDataset.get_filtered_netflix_users(ratings)
			ratings = ratings.loc[ratings['user_id'].isin(netflix_users)]

			return ratings

		@staticmethod
		def get_filtered_netflix_users(ratings, min_ratings=40):
			"""Select a reduced set of active user ids from a ratings table.

			Users are ordered by their number of ratings (descending). Those
			with more than ``min_ratings`` ratings are kept, and of those only
			the first user per distinct mean rating and then the first user per
			distinct rating count survive.

			Args:
				ratings: DataFrame with at least ``user_id`` and ``rating`` columns.
				min_ratings: A user must have strictly more ratings than this
					to be considered.

			Returns:
				Array of the selected ``user_id`` values.
			"""
			per_user = ratings.groupby('user_id')['rating'].agg(['mean', 'count'])
			per_user.columns = ['mean_rating', 'No_of_ratings']
			per_user = per_user.sort_values(by=['No_of_ratings'], ascending=False)

			active = per_user.loc[per_user['No_of_ratings'] > min_ratings]
			sampled = active.drop_duplicates('mean_rating').drop_duplicates('No_of_ratings')
			return sampled.index.values