Skip to content

Instantly share code, notes, and snippets.

@katipogluMustafa
Last active November 15, 2020 22:11
Show Gist options
  • Save katipogluMustafa/45879c6314798a5270fb8d881f4f767f to your computer and use it in GitHub Desktop.
Save katipogluMustafa/45879c6314798a5270fb8d881f4f767f to your computer and use it in GitHub Desktop.
Loads Netflix Prize Dataset Movies
class NetflixDataset(Dataset):
    """Loader for the movies file of the Netflix Prize dataset.

    The file is read as a header-less CSV whose columns are named
    ``item_id``, ``year`` and ``title``. ``load_movies`` returns a cleaned
    frame indexed by ``item_id`` with the columns ``title`` and ``year``.
    """

    # Number of leading rows kept by ``load_movies``. The original code
    # hard-coded this limit to keep later processing fast.
    _MOVIES_ROW_LIMIT = 4499

    def __init__(self, movies_file_path):
        """Remember where the movies file lives and how its columns are named.

        Args:
            movies_file_path: Path (or anything ``pd.read_csv`` accepts) of
                the header-less movies CSV file.
        """
        self.movies_file_path = movies_file_path
        self.movies_column_names = ('item_id', 'year', 'title')

    def load_movies(self):
        """Read, clean and truncate the movies table.

        Returns:
            A DataFrame indexed by ``item_id`` with the columns ``title`` and
            ``year`` (integer, 0 where the year was missing or infinite),
            limited to the first ``_MOVIES_ROW_LIMIT`` rows.
        """
        movies = self.__read_movies_data_from_file()
        movies = self.__replace_invalid_years_with_zero(movies)
        movies = self.__convert_movies_year_format_to_int(movies)
        movies = self.__interchange_movies_title_and_year(movies)
        return self.__get_only_first_quarter_of_all_movies_for_performance(movies)

    def __read_movies_data_from_file(self):
        """Read the raw CSV file into a DataFrame indexed by ``item_id``."""
        return pd.read_csv(
            self.movies_file_path,
            encoding='ISO-8859-1',
            header=None,
            names=self.movies_column_names,
        ).set_index('item_id')

    @staticmethod
    def __replace_invalid_years_with_zero(movies):
        """Replace NaN and +/-infinity in the ``year`` column with 0."""
        # Assign the result back instead of calling ``replace(inplace=True)``
        # on the column selection: the chained in-place form does not update
        # the frame when pandas Copy-on-Write is active, which would leave
        # NaN values behind and make the later integer cast fail.
        movies['year'] = movies['year'].replace([np.inf, -np.inf, np.nan], 0)
        return movies

    @staticmethod
    def __convert_movies_year_format_to_int(movies):
        """Cast the ``year`` column to ``int``."""
        movies['year'] = movies['year'].astype(int)
        return movies

    @staticmethod
    def __interchange_movies_title_and_year(movies):
        """Return the frame with its columns ordered as ``title``, ``year``."""
        return movies.reindex(columns=['title', 'year'])

    @staticmethod
    def __get_only_first_quarter_of_all_movies_for_performance(movies):
        """Return only the first ``_MOVIES_ROW_LIMIT`` rows of the frame."""
        # ``iloc`` makes it explicit that the limit is positional, not a
        # label-based slice on the ``item_id`` index.
        return movies.iloc[:NetflixDataset._MOVIES_ROW_LIMIT]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment