Skip to content

Instantly share code, notes, and snippets.

@guimatheus92
Last active August 17, 2021 19:54
Show Gist options
  • Save guimatheus92/5bf038f94abe46056b79a0a3a640e1bd to your computer and use it in GitHub Desktop.
Save guimatheus92/5bf038f94abe46056b79a0a3a640e1bd to your computer and use it in GitHub Desktop.
Netflix Recommendation System: from the Netflix TXT files, we create a single combined data file
# From the Netflix TXT files, build one combined CSV file.
#
# Input rows come in two shapes: a header row that ends with a colon
# ("<movie id>:") and comma-separated data rows belonging to the most recent
# header. Every data row is written to the output prefixed with that movie id.
#
# The whole step is skipped when the combined file already exists.
output_path = r'kaggle/working/fullcombined_data.csv'
if not os.path.isfile(output_path):
    # Process start time
    start = datetime.now()
    # Candidate input files that are going to be combined into the full file
    files = [r'kaggle/input/netflix-prize-data/combined_data_1.txt',
             r'kaggle/input/netflix-prize-data/combined_data_2.txt',
             r'kaggle/input/netflix-prize-data/combined_data_3.txt',
             r'kaggle/input/netflix-prize-data/combined_data_4.txt']
    # Read only the inputs that are actually present: opening a missing one
    # would raise FileNotFoundError in the middle of the merge.
    existing_files = [file for file in files if os.path.isfile(file)]
    filecount = len(existing_files)
    # If there is any combined_data*.txt file in path
    if filecount > 0:
        # Write under a temporary name and rename at the end, so that an
        # interrupted run never leaves a truncated CSV at the final path
        # (the existence check above would otherwise accept it forever).
        # A leftover temporary file is simply overwritten by the next run.
        partial_path = output_path + '.part'
        # Movie id taken from the most recent "<movie id>:" header row
        movie_id = None
        with open(partial_path, mode='w') as dataset:
            # Loop over each available TXT file
            for file in existing_files:
                print("Reading the file {}...".format(file))
                with open(file) as f:
                    for row in f:
                        # Drop the end-of-line character and outer whitespace
                        row = row.strip()
                        # A blank line carries no data; writing it would
                        # produce a malformed "<movie id>," record.
                        if not row:
                            continue
                        # Header row: remember the movie id, write nothing
                        if row.endswith(':'):
                            movie_id = row.replace(':', '')
                            continue
                        # A data row before any header has no owner
                        if movie_id is None:
                            raise ValueError(
                                'Data row found before any movie id header '
                                'in {}: {!r}'.format(file, row))
                        # Data row: movie id first, then the original columns
                        dataset.write('{},{}\n'.format(movie_id, row))
                print("Finished.\n")
        # Publish the finished file under its final name
        os.replace(partial_path, output_path)
        # Print elapsed time
        print('Elapsed time to load all the files:', datetime.now() - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment