# guimatheus92/fullcombined_data.py

## fullcombined_data.py
# Build one flat CSV out of the combined_data_*.txt input files.
# The work is skipped entirely when the output CSV already exists.
#
# NOTE(review): no import of `os` or `datetime` is visible in this part of the
# file, so they are imported here to keep the script runnable on its own.
# Drop these two lines if an earlier part of the file already provides them.
import os
from datetime import datetime


def combine_rating_files(files, output_path):
    """Merge movie-grouped TXT files into a single CSV file.

    Every input file is read line by line.  A line ending with a colon is a
    header carrying the movie id for the lines that follow it; every other
    non-blank line is a comma-separated data row.  Each data row is written
    to the output prefixed with the movie id of the most recent header.

    The output is first written to ``output_path + '.tmp'`` and only moved to
    ``output_path`` once every input was processed, so a failed run never
    leaves a truncated CSV behind.

    Args:
        files: Iterable of paths of the TXT files to merge, in output order.
        output_path: Path of the CSV file to create (replaced if present).

    Returns:
        The number of data rows written to the CSV.

    Raises:
        ValueError: If a file contains a data row before its first header.
        OSError: If an input cannot be read or the output cannot be written.
    """
    tmp_path = output_path + '.tmp'
    written = 0
    try:
        with open(tmp_path, mode='w') as dataset:
            for path in files:
                print("Reading the file {}...".format(path))
                # Reset per file so a headerless file cannot silently inherit
                # the last movie id of the previous file.
                movie_id = None
                with open(path) as source:
                    for line in source:
                        line = line.strip()
                        if not line:
                            # A blank line would otherwise produce a row that
                            # holds nothing but the movie id.
                            continue
                        if line.endswith(':'):
                            # Header line: keep the id without the colon.
                            movie_id = line[:-1]
                            continue
                        if movie_id is None:
                            raise ValueError(
                                "Data row found before any movie id header "
                                "in {}".format(path))
                        # Same text as joining [movie_id] + line.split(',')
                        # with commas, without building a throwaway list.
                        dataset.write(movie_id + ',' + line + '\n')
                        written += 1
                print("Finished.\n")
        # Publish the finished file in one step.
        os.replace(tmp_path, output_path)
    finally:
        # After a successful os.replace the temp file no longer exists; it is
        # only still there when something failed on the way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return written


# If the output file does not exist yet, create it
if not os.path.isfile(r'kaggle/working/fullcombined_data.csv'):
    # Process start time
    start = datetime.now()

    # Paths of the files that are going to be combined into the full file
    files = [r'kaggle/input/netflix-prize-data/combined_data_1.txt',
             r'kaggle/input/netflix-prize-data/combined_data_2.txt',
             r'kaggle/input/netflix-prize-data/combined_data_3.txt',
             r'kaggle/input/netflix-prize-data/combined_data_4.txt']

    # Only the inputs that are actually present are merged, so a partially
    # available data set does not abort the run with FileNotFoundError.
    existing_files = [path for path in files if os.path.isfile(path)]
    filecount = len(existing_files)

    # If there is any combined_data*.txt file in path
    if filecount > 0:
        combine_rating_files(existing_files,
                             r'kaggle/working/fullcombined_data.csv')

        # Print elapsed time
        print('Elapsed time to load all the files:', datetime.now() - start)