Created
March 20, 2019 08:13
-
-
Save toshihiroryuu/a7c4f88b30b954f00dfa22a087fb1f10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Randomly split a line-oriented dataset file (e.g. a CSV) into train,
# validation and test files written to the current working directory.
#
# Each physical line of DATASET_FILE is treated as one row, so a CSV whose
# quoted fields contain embedded newlines is not supported by this script.
import random
import math
import os

PATH = os.getcwd()
print("This is where the train, val and test files will be at {}".format(PATH))

DATASET_FILE = '/home/Pictures/udacity_driving_datasets/labels_trainval.csv'
FILE_TRAIN = os.path.join(PATH, 'train.csv')
FILE_VALID = os.path.join(PATH, 'validation.csv')
FILE_TESTS = os.path.join(PATH, 'test.csv')

# Set to true if you want to copy first line from main
# file into each split (like CSV header)
IS_CSV = True

# The three percentages must add up to 100; this is verified just below.
PERCENT_TRAIN = 60
PERCENT_VALID = 20
PERCENT_TESTS = 20

percent_total = PERCENT_TRAIN + PERCENT_VALID + PERCENT_TESTS
if abs(percent_total - 100) > 1e-9:
    raise ValueError(
        "PERCENT_TRAIN + PERCENT_VALID + PERCENT_TESTS must be 100, got {}"
        .format(percent_total)
    )

# Read every row up front; the context manager closes the input promptly.
with open(DATASET_FILE, 'r') as dataset_file:
    data = dataset_file.readlines()

# The last line of a file may lack a trailing newline. After shuffling it can
# land in the middle of an output file, where it would be glued to the next
# row, so terminate it explicitly.
if data and not data[-1].endswith('\n'):
    data[-1] += '\n'

# Peel off the header (if requested) so it is not shuffled with the rows.
# An empty input file simply has no header and produces empty splits.
header = None
if IS_CSV and data:
    header = data[0]
    data = data[1:]

num_of_data = len(data)
num_train = int((PERCENT_TRAIN/100.0)*num_of_data)
num_valid = int((PERCENT_VALID/100.0)*num_of_data)
# The test split takes whatever is left, so rows lost to int() truncation in
# the two counts above are not silently dropped from the dataset.
num_tests = num_of_data - num_train - num_valid

print("No of rows in Dataset is {}".format(num_of_data))
print("No of rows in Train Dataset is {}".format(num_train))
print("No of rows in Validation Dataset is {}".format(num_valid))
print("No of rows in Test Dataset is {}".format(num_tests))

data_fractions = [num_train, num_valid, num_tests]

# One in-place shuffle followed by slicing yields a uniformly random
# partition without the quadratic cost of repeatedly popping from a list.
random.shuffle(data)
valid_end = num_train + num_valid
split_data = [data[:num_train], data[num_train:valid_end], data[valid_end:]]

# Write each split; `with` guarantees the file is closed even if a write fails.
for out_path, rows in zip((FILE_TRAIN, FILE_VALID, FILE_TESTS), split_data):
    with open(out_path, 'w') as out_file:
        if header is not None:
            out_file.write(header)
        out_file.writelines(rows)
Author
toshihiroryuu
commented
Mar 20, 2019
- Replace DATASET_FILE with your file path
- train.csv, validation.csv, test.csv will be created at the current working directory
- IS_CSV=True will copy the header to all the splits
- IS_CSV=False copies no header; the first line of the file is treated as an ordinary data row
- Specify the split percentages with PERCENT_TRAIN, PERCENT_VALID and PERCENT_TESTS. Make sure the sum is 100
- Check the current working directory for the newly created csv files.
- Happy coding
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment