Created
June 20, 2019 19:47
-
-
Save sbalnojan/2c0effd09e74e8d9f8c6273e82a5dd98 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import random
def convert_path(fname):
    """Return the partition output directory name derived from *fname*.

    The last file extension (if any) is removed and ``'_partitioned'`` is
    appended to what remains. Only the string is computed; nothing is
    created on disk.

    Args:
        fname: Path of a data file, e.g. ``"data/train.txt"``.

    Returns:
        The derived path string, e.g. ``"data/train_partitioned"``.
    """
    basename, _ = os.path.splitext(fname)
    return basename + '_partitioned'
def random_split_file(fpath, filenames=None, train_fraction=None,
                      num_header_lines=4):
    """Shuffle the lines of *fpath* and split them into train/test files.

    The two output files are written next to *fpath* (same directory). If
    both already exist, the function prints a notice and returns without
    touching anything, so an earlier split is never overwritten.

    Args:
        fpath: Path of the input text file.
        filenames: Mapping with ``'train'`` and ``'test'`` keys giving the
            output file names. Defaults to the module-level ``FILENAMES``.
        train_fraction: Fraction of the data lines written to the train
            file; the remainder goes to the test file. Defaults to the
            module-level ``TRAIN_FRACTION``.
        num_header_lines: Number of leading lines dropped before
            shuffling. The default of 4 keeps the original behavior, which
            treats the first few lines of the input as comments.

    Returns:
        None.
    """
    # Resolve the defaults at call time so the module-level constants may be
    # defined after this function in the file.
    if filenames is None:
        filenames = FILENAMES
    if train_fraction is None:
        train_fraction = TRAIN_FRACTION

    root = os.path.dirname(fpath)
    train_file = os.path.join(root, filenames['train'])
    test_file = os.path.join(root, filenames['test'])
    output_paths = [train_file, test_file]

    if all(os.path.exists(path) for path in output_paths):
        print("Found some files that indicate that the input data "
              "has already been shuffled and split, not doing it again.")
        print("These files are: %s" % ", ".join(output_paths))
        return

    print('Shuffling and splitting train/test file. This may take a while.')

    print('Reading data from file: ', fpath)
    with open(fpath, "rt") as in_tf:
        lines = in_tf.readlines()

    # The first few lines are comments
    lines = lines[num_header_lines:]

    print('Shuffling data')
    random.shuffle(lines)
    # int() truncates, so any rounding remainder ends up in the test file.
    split_len = int(len(lines) * train_fraction)

    print('Splitting to train and test files')
    with open(train_file, "wt") as out_tf_train:
        out_tf_train.writelines(lines[:split_len])

    with open(test_file, "wt") as out_tf_test:
        out_tf_test.writelines(lines[split_len:])
# Input file that gets shuffled and split by the call at the bottom.
DATA_PATH = "data/example_1/example.txt"
# Directory holding DATA_PATH; not referenced by the code in this section.
DATA_DIR = "data/example_1"
# Not referenced by the code in this section.
# NOTE(review): presumably a config file used by a later step; confirm.
CONFIG_PATH = "config_1.py"
# Output file names, created in the same directory as the input file.
FILENAMES = {
    'train': 'train.txt',
    'test': 'test.txt',
}
# Fraction of the data lines written to the train file.
TRAIN_FRACTION = 0.75
# ----------------------------------------------------------------------------------------------------------------------
#
random_split_file(DATA_PATH)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment