@sbalnojan
Created June 20, 2019 19:47
import os
import random


def convert_path(fname):
    """Return the path of the partitioned output directory for a data file."""
    basename, _ = os.path.splitext(fname)
    out_dir = basename + '_partitioned'
    return out_dir


def random_split_file(fpath):
    """Shuffle the lines of the file at fpath and split them into train/test
    files (named via FILENAMES) in the same directory."""
    root = os.path.dirname(fpath)
    output_paths = [
        os.path.join(root, FILENAMES['train']),
        os.path.join(root, FILENAMES['test']),
    ]
    if all(os.path.exists(path) for path in output_paths):
        print("Found files indicating that the input data has already been "
              "shuffled and split; not doing it again.")
        print("These files are: %s" % ", ".join(output_paths))
        return

    print('Shuffling and splitting train/test file. This may take a while.')
    train_file, test_file = output_paths

    print('Reading data from file: ', fpath)
    with open(fpath, "rt") as in_tf:
        lines = in_tf.readlines()

    # The first few lines are comments
    lines = lines[4:]

    print('Shuffling data')
    random.shuffle(lines)
    split_len = int(len(lines) * TRAIN_FRACTION)

    print('Splitting into train and test files')
    with open(train_file, "wt") as out_tf_train:
        for line in lines[:split_len]:
            out_tf_train.write(line)

    with open(test_file, "wt") as out_tf_test:
        for line in lines[split_len:]:
            out_tf_test.write(line)

DATA_PATH = "data/example_1/example.txt"
DATA_DIR = "data/example_1"
CONFIG_PATH = "config_1.py"
FILENAMES = {
'train': 'train.txt',
'test': 'test.txt',
}
TRAIN_FRACTION = 0.75
# ----------------------------------------------------------------------------------------------------------------------
#
random_split_file(DATA_PATH)
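
A minimal setup sketch for trying the script end to end: it creates a dummy input file at DATA_PATH before the split is run. The four skipped header lines mirror the lines[4:] slice in random_split_file; the record format itself is purely illustrative.

import os

# Create a small dummy input: four header lines followed by 100 data lines.
os.makedirs("data/example_1", exist_ok=True)
with open("data/example_1/example.txt", "wt") as f:
    for i in range(4):
        f.write("# header line %d\n" % (i + 1))
    for i in range(100):
        f.write("record_%03d\n" % i)

# Running the script afterwards writes data/example_1/train.txt (75 lines)
# and data/example_1/test.txt (25 lines), given TRAIN_FRACTION = 0.75.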