# sbalnojan/biggraph_gist_1.py

## biggraph_gist_1.py
import os
import random

def convert_path(fname):
    """Return the partition directory name derived from ``fname``.

    The last file extension (if any) is dropped and ``'_partitioned'`` is
    appended to what remains; nothing is created on disk.
    """
    stem = os.path.splitext(fname)[0]
    return stem + '_partitioned'

def random_split_file(fpath, header_lines=4):
    """Shuffle the lines of ``fpath`` and split them into train/test files.

    The two output files are written into the same directory as ``fpath``
    under the names ``FILENAMES['train']`` and ``FILENAMES['test']``. If both
    files already exist, a notice is printed and nothing is read or written.

    Args:
        fpath: Path of the input text file, one record per line.
        header_lines: Number of leading lines to drop before shuffling.
            Defaults to 4, the value that used to be hard-coded here.

    Returns:
        None. The result is the two files written to disk.
    """
    root = os.path.dirname(fpath)
    train_file = os.path.join(root, FILENAMES['train'])
    test_file = os.path.join(root, FILENAMES['test'])
    output_paths = [train_file, test_file]

    if all(os.path.exists(path) for path in output_paths):
        print("Found some files that indicate that the input data "
              "has already been shuffled and split, not doing it again.")
        print("These files are: %s" % ", ".join(output_paths))
        return

    print('Shuffling and splitting train/test file. This may take a while.')

    print('Reading data from file: ', fpath)
    with open(fpath, "rt") as in_tf:
        lines = in_tf.readlines()

    # The first few lines are comments
    lines = lines[header_lines:]

    # A final line without a trailing newline would be glued to the record
    # written after it once shuffling moves it away from the end.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    print('Shuffling data')
    random.shuffle(lines)
    split_len = int(len(lines) * TRAIN_FRACTION)

    print('Splitting to train and test files')
    with open(train_file, "wt") as out_tf_train:
        out_tf_train.writelines(lines[:split_len])

    with open(test_file, "wt") as out_tf_test:
        out_tf_test.writelines(lines[split_len:])

# Locations of the example data set and of the config file.
# DATA_DIR and CONFIG_PATH are not referenced by the code visible in this file.
DATA_DIR = "data/example_1"
DATA_PATH = "data/example_1/example.txt"
CONFIG_PATH = "config_1.py"

# Names of the two output files, created next to the input file.
FILENAMES = dict(train='train.txt', test='test.txt')

# Share of the shuffled lines written to the train file; the rest go to test.
TRAIN_FRACTION = 0.75

# ----------------------------------------------------------------------------------------------------------------------
#

# Runs whenever this file is executed or imported.
random_split_file(DATA_PATH)
	import os
	import random

	def convert_path(fname):
	    """Return the partition directory name derived from ``fname``.

	    The last file extension (if any) is dropped and ``'_partitioned'`` is
	    appended to what remains; nothing is created on disk.
	    """
	    basename, _ = os.path.splitext(fname)
	    return basename + '_partitioned'

	def random_split_file(fpath, header_lines=4):
	    """Shuffle the lines of ``fpath`` and split them into train/test files.

	    The two output files are written into the same directory as ``fpath``
	    under the names ``FILENAMES['train']`` and ``FILENAMES['test']``. If
	    both files already exist, a notice is printed and nothing is read or
	    written.

	    Args:
	        fpath: Path of the input text file, one record per line.
	        header_lines: Number of leading lines to drop before shuffling.
	            Defaults to 4, the value that used to be hard-coded here.

	    Returns:
	        None. The result is the two files written to disk.
	    """
	    root = os.path.dirname(fpath)
	    train_file = os.path.join(root, FILENAMES['train'])
	    test_file = os.path.join(root, FILENAMES['test'])
	    output_paths = [train_file, test_file]

	    if all(os.path.exists(path) for path in output_paths):
	        print("Found some files that indicate that the input data "
	              "has already been shuffled and split, not doing it again.")
	        print("These files are: %s" % ", ".join(output_paths))
	        return

	    print('Shuffling and splitting train/test file. This may take a while.')

	    print('Reading data from file: ', fpath)
	    with open(fpath, "rt") as in_tf:
	        lines = in_tf.readlines()

	    # The first few lines are comments
	    lines = lines[header_lines:]

	    # A final line without a trailing newline would be glued to the record
	    # written after it once shuffling moves it away from the end.
	    if lines and not lines[-1].endswith("\n"):
	        lines[-1] += "\n"

	    print('Shuffling data')
	    random.shuffle(lines)
	    split_len = int(len(lines) * TRAIN_FRACTION)

	    print('Splitting to train and test files')
	    with open(train_file, "wt") as out_tf_train:
	        out_tf_train.writelines(lines[:split_len])

	    with open(test_file, "wt") as out_tf_test:
	        out_tf_test.writelines(lines[split_len:])

	# Locations of the example data set and of the config file.
	# DATA_DIR and CONFIG_PATH are not referenced by the code visible in this file.
	DATA_DIR = "data/example_1"
	DATA_PATH = "data/example_1/example.txt"
	CONFIG_PATH = "config_1.py"

	# Names of the two output files, created next to the input file.
	FILENAMES = dict(train='train.txt', test='test.txt')

	# Share of the shuffled lines written to the train file; the rest go to test.
	TRAIN_FRACTION = 0.75

	# ----------------------------------------------------------------------------------------------------------------------
	#

	# Runs whenever this file is executed or imported.
	random_split_file(DATA_PATH)