# robinkraft/prep_data.py

## prep_data.py
import os
import random
import glob
import shutil


def get_dc_paths(path):
    """Given a path filled with dog & cat files, make separate lists of both.

    Classification looks only at each entry's base name. Testing the whole
    path would put every file in both lists as soon as a parent directory
    contains 'cat' or 'dog' (for example a 'dogscats/' data directory).

    Args:
        path: directory holding the raw dog & cat files

    Returns:
        A (dogs, cats) tuple of lists. Each item is the entry's path as
        produced by glob (i.e. prefixed with ``path``), in sorted order.
    """
    # sorted() makes the result independent of directory listing order
    entries = sorted(glob.glob(os.path.join(path, '*')))

    # the two checks are independent on purpose: a name containing both
    # words is still reported in both lists
    cats = [f for f in entries if 'cat' in os.path.basename(f)]
    dogs = [f for f in entries if 'dog' in os.path.basename(f)]

    return dogs, cats


def copy_files(raw_path, clean_path, files, animal, train_or_valid, sample=False):
    """Copy files from the raw path into your desired location.

    The destination directory is ``<clean_path>/<train_or_valid>/<animal>``,
    or ``<clean_path>/sample/<train_or_valid>/<animal>`` when ``sample`` is
    True. If that directory already exists it is deleted and recreated, so
    each call starts from an empty directory.

    Args:
        raw_path: location of raw data as downloaded
        clean_path: desired output location
        files: list of files - either names relative to raw_path, or paths
            that are already usable as given (e.g. glob results)
        animal: specify 'dog' or 'cat' - used for directory names
        train_or_valid: specify 'train' or 'valid' - used for directory names
        sample: boolean (True or False) - used for prepping sample data directory
    """
    # sample directory setup just adds a leading 'sample' subdirectory
    parts = [train_or_valid, animal]
    if sample:
        parts.insert(0, 'sample')
    animal_path = os.path.join(clean_path, *parts)

    if os.path.isdir(animal_path):
        # this may not work the first time - delete all the copied files
        # and start over
        print('Deleting previously copied \'{}\' files'.format(animal))
        shutil.rmtree(animal_path)

    # create output directories and any parent directories
    os.makedirs(animal_path)

    n = 0

    # do the copy, keeping track of number of files for basic logging
    for fname in files:
        src = os.path.join(raw_path, fname)
        if not os.path.exists(src):
            # fname already carries its directory prefix (relative glob
            # result); joining raw_path again would double the prefix
            src = fname
        shutil.copy(src, animal_path)

        n += 1

    print('Copied {} {} files'.format(n, animal))

    return


def process_animal(raw, clean, animal, paths, valid_size, sample_size):
    """Randomly split one animal's files into train/valid sets and copy them.

    Four directories are written through copy_files: the full 'train' and
    'valid' sets, plus smaller 'sample' versions of each. The sample
    validation set is 20% the size of the sample training set.

    Args:
        raw: location of raw data as downloaded
        clean: desired output location
        animal: directory name for this animal (e.g. 'dogs' or 'cats')
        paths: list of files for this animal; not modified
        valid_size: number of files to hold out for validation
        sample_size: number of files in the sample training set
    """
    # randomize the order on a copy so the caller's list is left untouched
    shuffled = list(paths)
    random.shuffle(shuffled)

    # validation data is the first N paths
    valid = shuffled[:valid_size]

    # training data is everything else
    train = shuffled[valid_size:]

    # do the copy for training and validation
    copy_files(raw, clean, train, animal, 'train')
    copy_files(raw, clean, valid, animal, 'valid')

    # do the same thing for sample data. Slicing from the front of each
    # split keeps the two samples disjoint, and a sample validation size of
    # 0 yields an empty list (a negative-zero slice would select every path)
    print('Sampling {}'.format(animal))
    sample_valid_size = int(sample_size * 0.2)
    copy_files(raw, clean, train[:sample_size], animal, 'train', sample=True)
    copy_files(raw, clean, valid[:sample_valid_size], animal, 'valid', sample=True)

    return


def process_test(raw, clean, paths):
    """Copy test data into the right place for use with the VGG model.

    The whole ``<raw>/test`` tree is copied to ``<clean>/test/mix``. Any
    previous copy at that destination is removed first.

    Args:
        raw: location of raw data as downloaded
        clean: desired output location
        paths: unused; kept so existing callers keep working
    """
    print('Copying test data')
    raw_path = os.path.join(raw, 'test')
    clean_path = os.path.join(clean, 'test', 'mix')

    # copytree refuses to write into an existing directory, so clear it
    if os.path.isdir(clean_path):
        shutil.rmtree(clean_path)

    shutil.copytree(raw_path, clean_path)

    return


def main(raw, clean, valid_size=1000, sample_size=100):
    """Build the train/valid/sample/test directory layout from raw data.

    Args:
        raw: directory containing the downloaded 'train' and 'test' folders
        clean: output directory for the organized copy
        valid_size: number of files per animal held out for validation
        sample_size: number of files per animal in the sample training set
    """
    # in case your path contains ~/ ; abspath matters because the globbed
    # training paths inherit this prefix and copy_files joins them onto
    # raw again - the join only drops raw when the paths are absolute
    raw = os.path.abspath(os.path.expanduser(raw))
    clean = os.path.abspath(os.path.expanduser(clean))

    test_path = os.path.join(clean, 'test')

    training_path = os.path.join(raw, 'train')

    dog_paths, cat_paths = get_dc_paths(training_path)

    process_animal(raw, clean, 'dogs', dog_paths, valid_size, sample_size)
    process_animal(raw, clean, 'cats', cat_paths, valid_size, sample_size)
    process_test(raw, clean, test_path)

    print('All done!')

    return
	import os
	import random
	import glob
	import shutil


	def get_dc_paths(path):
	    """Given a path filled with dog & cat files, make separate lists of both.

	    Classification looks only at each entry's base name. Testing the whole
	    path would put every file in both lists as soon as a parent directory
	    contains 'cat' or 'dog' (for example a 'dogscats/' data directory).

	    Args:
	        path: directory holding the raw dog & cat files

	    Returns:
	        A (dogs, cats) tuple of lists. Each item is the entry's path as
	        produced by glob (i.e. prefixed with ``path``), in sorted order.
	    """
	    # sorted() makes the result independent of directory listing order
	    entries = sorted(glob.glob(os.path.join(path, '*')))

	    # the two checks are independent on purpose: a name containing both
	    # words is still reported in both lists
	    cats = [f for f in entries if 'cat' in os.path.basename(f)]
	    dogs = [f for f in entries if 'dog' in os.path.basename(f)]

	    return dogs, cats


	def copy_files(raw_path, clean_path, files, animal, train_or_valid, sample=False):
	    """Copy files from the raw path into your desired location.

	    The destination directory is ``<clean_path>/<train_or_valid>/<animal>``,
	    or ``<clean_path>/sample/<train_or_valid>/<animal>`` when ``sample`` is
	    True. If that directory already exists it is deleted and recreated, so
	    each call starts from an empty directory.

	    Args:
	        raw_path: location of raw data as downloaded
	        clean_path: desired output location
	        files: list of files - either names relative to raw_path, or paths
	            that are already usable as given (e.g. glob results)
	        animal: specify 'dog' or 'cat' - used for directory names
	        train_or_valid: specify 'train' or 'valid' - used for directory names
	        sample: boolean (True or False) - used for prepping sample data directory
	    """
	    # sample directory setup just adds a leading 'sample' subdirectory
	    parts = [train_or_valid, animal]
	    if sample:
	        parts.insert(0, 'sample')
	    animal_path = os.path.join(clean_path, *parts)

	    if os.path.isdir(animal_path):
	        # this may not work the first time - delete all the copied files
	        # and start over
	        print('Deleting previously copied \'{}\' files'.format(animal))
	        shutil.rmtree(animal_path)

	    # create output directories and any parent directories
	    os.makedirs(animal_path)

	    n = 0

	    # do the copy, keeping track of number of files for basic logging
	    for fname in files:
	        src = os.path.join(raw_path, fname)
	        if not os.path.exists(src):
	            # fname already carries its directory prefix (relative glob
	            # result); joining raw_path again would double the prefix
	            src = fname
	        shutil.copy(src, animal_path)

	        n += 1

	    print('Copied {} {} files'.format(n, animal))

	    return


	def process_animal(raw, clean, animal, paths, valid_size, sample_size):
	    """Randomly split one animal's files into train/valid sets and copy them.

	    Four directories are written through copy_files: the full 'train' and
	    'valid' sets, plus smaller 'sample' versions of each. The sample
	    validation set is 20% the size of the sample training set.

	    Args:
	        raw: location of raw data as downloaded
	        clean: desired output location
	        animal: directory name for this animal (e.g. 'dogs' or 'cats')
	        paths: list of files for this animal; not modified
	        valid_size: number of files to hold out for validation
	        sample_size: number of files in the sample training set
	    """
	    # randomize the order on a copy so the caller's list is left untouched
	    shuffled = list(paths)
	    random.shuffle(shuffled)

	    # validation data is the first N paths
	    valid = shuffled[:valid_size]

	    # training data is everything else
	    train = shuffled[valid_size:]

	    # do the copy for training and validation
	    copy_files(raw, clean, train, animal, 'train')
	    copy_files(raw, clean, valid, animal, 'valid')

	    # do the same thing for sample data. Slicing from the front of each
	    # split keeps the two samples disjoint, and a sample validation size of
	    # 0 yields an empty list (a negative-zero slice would select every path)
	    print('Sampling {}'.format(animal))
	    sample_valid_size = int(sample_size * 0.2)
	    copy_files(raw, clean, train[:sample_size], animal, 'train', sample=True)
	    copy_files(raw, clean, valid[:sample_valid_size], animal, 'valid', sample=True)

	    return


	def process_test(raw, clean, paths):
	    """Copy test data into the right place for use with the VGG model.

	    The whole ``<raw>/test`` tree is copied to ``<clean>/test/mix``. Any
	    previous copy at that destination is removed first.

	    Args:
	        raw: location of raw data as downloaded
	        clean: desired output location
	        paths: unused; kept so existing callers keep working
	    """
	    print('Copying test data')
	    raw_path = os.path.join(raw, 'test')
	    clean_path = os.path.join(clean, 'test', 'mix')

	    # copytree refuses to write into an existing directory, so clear it
	    if os.path.isdir(clean_path):
	        shutil.rmtree(clean_path)

	    shutil.copytree(raw_path, clean_path)

	    return


	def main(raw, clean, valid_size=1000, sample_size=100):
	    """Build the train/valid/sample/test directory layout from raw data.

	    Args:
	        raw: directory containing the downloaded 'train' and 'test' folders
	        clean: output directory for the organized copy
	        valid_size: number of files per animal held out for validation
	        sample_size: number of files per animal in the sample training set
	    """
	    # in case your path contains ~/ ; abspath matters because the globbed
	    # training paths inherit this prefix and copy_files joins them onto
	    # raw again - the join only drops raw when the paths are absolute
	    raw = os.path.abspath(os.path.expanduser(raw))
	    clean = os.path.abspath(os.path.expanduser(clean))

	    test_path = os.path.join(clean, 'test')

	    training_path = os.path.join(raw, 'train')

	    dog_paths, cat_paths = get_dc_paths(training_path)

	    process_animal(raw, clean, 'dogs', dog_paths, valid_size, sample_size)
	    process_animal(raw, clean, 'cats', cat_paths, valid_size, sample_size)
	    process_test(raw, clean, test_path)

	    print('All done!')

	    return