Last active
June 29, 2017 22:46
-
-
Save robinkraft/18420a3fc8f2032811ea9770b2d1ce0c to your computer and use it in GitHub Desktop.
Script to process dogs vs. cats data from Kaggle for use with VGG ImageNet submission. See https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import random | |
import glob | |
import shutil | |
def get_dc_paths(path):
    """Given a path filled with dog & cat files, make separate lists of both.

    Files are classified by their file name only, never by the directories
    leading to them. Matching against the full path would put every file in
    both lists whenever the data lives under a directory such as
    ``~/dogscats/train``.

    Args:
        path: directory whose direct children are the dog and cat files.

    Returns:
        A ``(dogs, cats)`` tuple of lists holding the paths produced by
        ``glob`` (in whatever order ``glob`` yields them). A file whose name
        contains both 'dog' and 'cat' lands in both lists; a file whose name
        contains neither is left out.
    """
    dogs = []
    cats = []
    for f in glob.glob('{}/*'.format(path)):
        # look at the file name alone so parent directory names cannot match
        name = os.path.basename(f)
        if 'cat' in name:
            cats.append(f)
        if 'dog' in name:
            dogs.append(f)
    return dogs, cats
def copy_files(raw_path, clean_path, files, animal, train_or_valid, sample=False):
    """Copy files from the raw path into your desired location.

    The destination directory is ``<clean_path>/<train_or_valid>/<animal>/``,
    or ``<clean_path>/sample/<train_or_valid>/<animal>/`` when ``sample`` is
    true. If that directory already exists it is deleted and recreated, so
    each call starts from an empty directory.

    Args:
        raw_path: location of raw data as downloaded
        clean_path: desired output location
        files: list of files. Each entry is joined onto ``raw_path``; an
            absolute entry is therefore used as given. If the joined path
            does not exist but the entry itself does (e.g. a relative path
            that already includes ``raw_path``), the entry is used as is.
        animal: specify 'dog' or 'cat' - used for directory names
        train_or_valid: specify 'train' or 'valid' - used for directory names
        sample: boolean (True or False) - used for prepping sample data directory
    """
    if sample:
        # sample directory setup - just add a 'sample' subdirectory
        tail = '{}/{}/{}/'.format('sample', train_or_valid, animal)
    else:
        # normal setup
        tail = '{}/{}/'.format(train_or_valid, animal)
    animal_path = os.path.join(clean_path, tail)

    if os.path.isdir(animal_path):
        # a previous run may have left files behind - delete all the copied
        # files and start over
        print("Deleting previously copied '{}' files".format(animal))
        shutil.rmtree(animal_path)

    # create output directories and any parent directories
    os.makedirs(animal_path)

    # do the copy, keeping track of number of files for basic logging
    n = 0
    for fname in files:
        src = os.path.join(raw_path, fname)
        if not os.path.exists(src) and os.path.exists(fname):
            # fname already carried the raw_path prefix (relative glob
            # result); joining it again produced a path that does not exist
            src = fname
        shutil.copy(src, animal_path)
        n += 1

    print('Copied {} {} files'.format(n, animal))
    return
def process_animal(raw, clean, animal, paths, valid_size, sample_size):
    """Shuffle one animal's files and copy them into train/valid/sample dirs.

    Args:
        raw: location of raw data, passed through to ``copy_files``
        clean: output location, passed through to ``copy_files``
        animal: directory name for this class (e.g. 'dogs' or 'cats')
        paths: list of file paths for this animal; not modified
        valid_size: number of files to put in the validation set
        sample_size: number of files to put in the sample training set; the
            sample validation set gets ``int(sample_size * 0.2)`` files
    """
    # randomize the order on a copy so the caller's list is left untouched
    paths = list(paths)
    random.shuffle(paths)

    # validation data is the first N paths
    valid = paths[:valid_size]

    # training data is everything else
    train = paths[valid_size:]

    # do the copy for training and validation
    copy_files(raw, clean, train, animal, 'train')
    copy_files(raw, clean, valid, animal, 'valid')

    # do the same thing for sample training data, making a validation
    # data set 20% the size of the training sample size
    print('Sampling {}'.format(animal))
    sample_train = paths[:sample_size]
    n_sample_valid = int(sample_size * 0.2)
    # paths[-0:] is the whole list, so a zero-sized sample needs an explicit
    # empty list rather than a negative slice
    sample_valid = paths[-n_sample_valid:] if n_sample_valid else []

    copy_files(raw, clean, sample_train, animal, 'train', sample=True)
    copy_files(raw, clean, sample_valid, animal, 'valid', sample=True)
    return
def process_test(raw, clean, paths):
    """Copy test data into the right place for use with the VGG model.

    The whole ``<raw>/test/`` directory tree is copied to
    ``<clean>/test/mix``. Any existing ``<clean>/test/mix`` directory is
    removed first, because ``shutil.copytree`` needs a destination that does
    not exist yet.

    Args:
        raw: location of raw data as downloaded (must contain ``test/``)
        clean: desired output location
        paths: not used by this function; kept so existing callers still work
    """
    print('Copying test data')
    raw_path = os.path.join(raw, 'test/')
    clean_path = os.path.join(clean, 'test/mix')

    if os.path.isdir(clean_path):
        shutil.rmtree(clean_path)

    shutil.copytree(raw_path, clean_path)
    return
def main(raw, clean, valid_size=1000, sample_size=100):
    """Build the train/valid/sample/test directory layout from the raw data.

    Reads the training files from ``<raw>/train``, splits them into dogs and
    cats, copies each class into its train/valid and sample directories
    under ``clean``, then copies the test data.

    Args:
        raw: location of raw data as downloaded; may start with ``~``
        clean: desired output location; may start with ``~``
        valid_size: number of files per animal to hold out for validation
        sample_size: number of files per animal in the sample training set
    """
    # in case your path contains ~/ - and make both paths absolute: the file
    # lists come from glob and already carry the raw prefix, which the copy
    # step joins onto raw again; that only works when the paths are absolute
    raw = os.path.abspath(os.path.expanduser(raw))
    clean = os.path.abspath(os.path.expanduser(clean))

    test_path = os.path.join(clean, 'test')
    training_path = os.path.join(raw, 'train')

    dog_paths, cat_paths = get_dc_paths(training_path)

    process_animal(raw, clean, 'dogs', dog_paths, valid_size, sample_size)
    process_animal(raw, clean, 'cats', cat_paths, valid_size, sample_size)

    process_test(raw, clean, test_path)

    print('All done!')
    return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment