Skip to content

Instantly share code, notes, and snippets.

@robinkraft
Last active June 29, 2017 22:46
Show Gist options
  • Save robinkraft/18420a3fc8f2032811ea9770b2d1ce0c to your computer and use it in GitHub Desktop.
Save robinkraft/18420a3fc8f2032811ea9770b2d1ce0c to your computer and use it in GitHub Desktop.
Script to process dogs vs. cats data from Kaggle for use with VGG ImageNet submission. See https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/
import os
import random
import glob
import shutil
def get_dc_paths(path):
    """Given a path filled with dog & cat files, make separate lists of both.

    Args:
        path: directory whose direct children are the image files

    Returns:
        A ``(dogs, cats)`` tuple of lists holding the matching paths exactly
        as returned by ``glob`` (i.e. prefixed with ``path``). Both lists are
        empty when ``path`` has no entries or does not exist.
    """
    cats = []
    dogs = []
    for f in glob.glob('{}/*'.format(path)):
        # Classify on the file name only: matching against the full path
        # would put every file in both lists as soon as a parent directory
        # contains 'cat' or 'dog' (e.g. '~/dogs-vs-cats/train').
        name = os.path.basename(f)
        if 'cat' in name:
            cats.append(f)
        if 'dog' in name:
            dogs.append(f)
    return dogs, cats
def copy_files(raw_path, clean_path, files, animal, train_or_valid, sample=False):
    """Copy files from the raw path into your desired location.

    The destination directory is ``<clean_path>/<train_or_valid>/<animal>/``,
    or ``<clean_path>/sample/<train_or_valid>/<animal>/`` when ``sample`` is
    true. If that directory already exists it is deleted first, so each call
    starts from an empty directory.

    Args:
        raw_path: location of raw data as downloaded
        clean_path: desired output location
        files: list of files - either names relative to ``raw_path`` or paths
            that are already usable as-is (such as ``glob`` results)
        animal: specify 'dog' or 'cat' - used for directory names
        train_or_valid: specify 'train' or 'valid' - used for directory names
        sample: boolean (True or False) - used for prepping sample data directory
    """
    if sample:
        # sample directory setup - just add a 'sample' subdirectory
        tail = '{}/{}/{}/'.format('sample', train_or_valid, animal)
    else:
        # normal setup
        tail = '{}/{}/'.format(train_or_valid, animal)
    animal_path = os.path.join(clean_path, tail)

    if os.path.isdir(animal_path):
        # leftovers from an earlier run - delete all the copied files
        # and start over
        print('Deleting previously copied \'{}\' files'.format(animal))
        shutil.rmtree(animal_path)

    # create output directories and any parent directories
    os.makedirs(animal_path)

    # do the copy, keeping track of number of files for basic logging
    n = 0
    for fname in files:
        path = os.path.join(raw_path, fname)
        # When raw_path is relative and fname already carries it (as glob
        # results do), the join above doubles the prefix and points at
        # nothing; in that case fname itself is the usable path.
        if not os.path.exists(path) and os.path.exists(fname):
            path = fname
        shutil.copy(path, animal_path)
        n += 1
    print('Copied {} {} files'.format(n, animal))
def process_animal(raw, clean, animal, paths, valid_size, sample_size):
    """Split one animal's files into train/valid sets and copy them out.

    The paths are shuffled, the first ``valid_size`` become the validation
    set and the remainder the training set. A second, smaller "sample" data
    set is then copied as well: its training files are the first
    ``sample_size`` shuffled paths and its validation files are the last
    ``int(sample_size * 0.2)`` shuffled paths.

    Args:
        raw: location of raw data as downloaded
        clean: desired output location
        animal: label used for the output directory names and log messages
        paths: list of file paths for this animal
        valid_size: number of files to put in the validation set
        sample_size: number of files to put in the sample training set
    """
    # randomize order to get started; shuffle a copy so the caller's list
    # is not reordered as a side effect
    paths = list(paths)
    random.shuffle(paths)

    # validation data is the first N paths
    valid = paths[:valid_size]

    # training data is everything else
    train = paths[valid_size:]

    # do the copy for training and validation
    copy_files(raw, clean, train, animal, 'train')
    copy_files(raw, clean, valid, animal, 'valid')

    # do the same thing for sample training data, making a validation
    # data set 20% the size of the training sample size
    print('Sampling {}'.format(animal))
    sample_train = paths[:sample_size]
    n_sample_valid = int(sample_size * 0.2)
    # paths[-0:] is the whole list, so a sample_size below 5 must be
    # special-cased to yield an empty validation sample instead
    sample_valid = paths[-n_sample_valid:] if n_sample_valid > 0 else []

    copy_files(raw, clean, sample_train, animal, 'train', sample=True)
    copy_files(raw, clean, sample_valid, animal, 'valid', sample=True)
def process_test(raw, clean, paths):
    """Copy test data into the right place for use with the VGG model.

    Copies the whole ``<raw>/test/`` directory tree to ``<clean>/test/mix``,
    replacing any previous copy.

    Args:
        raw: location of raw data as downloaded
        clean: desired output location
        paths: unused; kept so existing callers keep working
    """
    print('Copying test data')
    raw_path = os.path.join(raw, 'test/')
    clean_path = os.path.join(clean, 'test/mix')

    # Only throw away the previous copy when there is a source to replace it
    # with; otherwise copytree below fails and the old output is kept intact.
    if os.path.isdir(raw_path) and os.path.isdir(clean_path):
        shutil.rmtree(clean_path)

    shutil.copytree(raw_path, clean_path)
def main(raw, clean, valid_size=1000, sample_size=100):
    """Prepare the raw download for training, validation, sampling and test.

    Args:
        raw: location of raw data as downloaded; must hold a 'train'
            subdirectory with the dog and cat files
        clean: desired output location
        valid_size: number of files per animal for the validation set
        sample_size: number of files per animal for the sample training set
    """
    # in case your path contains ~/
    raw, clean = os.path.expanduser(raw), os.path.expanduser(clean)

    # get_dc_paths hands back the dog list first, then the cat list
    per_animal = get_dc_paths(os.path.join(raw, 'train'))
    for animal, animal_paths in zip(('dogs', 'cats'), per_animal):
        process_animal(raw, clean, animal, animal_paths, valid_size, sample_size)

    process_test(raw, clean, os.path.join(clean, 'test'))

    print('All done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment