Skip to content

Instantly share code, notes, and snippets.

Last active Jun 29, 2017
What would you like to do?
Script to process the dogs vs. cats data from Kaggle for use with a VGG ImageNet submission.
import os
import random
import glob
import shutil
def get_dc_paths(path):
    """Given a path filled with dog & cat files, make separate lists of both.

    path: directory whose entries have 'cat' or 'dog' in their file names

    Returns a ``(dogs, cats)`` tuple of lists. Each entry is the path exactly
    as returned by glob, i.e. it still starts with ``path``.
    """
    cats = []
    dogs = []
    for f in glob.glob('{}/*'.format(path)):
        # Match on the file name only: the directory part may itself contain
        # 'dog' or 'cat' (e.g. '.../dogscats/train'), which would otherwise
        # put every single file into both lists.
        name = os.path.basename(f)
        if 'cat' in name:
            cats.append(f)
        if 'dog' in name:
            dogs.append(f)
    return dogs, cats
def copy_files(raw_path, clean_path, files, animal, train_or_valid, sample=False):
    """Copy files from the raw path into your desired location.

    raw_path: location of raw data as downloaded
    clean_path: desired output location
    files: list of files; each is joined onto raw_path, so an absolute
        entry is used as-is
    animal: specify 'dog' or 'cat' - used for directory names
    train_or_valid: specify 'train' or 'valid' - used for directory names
    sample: boolean (True or False) - used for prepping sample data directory

    The output directory is ``<clean_path>/<train_or_valid>/<animal>``, or
    ``<clean_path>/sample/<train_or_valid>/<animal>`` when ``sample`` is
    true. An existing output directory is deleted and recreated first.
    """
    if sample:
        # sample directory setup - just add a 'sample' subdirectory
        animal_path = os.path.join(clean_path, 'sample', train_or_valid, animal)
    else:
        # normal setup
        animal_path = os.path.join(clean_path, train_or_valid, animal)

    if os.path.isdir(animal_path):
        # left over from an earlier run - delete all the copied files
        # and start over
        print('Deleting previously copied \'{}\' files'.format(animal))
        shutil.rmtree(animal_path)

    # create output directories and any parent directories
    os.makedirs(animal_path)

    # do the copy, keeping track of number of files for basic logging
    n = 0
    for fname in files:
        shutil.copy(os.path.join(raw_path, fname), animal_path)
        n += 1
    print('Copied {} {} files'.format(n, animal))
def process_animal(raw, clean, animal, paths, valid_size, sample_size):
    """Split one animal's files into train/valid sets and copy them out.

    raw: location of raw data, passed through to copy_files
    clean: desired output location, passed through to copy_files
    animal: directory name for this class, passed through to copy_files
    paths: list of files for this animal
    valid_size: number of files to put in the validation set
    sample_size: number of files to put in the sample training set; the
        sample validation set is 20% of this size
    """
    # randomize order of paths list to get started; work on a copy so the
    # caller's list is left untouched
    paths = list(paths)
    random.shuffle(paths)

    # validation data is the first N paths
    valid = paths[:valid_size]
    # training data is everything else
    train = paths[valid_size:]

    # do the copy for training and validation
    copy_files(raw, clean, train, animal, 'train')
    copy_files(raw, clean, valid, animal, 'valid')

    # do the same thing for sample training data, making a validation
    # data set 20% the size of the training sample size
    print('Sampling {}'.format(animal))
    sample_train = paths[:sample_size]
    n_sample_valid = max(int(sample_size * 0.2), 0)
    # slice from an explicit start index: paths[-0:] would be the whole
    # list rather than an empty one when the 20% rounds down to zero
    sample_valid = paths[max(len(paths) - n_sample_valid, 0):]
    copy_files(raw, clean, sample_train, animal, 'train', sample=True)
    copy_files(raw, clean, sample_valid, animal, 'valid', sample=True)
def process_test(raw, clean, paths):
    """Copy test data into the right place for use with the VGG model.

    raw: location of raw data; the tree under ``<raw>/test/`` is copied
    clean: desired output location; files land in ``<clean>/test/mix``
    paths: not used by this function; kept so existing callers still work
    """
    print('Copying test data')
    raw_path = os.path.join(raw, 'test/')
    clean_path = os.path.join(clean, 'test/mix')
    # copytree needs a destination that does not exist yet, so clear out
    # the result of any previous run first
    if os.path.isdir(clean_path):
        shutil.rmtree(clean_path)
    shutil.copytree(raw_path, clean_path)
def main(raw, clean, valid_size=1000, sample_size=100):
    """Build the train/valid/sample/test directory layout under ``clean``.

    raw: location of raw data; expects ``train`` and ``test`` subdirectories
    clean: desired output location
    valid_size: number of files per animal for the validation set
    sample_size: number of files per animal for the sample training set
    """
    # in case your path contains ~/ ; also make both roots absolute, because
    # copy_files joins each globbed file path onto raw and a relative raw
    # would then appear twice in the result
    raw = os.path.abspath(os.path.expanduser(raw))
    clean = os.path.abspath(os.path.expanduser(clean))

    test_path = os.path.join(clean, 'test')
    training_path = os.path.join(raw, 'train')

    dog_paths, cat_paths = get_dc_paths(training_path)
    process_animal(raw, clean, 'dogs', dog_paths, valid_size, sample_size)
    process_animal(raw, clean, 'cats', cat_paths, valid_size, sample_size)
    process_test(raw, clean, test_path)
    print('All done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment