Skip to content

Instantly share code, notes, and snippets.

Last active Nov 21, 2016
What would you like to do?
Data prep for Kaggle's State Farm competition
import os
import shutil
from glob import glob

import numpy as np
import pandas as pd
def choose_subjects(df, count):
    """Pick up to ``count`` random subjects and return their rows.

    Args:
        df: DataFrame containing a 'subject' column.
        count: Number of distinct subjects to pick. If it exceeds the number
            of subjects present in ``df``, every subject is returned.

    Returns:
        A list of DataFrames, one per picked subject, each holding all rows
        of ``df`` that belong to that subject. The order is random.
    """
    # group the frame by the subject in the image
    subjects = df.groupby('subject')
    # Materialise the keys: on Python 3 ``dict.keys()`` is a view, not a
    # sequence, so it has to be converted before it can be sampled.
    all_subjects = list(subjects.groups)
    # Shuffle positions rather than the keys themselves so every key keeps
    # its native Python type when it is handed back to get_group().
    order = np.random.permutation(len(all_subjects))[:count]
    # collect the rows of each picked subject
    return [subjects.get_group(all_subjects[i]) for i in order]
def mk_class_dirs(basepath, start, end, prefix='c'):
    """Create one directory per class index under ``basepath``.

    Directories are named ``<prefix><i>`` for every ``i`` from ``start`` to
    ``end`` inclusive. Directories that already exist are left untouched.

    Args:
        basepath: Parent directory for the class directories.
        start: First class index (inclusive).
        end: Last class index (inclusive).
        prefix: String placed in front of the index in each directory name.
    """
    for i in range(start, end + 1):
        path = os.path.join(basepath, '{}{}'.format(prefix, i))
        if not os.path.exists(path):
            # makedirs rather than mkdir: callers in this file pass a
            # basepath (e.g. '<root>/valid') that may not exist yet.
            os.makedirs(path)
def revert_valid(path):
    """Move every file under ``<path>/valid`` back into ``<path>/train``.

    Files are expected at ``<path>/valid/<class>/<file>`` and are renamed to
    ``<path>/train/<class>/<file>``; the matching class directory under
    ``train`` must already exist. Once no files remain, the whole ``valid``
    tree is removed.

    Args:
        path: Dataset root containing the 'train' and 'valid' directories.

    Returns:
        True if the validation tree was emptied and deleted, False if files
        were still present after the move.
    """
    valid_path = os.path.join(path, 'valid')
    train_path = os.path.join(path, 'train')
    glob_path = os.path.join(valid_path, '*', '*')
    for source in glob(glob_path):
        # Rebuild the destination from its components. A plain
        # str.replace('valid', 'train') would also rewrite any other
        # occurrence of 'valid' in the base path or in the file name.
        class_dir, fname = os.path.split(source)
        target = os.path.join(train_path, os.path.basename(class_dir), fname)
        os.rename(source, target)
    if not glob(glob_path):
        # Single-argument print() behaves the same on Python 2 and 3.
        print('No files left - finishing cleanup')
        shutil.rmtree(valid_path)
        return True
    return False
def mk_valid(path, subject_count=1, cleanup_only=False):
    """Build a validation set by moving whole subjects out of ``train``.

    Any existing ``<path>/valid`` tree is first reverted into ``train``.
    Unless ``cleanup_only`` is set, ``subject_count`` random subjects are then
    picked from ``<path>/driver_imgs_list.csv`` and every image listed for
    them is moved from ``<path>/train/<class>/`` to ``<path>/valid/<class>/``.

    Args:
        path: Dataset root containing 'train' and 'driver_imgs_list.csv'.
        subject_count: Number of subjects to move into the validation set.
        cleanup_only: When True, only revert a previous validation split.

    Raises:
        Exception: If files remain under 'valid' after the revert step.
    """
    valid_path = os.path.join(path, 'valid')
    train_path = os.path.join(path, 'train')

    if os.path.exists(valid_path):
        if not revert_valid(path):
            raise Exception('Not all previous files cleaned up properly')
    if cleanup_only:
        return

    # Read the CSV file into a pandas DataFrame. os.path.join keeps this
    # working whether or not ``path`` ends with a separator.
    df = pd.read_csv(os.path.join(path, 'driver_imgs_list.csv'))
    subjects = choose_subjects(df, subject_count)
    # set up c0-c9 directory structure in valid folder
    mk_class_dirs(valid_path, 0, 9, 'c')
    # loop over each subject and each image, moving images into
    # the validation directory
    for subject_rows in subjects:
        print('Expect {} files to move'.format(subject_rows.shape[0]))
        n = 0
        # NOTE(review): assumes the CSV has exactly three columns ordered
        # (subject, class directory, image file name) - confirm.
        for (_subject, cls, img) in subject_rows.values:
            source = os.path.join(train_path, cls, img)
            # Build the target from components instead of replacing the
            # substring 'train', which could also match elsewhere in the path.
            target = os.path.join(valid_path, cls, img)
            os.rename(source, target)
            n += 1
        print('Moved {} files'.format(n))
def copy_sample(random_paths, basepath, train_or_valid, maxidx, minidx=0):
    """Copy a slice of image paths into ``<basepath>/sample/<train_or_valid>``.

    Only the class directory and file name of each entry are used; the source
    file is looked up again below ``basepath``.

    Args:
        random_paths: Sequence of image paths shaped ``.../<class>/<file>``.
        basepath: Dataset root containing the source images and 'sample'.
        train_or_valid: Name of the sample split to fill.
        maxidx: End index (exclusive) of the slice of ``random_paths``.
        minidx: Start index (inclusive) of the slice.
    """
    sample = random_paths[minidx:maxidx]

    # The 'valid' sample is read from the 'train' directory; every other
    # split is read from the directory of the same name.
    # NOTE(review): presumably because callers glob the paths from 'train' -
    # confirm against the caller.
    if train_or_valid == 'valid':
        source_split = 'train'
    else:
        source_split = train_or_valid

    # Create the class directories once, and only when there is something to
    # copy. len() is used because ``sample`` may be a numpy array.
    if len(sample) > 0:
        parent = os.path.join(basepath, 'sample', train_or_valid)
        mk_class_dirs(parent, 0, 9, 'c')

    n = 0
    for f in sample:
        path, fname = os.path.split(f)
        img_class = os.path.split(path)[1]
        source = os.path.join(basepath, source_split, img_class, fname)
        target = os.path.join(basepath, 'sample', train_or_valid, img_class, fname)
        shutil.copy(source, target)
        n += 1
    print('Copied {} files to {} sample'.format(n, train_or_valid))
def mk_sample(path, train_size=200, valid_size=40):
    """Create a small random sample of the training images.

    Any existing ``<path>/sample`` tree is deleted first. The files matching
    ``<path>/train/*/*`` are then shuffled; the first ``train_size`` are
    copied to the 'train' sample and the next ``valid_size`` to the 'valid'
    sample.

    Args:
        path: Dataset root containing the 'train' directory.
        train_size: Number of images to copy into the training sample.
        valid_size: Number of images to copy into the validation sample.
    """
    sample_path = os.path.join(path, 'sample')
    if os.path.exists(sample_path):
        print('Deleting existing sample data')
        # Remove the old sample so stale files do not survive the rebuild.
        shutil.rmtree(sample_path)

    raw_paths = glob(os.path.join(path, 'train', '*', '*'))
    random_paths = np.random.permutation(raw_paths)

    copy_sample(random_paths, path, 'train', train_size)
    # Take the validation images from the slice after the training ones so
    # the two samples never share a file.
    copy_sample(random_paths, path, 'valid', train_size + valid_size,
                minidx=train_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment