Skip to content

Instantly share code, notes, and snippets.

@sariabod
Created December 17, 2018 19:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sariabod/56aeea96abe58671375eb52a883e356e to your computer and use it in GitHub Desktop.
Save sariabod/56aeea96abe58671375eb52a883e356e to your computer and use it in GitHub Desktop.
Break up training set into train/valid splits.
import pandas as pd
import pathlib
import sys
import uuid
import shutil
def _export_split(rows, split, label, labels_file, run_id):
    """Copy one label's images into ``data/<split>/<label>/`` and record them.

    For every row in *rows* the image named in the first column is copied from
    ``data/raw/`` into the split directory, and a line of the form
    ``<split>/<label>/<file>,<label>`` is written to *labels_file*. The rows
    are then pickled to ``data/pickles/<split>_<run_id>`` for later reference.

    Args:
        rows: DataFrame slice whose first column holds the image file name.
        split: Split directory name, ``"train"`` or ``"valid"``.
        label: Class label shared by all rows; used as the sub-directory name.
        labels_file: Open, writable text file receiving the label lines.
        run_id: Unique suffix used for the pickle file name.
    """
    pathlib.Path("data/{}/{}".format(split, label)).mkdir(
        parents=True, exist_ok=True
    )
    # Select the file-name column by position explicitly; integer keys on a
    # label-indexed row Series (``v[0]``) are deprecated in pandas.
    for file_name in rows.iloc[:, 0]:
        # Copy the file from raw to the appropriate directory (you can move
        # instead if disk space is an issue).
        shutil.copy(
            "data/raw/{}".format(file_name),
            "data/{}/{}/{}".format(split, label, file_name),
        )
        labels_file.write(
            "{}/{}/{},{}\n".format(split, label, file_name, label)
        )
    # Save a pickle for later reference if needed; make sure the target
    # directory exists so a fresh checkout does not fail here.
    pathlib.Path("data/pickles").mkdir(parents=True, exist_ok=True)
    rows.to_pickle("data/pickles/{}_{}".format(split, run_id))


df = pd.read_csv('data/train.csv')

# The context manager guarantees labels.csv is flushed and closed even if a
# copy or pickle step raises part-way through. Append mode is kept on purpose.
with open("data/labels.csv", "a") as labels_file:
    # Build the train/valid directory structure one label at a time.
    for label in df['Id'].unique():
        label_rows = df[df['Id'] == label]
        run_id = str(uuid.uuid4())

        # Random (unseeded) 80/20 split of this label's rows.
        train = label_rows.sample(frac=0.8)
        valid = label_rows.drop(train.index)

        # Skip validation categories with 1 or fewer images: they cause
        # issues in the dataloader, so those labels go entirely to training.
        if len(valid) > 1:
            _export_split(valid, "valid", label, labels_file, run_id)
        else:
            train = label_rows

        if len(train) > 0:
            _export_split(train, "train", label, labels_file, run_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment