Created
December 17, 2018 19:54
-
-
Save sariabod/56aeea96abe58671375eb52a883e356e to your computer and use it in GitHub Desktop.
Break up training set into train/valid splits.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import pathlib
import sys
import uuid
import shutil

# Break up the training set into train/valid splits.
#
# Reads data/train.csv, groups the rows by the 'Id' column and, for every
# label, copies the referenced images from data/raw/ into
# data/train/<label>/ or data/valid/<label>/.  Each copied image is also
# appended to data/labels.csv as "<split>/<label>/<file>,<label>", and each
# per-label frame is pickled under data/pickles/ for later reference.
#
# NOTE(review): the image file name is taken from the FIRST column of
# train.csv (the original script read it positionally as well) -- confirm
# against the actual CSV header.


def _export_split(frame, split, label, labels_file, run_id):
    """Copy the images listed in *frame* into data/<split>/<label>/.

    Every copied image is appended to *labels_file*, and *frame* itself is
    pickled to data/pickles/<split>_<run_id>.
    """
    pathlib.Path("data/{}/{}".format(split, label)).mkdir(
        parents=True, exist_ok=True
    )
    # Positional column access via .iloc: indexing a label-indexed row with
    # row[0] is deprecated in current pandas.
    for file_name in frame.iloc[:, 0]:
        # Copy the file from raw to the appropriate directory (you can move
        # instead if disk space is an issue).
        shutil.copy(
            "data/raw/{}".format(file_name),
            "data/{}/{}/{}".format(split, label, file_name),
        )
        labels_file.write(
            "{}/{}/{},{}\n".format(split, label, file_name, label)
        )
    # Save pickles for later reference if needed.
    frame.to_pickle("data/pickles/{}_{}".format(split, run_id))


df = pd.read_csv('data/train.csv')

# to_pickle() does not create missing parent directories.
pathlib.Path("data/pickles").mkdir(parents=True, exist_ok=True)

# Append mode: re-running the script adds to an existing labels.csv.
with open("data/labels.csv", "a") as labels_file:
    # sort=False keeps labels in order of first appearance, matching
    # df['Id'].unique(); rows with a missing Id belong to no group.
    for label, group in df.groupby('Id', sort=False):
        run_id = str(uuid.uuid4())

        # Split into new train/valid dataframes (random 80/20 sample).
        train = group.sample(frac=0.8)
        valid = group.drop(train.index)

        # Skip validation categories with 1 or fewer images; they cause
        # issues in the dataloader.
        if len(valid) > 1:
            _export_split(valid, "valid", label, labels_file, run_id)
        else:
            # Send all images of this label to the training set.
            train = group

        if not train.empty:
            _export_split(train, "train", label, labels_file, run_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment