Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Split images randomly over train or validation folder
# Copyright 2014-2017 Bert Carremans
# Author: Bert Carremans <>
# License: BSD 3 clause
import os
import random
from shutil import copyfile
def img_train_test_split(img_source_dir, train_size):
    """
    Randomly split images over a train and validation folder, while
    preserving the folder structure.

    Every immediate subdirectory of ``img_source_dir`` is mirrored under
    ``data/train`` and ``data/validation`` (relative to the current working
    directory). Each ``.jpg`` / ``.png`` file is copied to exactly one of the
    two folders and renamed to a running counter (``0.jpg``, ``1.png``, ...).

    The assignment is an independent random draw per image, so the realised
    train proportion only approximates ``train_size``; it is not an exact
    split.

    Parameters
    ----------
    img_source_dir : string
        Path to the folder with the images to be split. Can be absolute or
        relative path.
    train_size : float
        Proportion of the original images that need to be copied in the
        subdirectory in the train folder.

    Raises
    ------
    AttributeError
        If ``img_source_dir`` is not a string or ``train_size`` is not a
        float.
    OSError
        If ``img_source_dir`` does not exist.
    """
    if not isinstance(img_source_dir, str):
        raise AttributeError('img_source_dir must be a string')

    if not os.path.exists(img_source_dir):
        raise OSError('img_source_dir does not exist')

    if not isinstance(train_size, float):
        raise AttributeError('train_size must be a float')

    # Set up empty folder structure if not exists. makedirs also creates the
    # parent 'data' folder, and exist_ok avoids a check-then-create race.
    os.makedirs('data/train', exist_ok=True)
    os.makedirs('data/validation', exist_ok=True)

    # Get the subdirectories in the main image folder. Sorted so that runs
    # with a seeded RNG are reproducible (os.listdir order is arbitrary).
    subdirs = sorted(
        subdir for subdir in os.listdir(img_source_dir)
        if os.path.isdir(os.path.join(img_source_dir, subdir))
    )

    for subdir in subdirs:
        subdir_fullpath = os.path.join(img_source_dir, subdir)
        filenames = sorted(os.listdir(subdir_fullpath))
        if not filenames:
            print(subdir_fullpath + ' is empty')
            # Nothing to copy here; carry on with the remaining folders.
            continue

        train_subdir = os.path.join('data/train', subdir)
        validation_subdir = os.path.join('data/validation', subdir)

        # Create subdirectories in train and validation folders
        os.makedirs(train_subdir, exist_ok=True)
        os.makedirs(validation_subdir, exist_ok=True)

        train_counter = 0
        validation_counter = 0

        # Randomly assign an image to train or validation folder
        for filename in filenames:
            if not filename.endswith(('.jpg', '.png')):
                continue

            # splitext keeps the real extension even when the name contains
            # extra dots (e.g. 'cat.v2.png' -> '.png').
            extension = os.path.splitext(filename)[1]
            source_path = os.path.join(subdir_fullpath, filename)

            if random.uniform(0, 1) <= train_size:
                copyfile(source_path,
                         os.path.join(train_subdir,
                                      str(train_counter) + extension))
                train_counter += 1
            else:
                copyfile(source_path,
                         os.path.join(validation_subdir,
                                      str(validation_counter) + extension))
                validation_counter += 1

        print('Copied ' + str(train_counter) + ' images to data/train/' + subdir)
        print('Copied ' + str(validation_counter) + ' images to data/validation/' + subdir)

This comment has been minimized.

Copy link

@Anurag-Varma Anurag-Varma commented Jun 11, 2020

helpful man


This comment has been minimized.

Copy link

@jitendersaini jitendersaini commented Jul 3, 2020

Very nice work, but I can see a discrepancy in the result. I have two folders, cats and dogs, each containing 12,500 images. When I run this program with a train size of 0.8, it shows the following result:

Copied 10005 images to data/train/dogs
Copied 2495 images to data/validation/dogs
Copied 9955 images to data/train/cats
Copied 2545 images to data/validation/cats

The dogs and cats train folders should contain 10,000 images each, and the validation folders 2,500 images each.

Please check


This comment has been minimized.

Copy link

@masouduut94 masouduut94 commented Sep 12, 2020

Much better solution

!pip install split_folders
import splitfolders

or import split_folders

Split with a ratio.

To only split into training and validation set, set a tuple to ratio, i.e, (.8, .2).

splitfolders.ratio("train", output="output", seed=68, ratio=(0.8, 0.2, 0.0), group_prefix=None) # default values


This comment has been minimized.

Copy link

@mavaylon1 mavaylon1 commented Sep 15, 2020

I tried splitfolders.ratio, but it just says that ratio is not an attribute.

import splitfolders

splitfolders.ratio("/Users/mavaylon/Research/Research_Gambier/Data_P/BP", output="/Users/mavaylon/Research/Research_Gambier/Data_P/output", seed=1337, ratio=(.7, .3), group_prefix=None) # default values

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.