Skip to content

Instantly share code, notes, and snippets.

@gitUmaru
Created September 19, 2020 21:21
Show Gist options
  • Save gitUmaru/0dba644ae7b53b5d055b43252ca29d93 to your computer and use it in GitHub Desktop.
Save gitUmaru/0dba644ae7b53b5d055b43252ca29d93 to your computer and use it in GitHub Desktop.
Split Data Into Train and Test --- This is a general-purpose script for data processing and machine learning that takes a large folder of data (organized by folder name) and sorts it into the directory structure that the TensorFlow image data generator can use.
# @author: gitUmaru
import os
import zipfile
import random
from shutil import copyfile
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    """
    Copy the files in SOURCE into TRAINING and TESTING using a random split.

    The regular files directly inside SOURCE are shuffled; the first
    round(SPLIT_SIZE * number_of_files) of them are copied into TRAINING and
    the remainder into TESTING. Files are copied, not moved, so SOURCE is
    left untouched. Sub-directories of SOURCE are ignored.

    @param: SOURCE - path to source dataset folder as String
            TRAINING - path to training folder as String (must already exist)
            TESTING - path to testing folder as String (must already exist)
            SPLIT_SIZE - fraction of the source data that will become training,
                         as a number between 0 and 1 (e.g. 0.8 for 80%)
    @output: void - Sorted file structure ready for tf.keras.preprocessing.image.ImageDataGenerator
    """
    # copyfile only handles regular files, so leave out any sub-directories
    data = [
        name for name in os.listdir(SOURCE)
        if os.path.isfile(os.path.join(SOURCE, name))
    ]

    ## shuffle your data and define desired size of train and test data
    random.shuffle(data)
    training_count = round(SPLIT_SIZE * len(data))

    for index, name in enumerate(data):
        destination = TRAINING if index < training_count else TESTING
        # os.path.join works whether or not the folder paths end in a separator
        copyfile(os.path.join(SOURCE, name), os.path.join(destination, name))
def make_dir(ITEM,CLASS1,CLASS2):
    """
    Function that makes the desired file structure:

        ./ITEM/training/CLASS1/    ./ITEM/training/CLASS2/
        ./ITEM/testing/CLASS1/     ./ITEM/testing/CLASS2/

    Directories that already exist are left as they are, so the function can
    be called repeatedly and will fill in any missing parts of the tree.

    @param: ITEM - name of root directory as a String
            CLASS1 - name of first label as a String
            CLASS2 - name of second label as a String
    @output: void - New file structure (relative to the current working directory)
    @raises: OSError - if a directory cannot be created (e.g. permission
             denied, or a regular file is in the way)
    """
    for split in ('training', 'testing'):
        for label in (CLASS1, CLASS2):
            # makedirs also creates the parents (./ITEM and ./ITEM/<split>);
            # exist_ok=True tolerates a tree that is already partly present
            os.makedirs(os.path.join('.', ITEM, split, label), exist_ok=True)
if __name__ == '__main__':
    # Create ./cells/{training,testing}/{stroma,tumour}/ before copying anything.
    make_dir("cells","stroma","tumour")

    # Fraction of each class's files that goes to the training folders.
    split_size = 0.8

    # One (source, training destination, testing destination) entry per class.
    jobs = (
        ("./VOA-901B-002/Stroma/", "./cells/training/stroma/", "./cells/testing/stroma/"),
        ("./VOA-901B-002/Tumor/", "./cells/training/tumour/", "./cells/testing/tumour/"),
    )
    for source_dir, training_dir, testing_dir in jobs:
        split_data(source_dir, training_dir, testing_dir, split_size)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment