Created
September 19, 2020 21:21
-
-
Save gitUmaru/0dba644ae7b53b5d055b43252ca29d93 to your computer and use it in GitHub Desktop.
Split Data Into Train and Test --- This is a general-purpose script for data processing and machine learning that takes a large folder of data (organized by folder name) and sorts it into the directory layout that the TensorFlow image data generator can use.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# @author: gitUmaru
import os | |
import zipfile | |
import random | |
from shutil import copyfile | |
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    """
    Randomly split the files in SOURCE between the TRAINING and TESTING directories.

    The files are shuffled, then the first round(SPLIT_SIZE * file_count) of
    them are copied into TRAINING and the remainder into TESTING. The source
    files are copied, never moved or modified.

    @param: SOURCE - path to source dataset folder as String
            TRAINING - path to training folder as String (must already exist)
            TESTING - path to testing folder as String (must already exist)
            SPLIT_SIZE - fraction of the source data that will become training,
                         as a float between 0 and 1 (e.g. 0.8 for 80%)
    @output: void - Sorted file structure ready for tf.keras.preprocessing.image.ImageDataGenerator
    """
    # List the folder once and keep only regular files: copyfile() cannot copy
    # a directory, so a stray sub-folder would otherwise abort the split.
    file_names = [
        name for name in os.listdir(SOURCE)
        if os.path.isfile(os.path.join(SOURCE, name))
    ]
    random.shuffle(file_names)

    # Number of files that go to the training set; everything after goes to testing.
    training_count = round(SPLIT_SIZE * len(file_names))

    for index, name in enumerate(file_names):
        destination = TRAINING if index < training_count else TESTING
        # os.path.join works whether or not the directory paths end in a separator.
        copyfile(os.path.join(SOURCE, name), os.path.join(destination, name))
def make_dir(ITEM,CLASS1,CLASS2):
    """
    Function that makes the desired file structure:

        ./ITEM/training/CLASS1/    ./ITEM/training/CLASS2/
        ./ITEM/testing/CLASS1/     ./ITEM/testing/CLASS2/

    Folders that already exist are left untouched, so the function can be
    called repeatedly and will fill in any missing part of the tree.

    @param: ITEM - name of root directory as a String
            CLASS1 - name of first label as a String
            CLASS2 - name of second label as a String
    @output: void - New file structure (relative to the current working directory)
    @raises: OSError - if a folder cannot be created for a reason other than
                       already existing (e.g. missing permissions)
    """
    for split in ("training", "testing"):
        for label in (CLASS1, CLASS2):
            # makedirs creates missing parents, and exist_ok makes each leaf
            # independent, so one pre-existing folder cannot stop the rest of
            # the tree from being created.
            os.makedirs(os.path.join(".", ITEM, split, label), exist_ok=True)
if __name__ == '__main__':
    # Fraction of each class's files that is copied into the training set.
    split_size = 0.8

    # Source folders of the raw dataset, one per class.
    STROMA_DIR = "./VOA-901B-002/Stroma/"
    TUMOUR_DIR = "./VOA-901B-002/Tumor/"

    # Destination folders inside the generated "cells" tree.
    TRAINING_STROMA_DIR = "./cells/training/stroma/"
    TESTING_STROMA_DIR = "./cells/testing/stroma/"
    TRAINING_TUMOUR_DIR = "./cells/training/tumour/"
    TESTING_TUMOUR_DIR = "./cells/testing/tumour/"

    # Build the destination tree first, then split each class in turn.
    make_dir("cells", "stroma", "tumour")
    for source_dir, training_dir, testing_dir in (
        (STROMA_DIR, TRAINING_STROMA_DIR, TESTING_STROMA_DIR),
        (TUMOUR_DIR, TRAINING_TUMOUR_DIR, TESTING_TUMOUR_DIR),
    ):
        split_data(source_dir, training_dir, testing_dir, split_size)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment