# ashishrana160796/BBBC_data_preprocess.py

## BBBC_data_preprocess.py
# Best way to run: copy and paste portions into an ipython3 shell.

# 1. Import Statements
import operator
from os import listdir
from os.path import isfile, join


# 2. Select only the files whose names end in "_w1.TIF".
import re
# Raw string with the dot escaped and the end anchored, so "." matches only a
# literal dot and names such as "x_w1.TIFF" or "x_w1xTIF" are rejected.
pattern = re.compile(r".+_w1\.TIF$")


# 3. The ground-truth images are loaded by default. To load the complete
# dataset (19,200 images) instead, uncomment the alternative mypath below.
mypath = "BBBC005_v1_ground_truth/BBBC005_v1_ground_truth/"
# mypath="BBBC005_v1_images/"
# Keep every regular file in mypath whose name matches the TIFF pattern.
file_names = list(filter(
    lambda name: isfile(join(mypath, name)) and pattern.match(str(name)),
    listdir(mypath),
))
# Show how many files were selected (only displayed in an interactive shell).
len(file_names)


# 4. Read the count out of each image name: the text between "_C" and the
# next "_" is parsed as an integer, in the same order as file_names.
_count_in_name = re.compile('_C(.+?)_')
file_count = [int(_count_in_name.search(name).group(1)) for name in file_names]


# 5. Build the final lookup table: file name -> count.
file_dict = dict(zip(file_names, file_count))


# 6. Find the (file name, count) pair with the largest count.
max(file_dict.items(), key=lambda pair: pair[1])


# Important
# 7. Convert every selected image to JPEG, shrunk to fit within one third of
# its original width and height.

import os
from PIL import Image
# os.mkdir raises FileExistsError if the directory already exists, so remove
# any previous data_set/ directory before running this step.
data_dest="data_set/"
os.mkdir(data_dest)
# Write one JPEG per selected source image.
for file_name in file_names:
    full_file_name = os.path.join(mypath, file_name)
    print (full_file_name)
    outfile = os.path.splitext(file_name)[0] + ".jpg"
    # The context manager closes the source file once the JPEG is written.
    with Image.open(full_file_name) as im:
        print ("Generating jpeg for %s" % file_name)
        # Floor division keeps the target size integral under Python 3.
        # Image.LANCZOS is the filter formerly named Image.ANTIALIAS, a name
        # that was removed in Pillow 10.
        im.thumbnail((im.size[0] // 3, im.size[1] // 3), Image.LANCZOS)
        im.save(os.path.join(data_dest, outfile), "JPEG", quality=72)


# 8. Shuffle the JPEGs and copy them into train/test directories using a
# 75/25 split.
import random
import re
# Raw string with the dot escaped and the end anchored, so only names that
# really end in ".jpg" are selected.
new_pattern = re.compile(r".+\.jpg$")
# List the JPEG files written by the conversion step.
new_path="data_set/"
new_file_names = [f for f in listdir(new_path) if isfile(join(new_path, f)) and new_pattern.match(f)]
# Randomise the order so the split does not follow the directory listing order.
random.shuffle(new_file_names)
# Create the directories that receive the train and test images.
# os.mkdir raises FileExistsError if a directory already exists; that is kept
# on purpose, because re-running with a fresh shuffle into existing
# directories would put the same image in both sets.
import shutil
train_dest="train_set/"
os.mkdir(train_dest)
test_dest="test_set/"
os.mkdir(test_dest)
# Fraction of the shuffled files that goes to the training set.
split_point=0.75
# int() truncates, so exactly floor(split_point * n) files are used for
# training and the remainder for testing. A per-index "i <= 0.75 * n" test
# would send one file too many to train (with 4 files, all 4).
split_index = int(split_point * len(new_file_names))
for key in new_file_names[:split_index]:
    shutil.copy(os.path.join(new_path, key), train_dest)
for key in new_file_names[split_index:]:
    shutil.copy(os.path.join(new_path, key), test_dest)

# ----------------------------------------------------------------------------
# END
# ----------------------------------------------------------------------------


# 9. Extra preprocessing step for later analysis: split on the basis of the nuclei count.
# Split train/test 50/50, with the lower-count half for training and the rest for testing.
# Only half of the dataset is used for training: the lower half of the nuclei counts.
# The higher half of the counts is used for prediction.

# import os
# import shutil
# make sure following directory is not there before, otherwise statement will result in error
# train_dest="train_set/"
# os.mkdir(train_dest)
# test_dest="test_set/"
# os.mkdir(test_dest)

# for file_name in file_names:
#     full_file_name = os.path.join(mypath, file_name)
#     if (os.path.isfile(full_file_name) and file_dict.get(file_name)<=50):
#     # the count is the criterion of separation, not the number of entries
#         shutil.copy(full_file_name, train_dest)
#     else:
#         shutil.copy(full_file_name, test_dest)
	# Best way to run: copy and paste portions into an ipython3 shell.

	# 1. Import Statements
	import operator
	from os import listdir
	from os.path import isfile, join


	# 2. Select only the files whose names end in "_w1.TIF".
	import re
	# Raw string with the dot escaped and the end anchored, so "." matches only a
	# literal dot and names such as "x_w1.TIFF" or "x_w1xTIF" are rejected.
	pattern = re.compile(r".+_w1\.TIF$")


	# 3. The ground-truth images are loaded by default. To load the complete
	# dataset (19,200 images) instead, uncomment the alternative mypath below.
	mypath = "BBBC005_v1_ground_truth/BBBC005_v1_ground_truth/"
	# mypath="BBBC005_v1_images/"
	# Keep every regular file in mypath whose name matches the TIFF pattern.
	file_names = list(filter(
		lambda name: isfile(join(mypath, name)) and pattern.match(str(name)),
		listdir(mypath),
	))
	# Show how many files were selected (only displayed in an interactive shell).
	len(file_names)


	# 4. Read the count out of each image name: the text between "_C" and the
	# next "_" is parsed as an integer, in the same order as file_names.
	_count_in_name = re.compile('_C(.+?)_')
	file_count = [int(_count_in_name.search(name).group(1)) for name in file_names]


	# 5. Build the final lookup table: file name -> count.
	file_dict = dict(zip(file_names, file_count))


	# 6. Find the (file name, count) pair with the largest count.
	max(file_dict.items(), key=lambda pair: pair[1])


	# Important
	# 7. Convert every selected image to JPEG, shrunk to fit within one third of
	# its original width and height.

	import os
	from PIL import Image
	# os.mkdir raises FileExistsError if the directory already exists, so remove
	# any previous data_set/ directory before running this step.
	data_dest="data_set/"
	os.mkdir(data_dest)
	# Write one JPEG per selected source image.
	for file_name in file_names:
		full_file_name = os.path.join(mypath, file_name)
		print (full_file_name)
		outfile = os.path.splitext(file_name)[0] + ".jpg"
		# The context manager closes the source file once the JPEG is written.
		with Image.open(full_file_name) as im:
			print ("Generating jpeg for %s" % file_name)
			# Floor division keeps the target size integral under Python 3.
			# Image.LANCZOS is the filter formerly named Image.ANTIALIAS, a
			# name that was removed in Pillow 10.
			im.thumbnail((im.size[0] // 3, im.size[1] // 3), Image.LANCZOS)
			im.save(os.path.join(data_dest, outfile), "JPEG", quality=72)


	# 8. Shuffle the JPEGs and copy them into train/test directories using a
	# 75/25 split.
	import random
	import re
	# Raw string with the dot escaped and the end anchored, so only names that
	# really end in ".jpg" are selected.
	new_pattern = re.compile(r".+\.jpg$")
	# List the JPEG files written by the conversion step.
	new_path="data_set/"
	new_file_names = [f for f in listdir(new_path) if isfile(join(new_path, f)) and new_pattern.match(f)]
	# Randomise the order so the split does not follow the directory listing order.
	random.shuffle(new_file_names)
	# Create the directories that receive the train and test images.
	# os.mkdir raises FileExistsError if a directory already exists; that is kept
	# on purpose, because re-running with a fresh shuffle into existing
	# directories would put the same image in both sets.
	import shutil
	train_dest="train_set/"
	os.mkdir(train_dest)
	test_dest="test_set/"
	os.mkdir(test_dest)
	# Fraction of the shuffled files that goes to the training set.
	split_point=0.75
	# int() truncates, so exactly floor(split_point * n) files are used for
	# training and the remainder for testing. A per-index "i <= 0.75 * n" test
	# would send one file too many to train (with 4 files, all 4).
	split_index = int(split_point * len(new_file_names))
	for key in new_file_names[:split_index]:
		shutil.copy(os.path.join(new_path, key), train_dest)
	for key in new_file_names[split_index:]:
		shutil.copy(os.path.join(new_path, key), test_dest)

	# ----------------------------------------------------------------------------
	# END
	# ----------------------------------------------------------------------------


	# 9. Extra preprocessing step for later analysis: split on the basis of the nuclei count.
	# Split train/test 50/50, with the lower-count half for training and the rest for testing.
	# Only half of the dataset is used for training: the lower half of the nuclei counts.
	# The higher half of the counts is used for prediction.

	# import os
	# import shutil
	# make sure following directory is not there before, otherwise statement will result in error
	# train_dest="train_set/"
	# os.mkdir(train_dest)
	# test_dest="test_set/"
	# os.mkdir(test_dest)

	# for file_name in file_names:
	# full_file_name = os.path.join(mypath, file_name)
	# if (os.path.isfile(full_file_name) and file_dict.get(file_name)<=50):
	# # the count is the criterion of separation, not the number of entries
	# shutil.copy(full_file_name, train_dest)
	# else:
	# shutil.copy(full_file_name, test_dest)