Skip to content

Instantly share code, notes, and snippets.

@ashishrana160796
Last active January 1, 2022 22:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashishrana160796/3462546c55e6e2dd71d9dd65c571bcb2 to your computer and use it in GitHub Desktop.
Save ashishrana160796/3462546c55e6e2dd71d9dd65c571bcb2 to your computer and use it in GitHub Desktop.
Load images from the BBBC dataset (here, https://data.broadinstitute.org/bbbc/BBBC005/), resize them with the PIL Python library, and convert them to JPEG. After that, split them into test and train directories; add conditions for the test/train sets depending on your requirements.
# Best way to run: copy and paste portions into an ipython3 shell
# 1. Import Statements
import operator
from os import listdir
from os.path import isfile, join
# 2. Select files with matching TIFF extensions.
import re
# The dot is escaped and the end is anchored so that only names ending in
# "_w1.TIF" are accepted: an unescaped "." matches any character, and
# match() on its own only anchors the start of the string.
pattern = re.compile(r".+_w1\.TIF$")
# 3. Currently the ground-truth images are loaded. To load the complete
# dataset (19,200 images) instead, comment out the first `mypath` line and
# uncomment the second one.
mypath = "BBBC005_v1_ground_truth/BBBC005_v1_ground_truth/"
# mypath="BBBC005_v1_images/"
# Keep only regular files inside `mypath` whose names match `pattern`.
file_names = [
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and pattern.match(f)
]
# Check how many files were selected (the value is echoed in an interactive shell).
len(file_names)
# 4. Extract the count from each image name and store the counts in a list.
# The count is taken from the LAST "_C<digits>_" field of the name. Taking
# the first one (as a plain search does) picks up the well id instead of the
# count whenever the well id itself starts with "C" (e.g. "_C02_C5_").
# NOTE(review): assumes the count field comes after the well field, as in
# SIMCEPImages_<well>_C<cells>_F<blur>_s<sample>_w<stain>.TIF -- confirm
# against the dataset description.
count_field = re.compile(r"(?<=_)C(\d+)(?=_)")
file_count = [int(count_field.findall(f)[-1]) for f in file_names]
# 5. Build the final dictionary mapping file name -> count.
file_dict = dict(zip(file_names, file_count))
# 6. Get the (file name, count) pair with the largest count among the images
# (the value is echoed in an interactive shell).
max(file_dict.items(), key=operator.itemgetter(1))
# Important
# 7. Change the format to JPEG and shrink the loaded images to a third of
# their original width and height.
import os
from PIL import Image
# os.mkdir raises FileExistsError if the directory already exists, so make
# sure it is not there before running this step.
data_dest = "data_set/"
os.mkdir(data_dest)
# Convert every selected TIFF into a down-scaled JPEG inside `data_dest`.
for file_name in file_names:
    full_file_name = os.path.join(mypath, file_name)
    print(full_file_name)
    outfile = os.path.splitext(file_name)[0] + ".jpg"
    # The context manager closes the source file once the JPEG is written.
    with Image.open(full_file_name) as im:
        print("Generating jpeg for %s" % file_name)
        # thumbnail() resizes in place; give it an integer target size
        # (at least 1 px per side) rather than the float that "/" produces.
        # Image.LANCZOS replaces Image.ANTIALIAS, which newer Pillow
        # releases no longer provide.
        target_size = (max(1, im.size[0] // 3), max(1, im.size[1] // 3))
        im.thumbnail(target_size, Image.LANCZOS)
        im.save(os.path.join(data_dest, outfile), "JPEG", quality=72)
# 8. Shuffle the JPEG images and split them 75/25 into train and test
# dataset directories.
import random
import re
# Escaped dot and end anchor: accept only names that end in ".jpg".
new_pattern = re.compile(r".+\.jpg$")
# Same listing logic as for the TIFF files, this time for the JPEG images.
new_path = "data_set/"
new_file_names = [
    f for f in listdir(new_path)
    if isfile(join(new_path, f)) and new_pattern.match(f)
]
# Randomness added: shuffle in place so the split follows a random order.
random.shuffle(new_file_names)
# Create the directories where the train & test datasets will be written.
# os.mkdir raises FileExistsError if a directory already exists, so remove
# any previous output before running this step.
import shutil
train_dest = "train_set/"
os.mkdir(train_dest)
test_dest = "test_set/"
os.mkdir(test_dest)
# Fraction of the shuffled files that goes to the training set.
split_point = 0.75
# Strict "<" sends ceil(split_point * N) files to the training set; the
# earlier "<=" comparison sent one file too many (e.g. all 4 of 4 files).
train_limit = split_point * len(new_file_names)
for index, key in enumerate(new_file_names):
    full_file_name = os.path.join(new_path, key)
    if index < train_limit:
        shutil.copy(full_file_name, train_dest)
    else:
        shutil.copy(full_file_name, test_dest)
# ----------------------------------------------------------------------------
# END
# ----------------------------------------------------------------------------
# 9. Extra preprocessing step for later analysis: split on the basis of the nuclei count.
# Split to test/train at a 50/50 ratio, with the lower-count half for training and the rest for testing.
# Only half (1/2) of the dataset is used for training: the lower half of the nuclei counts.
# The higher half of the counts is used for prediction.
# import os
# import shutil
# make sure following directory is not there before, otherwise statement will result in error
# train_dest="train_set/"
# os.mkdir(train_dest)
# test_dest="test_set/"
# os.mkdir(test_dest)
# for file_name in file_names:
# full_file_name = os.path.join(mypath, file_name)
# if (os.path.isfile(full_file_name) and file_dict.get(file_name)<=50):
# # the count is the criterion of separation, not the number of entries
# shutil.copy(full_file_name, train_dest)
# else:
# shutil.copy(full_file_name, test_dest)
@ashishrana160796
Copy link
Author

Threads issue addressed with this gist:

Tiff to JPG conversion
Image resize and quality change
mkdir errors

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment