Karol-G/preprocessing.py

## preprocessing.py
from nilearn import plotting
import numpy as np
import nibabel as nib
import os
import matplotlib.pyplot as plt
from pathlib import Path
import scipy

class Preprocessor():

    def __init__(self, load_dir, save_dir, data_list):
        self.load_dir = load_dir
        self.save_dir = save_dir
        self.data_list = data_list
        self.data_list = np.asarray(data_list)
        self.plot_min = -2000
        self.plot_max = 1000
        self.hu_min = -1000
        self.hu_max = None

    def preprocess_dataset(self):
        self._load_correct_HU_filenames(self.load_dir)
        for (filename, air, fat) in self.data_list:
            img, affine, spacing = self._load_img(filename, print_header=False)
            self._plot_histogram(img, "Raw data")
            img = self._estimate_HU(img, air, fat)
            img = self._clip(img, a_min=self.hu_min, a_max=self.hu_max)
            self._plot_histogram(img, "HU estimated")
            img, new_spacing = self._resample(img, spacing)
            self._save_img(filename, img, np.eye(4), new_spacing)

    def _load_correct_HU_filenames(self, img_dir):
        valid_extensions = ('.nii.gz', '.nii')
        filenames = []

        # Load filenames
        for file in os.listdir(img_dir):
            if file.endswith(valid_extensions):
                occurences = file.count(".")
                for _ in range(occurences):
                    file = Path(file).stem
                if not np.isin(file, self.data_list[:, 0]):  # Add only filename if it is not already in data_list
                    filenames.append((file, None, None))
        filenames = np.asarray(filenames)

        self.data_list = np.append(self.data_list, filenames, axis=0)

    def _load_img(self, filename, print_header=True):
        self._plot_niigz(filename)
        img = nib.load(self.load_dir + filename + ".nii.gz")
        if print_header:
            print(img.header)
        affine = img.affine
        img_np = img.get_fdata()
        spacing = img.header["pixdim"][1:4]
        return img_np, affine, spacing

    def _estimate_HU(self, img, air, fat):
        if air is None:
            return img

        air_HU = -1000
        fat_HU = -100

        air = int(air)
        fat = int(fat)
        delta_air_fat_HU = abs(air_HU - fat_HU)
        delta_air = abs(air - air_HU)
        delta_fat_air_rgb = abs(fat - air)
        ratio = delta_air_fat_HU / delta_fat_air_rgb

        img = img - air
        img = img * ratio
        img = img + air_HU
        return img

    def _clip(self, img, a_min=None, a_max=None):
        return np.clip(img, a_min=a_min, a_max=a_max)

    def _resample(self, img, spacing, new_spacing=[1,1,1]):
        resize_factor = spacing / new_spacing
        new_real_shape = img.shape * resize_factor
        new_shape = np.round(new_real_shape)
        real_resize_factor = new_shape / img.shape
        new_spacing = spacing / real_resize_factor

        img = scipy.ndimage.interpolation.zoom(img, real_resize_factor)

        return img, new_spacing

    def _save_img(self, filename, img, affine=np.eye(4), new_spacing=None):
        img = nib.Nifti1Image(img, affine=affine)
        nib.save(img, self.save_dir + filename + ".nii.gz")
        self._plot_niigz(filename)

    def _plot_histogram(self, data, title):
        bin_size = 10
        fig, ax = plt.subplots()
        data_flat = data.flatten()
        _ = ax.hist(data_flat, bins=np.arange(self.plot_min, self.plot_max + bin_size, bin_size), density=True)
        plt.ylim(0, 0.02)
        plt.title(title)
        plt.show()

    def _plot_niigz(self, filename):
        plotting.plot_img(self.load_dir + filename + ".nii.gz", cmap='gray', vmin=self.plot_min, vmax=self.plot_max)
        plotting.show()

if __name__ == '__main__':
    # Every '.nii' or '.nii.gz' file in this directory will be preprocesed (HU estimation + resampling).
    _load_dir = "C:/Users/Karol/datasets/COVID-19_CT_Lung_and_Infection_Segmentation_Dataset/"
    _save_dir = "C:/Users/Karol/datasets/Dataset Preprocessed/"
    # Add any files from the _load_dir that need to get HU estimation. Every other file will only get resampled.
    _incorrect_HU_data_list = [("radiopaedia_4_85506_1", 59, 210),  # Filename, Air, Fat
                               ("radiopaedia_7_85703_0", 66, 208),
                               ("radiopaedia_10_85902_1", 59, 212),
                               ("radiopaedia_10_85902_3", 57, 215),
                               ("radiopaedia_14_85914_0", 37, 198),
                               ("radiopaedia_27_86410_0", 63, 212),
                               ("radiopaedia_29_86490_1", 59, 210),
                               ("radiopaedia_29_86491_1", 59, 208),
                               ("radiopaedia_36_86526_0", 30, 196),
                               ("radiopaedia_40_86625_0", 47, 197)]

    preprocessor = Preprocessor(_load_dir, _save_dir, _incorrect_HU_data_list)
    preprocessor.preprocess_dataset()
	from nilearn import plotting
	import numpy as np
	import nibabel as nib
	import os
	import matplotlib.pyplot as plt
	from pathlib import Path
	import scipy

	class Preprocessor():

	def __init__(self, load_dir, save_dir, data_list):
	self.load_dir = load_dir
	self.save_dir = save_dir
	self.data_list = data_list
	self.data_list = np.asarray(data_list)
	self.plot_min = -2000
	self.plot_max = 1000
	self.hu_min = -1000
	self.hu_max = None

	def preprocess_dataset(self):
	self._load_correct_HU_filenames(self.load_dir)
	for (filename, air, fat) in self.data_list:
	img, affine, spacing = self._load_img(filename, print_header=False)
	self._plot_histogram(img, "Raw data")
	img = self._estimate_HU(img, air, fat)
	img = self._clip(img, a_min=self.hu_min, a_max=self.hu_max)
	self._plot_histogram(img, "HU estimated")
	img, new_spacing = self._resample(img, spacing)
	self._save_img(filename, img, np.eye(4), new_spacing)

	def _load_correct_HU_filenames(self, img_dir):
	valid_extensions = ('.nii.gz', '.nii')
	filenames = []

	# Load filenames
	for file in os.listdir(img_dir):
	if file.endswith(valid_extensions):
	occurences = file.count(".")
	for _ in range(occurences):
	file = Path(file).stem
	if not np.isin(file, self.data_list[:, 0]): # Add only filename if it is not already in data_list
	filenames.append((file, None, None))
	filenames = np.asarray(filenames)

	self.data_list = np.append(self.data_list, filenames, axis=0)

	def _load_img(self, filename, print_header=True):
	self._plot_niigz(filename)
	img = nib.load(self.load_dir + filename + ".nii.gz")
	if print_header:
	print(img.header)
	affine = img.affine
	img_np = img.get_fdata()
	spacing = img.header["pixdim"][1:4]
	return img_np, affine, spacing

	def _estimate_HU(self, img, air, fat):
	if air is None:
	return img

	air_HU = -1000
	fat_HU = -100

	air = int(air)
	fat = int(fat)
	delta_air_fat_HU = abs(air_HU - fat_HU)
	delta_air = abs(air - air_HU)
	delta_fat_air_rgb = abs(fat - air)
	ratio = delta_air_fat_HU / delta_fat_air_rgb

	img = img - air
	img = img * ratio
	img = img + air_HU
	return img

	def _clip(self, img, a_min=None, a_max=None):
	return np.clip(img, a_min=a_min, a_max=a_max)

	def _resample(self, img, spacing, new_spacing=[1,1,1]):
	resize_factor = spacing / new_spacing
	new_real_shape = img.shape * resize_factor
	new_shape = np.round(new_real_shape)
	real_resize_factor = new_shape / img.shape
	new_spacing = spacing / real_resize_factor

	img = scipy.ndimage.interpolation.zoom(img, real_resize_factor)

	return img, new_spacing

	def _save_img(self, filename, img, affine=np.eye(4), new_spacing=None):
	img = nib.Nifti1Image(img, affine=affine)
	nib.save(img, self.save_dir + filename + ".nii.gz")
	self._plot_niigz(filename)

	def _plot_histogram(self, data, title):
	bin_size = 10
	fig, ax = plt.subplots()
	data_flat = data.flatten()
	_ = ax.hist(data_flat, bins=np.arange(self.plot_min, self.plot_max + bin_size, bin_size), density=True)
	plt.ylim(0, 0.02)
	plt.title(title)
	plt.show()

	def _plot_niigz(self, filename):
	plotting.plot_img(self.load_dir + filename + ".nii.gz", cmap='gray', vmin=self.plot_min, vmax=self.plot_max)
	plotting.show()

	if __name__ == '__main__':
	# Every '.nii' or '.nii.gz' file in this directory will be preprocesed (HU estimation + resampling).
	_load_dir = "C:/Users/Karol/datasets/COVID-19_CT_Lung_and_Infection_Segmentation_Dataset/"
	_save_dir = "C:/Users/Karol/datasets/Dataset Preprocessed/"
	# Add any files from the _load_dir that need to get HU estimation. Every other file will only get resampled.
	_incorrect_HU_data_list = [("radiopaedia_4_85506_1", 59, 210), # Filename, Air, Fat
	("radiopaedia_7_85703_0", 66, 208),
	("radiopaedia_10_85902_1", 59, 212),
	("radiopaedia_10_85902_3", 57, 215),
	("radiopaedia_14_85914_0", 37, 198),
	("radiopaedia_27_86410_0", 63, 212),
	("radiopaedia_29_86490_1", 59, 210),
	("radiopaedia_29_86491_1", 59, 208),
	("radiopaedia_36_86526_0", 30, 196),
	("radiopaedia_40_86625_0", 47, 197)]

	preprocessor = Preprocessor(_load_dir, _save_dir, _incorrect_HU_data_list)
	preprocessor.preprocess_dataset()