Last active
March 12, 2021 08:38
-
-
Save Abhishek-Shaw-Kolkata/711a31f28eaf648d592cddcf90b92dce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_metadata_from_images(file_path):
    '''
    Extracts metadata present in a DICOM file.

    Args:
        file_path : path of the DICOM file; it is handed to pydicom.dcmread
                    and to os.path.basename, so a string works (path-like
                    objects presumably work too -- confirm against the
                    installed pydicom version)
    Returns:
        a dictionary containing the image id (file name without its
        extension), selected patient / acquisition tags and the mean,
        minimum and maximum of the pixel data
    '''
    dataset = pydicom.dcmread(file_path)
    # Fetch the pixel data once and reuse it for all three statistics.
    pixels = dataset.pixel_array

    d = {}
    # os.path.basename understands the platform's path separators, unlike
    # splitting the string on '/'; splitext then drops the format (.dcm).
    d['ImageID'] = os.path.splitext(os.path.basename(file_path))[0]
    # NOTE(review): the tags below are read by plain attribute access, so a
    # file lacking one of them presumably raises -- confirm that every input
    # file carries all of these tags.
    d['patientID'] = dataset.PatientID
    d['age'] = dataset.PatientAge
    d['sex'] = dataset.PatientSex
    d['view_position'] = dataset.ViewPosition
    d['pixel_spacing'] = dataset.PixelSpacing
    d["modality"] = dataset.Modality
    d["body_part_examined"] = dataset.BodyPartExamined
    d['pixel_mean'] = np.mean(pixels)
    d['pixel_min'] = np.min(pixels)
    d['pixel_max'] = np.max(pixels)
    return d
# Extract the metadata of every file in train_files in parallel, using one
# worker process per CPU core, and report how long the whole run took.
start = datetime.now()
num_cores = mp.cpu_count()
pool = Pool(num_cores)
try:
    # map() blocks until every file has been processed and returns the
    # per-file dictionaries in the same order as train_files.
    results = pool.map(extract_metadata_from_images, train_files)
finally:
    # Always release the workers, even when a worker raised: close() stops
    # new work from being submitted, join() waits for the processes to exit.
    pool.close()
    pool.join()
print("Total time taken {0}".format(datetime.now() - start))
# Assemble the per-image metadata dictionaries into a single table.
df_meta = pd.DataFrame(data=results)

# Persist the table to disk, then load it back from the pickle.
df_meta.to_pickle('df_meta.pkl')
df_meta = pd.read_pickle('df_meta.pkl')

# Left-join the class labels onto the metadata: 'ImageID' is the key built
# during extraction, 'ImageId' is the key column of class_df.
df_meta = pd.merge(
    df_meta,
    class_df,
    how='left',
    left_on='ImageID',
    right_on='ImageId',
)

# After the left join, rows with a missing 'ImageId' found no label match.
nan_rows = df_meta.loc[df_meta['ImageId'].isna()]
print('There are {0} rows that does not have Class label'.format(len(nan_rows)))

# Keep labelled rows only, and a single row per image.
df_meta.dropna(subset=['ImageId'], inplace=True)
df_meta.drop_duplicates(subset=['ImageId'], inplace=True)

# Mark every row as 'Pneumothorax', then relabel the rows whose
# 'EncodedPixels' value equals "-1".
df_meta['class_'] = 'Pneumothorax'
df_meta.loc[df_meta['EncodedPixels'].eq("-1"), 'class_'] = 'NotPneumothorax'

# Store ages as integers.
df_meta = df_meta.astype({'age': int})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment