Skip to content

Instantly share code, notes, and snippets.

@prerakmody
Created November 24, 2022 13:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prerakmody/9237b618c804ca9b99c1fd21e30de496 to your computer and use it in GitHub Desktop.
Save prerakmody/9237b618c804ca9b99c1fd21e30de496 to your computer and use it in GitHub Desktop.
Histopathology Image Reading
"""
CAMELYON 16 DATASET
- Whole Slide Images (WSI) containing histopathological information on breast cancer
1. Download
- To view the list of AWS resources
- Link: https://aws.amazon.com/marketplace/pp/prodview-exkvqrznup6vc?sr=0-1&ref_=beagle&applicationId=AWSMPContessa#resources
- Click on Resources on AWS --> View Resources
- Single Sample
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/images/tumor_032.tif ./raw/tumor_032.tif
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/tumor_032_mask.tif ./raw/tumor_032_mask.tif
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/annotations/tumor_032.xml ./raw/tumor_032.xml
- Full dataset
- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/images/ ./raw/ # 700GB
- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/ ./raw/ # 8.76GB
2. To view
- Download ASAP: https://github.com/computationalpathologygroup/ASAP/releases
- Ensure that it corresponds with your python version if you want to do programmatic access
- Make sure that the ASAP bin path (e.g. C:\\Program Files\\ASAP 2.1\\bin) is in your sys.path before importing multiresolutionimageinterface
"""
# Import ASAP lib first!
import sys
sys.path.append('C:\\Program Files\\ASAP 2.1\\bin')
import multiresolutionimageinterface as mir
reader = mir.MultiResolutionImageReader()
# Import public libs
import pdb
import tqdm
import time
import json
import shutil
import traceback
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
# ---- Paths -----------------------------------------------------------------
# Both folders are resolved relative to the folder containing this script.
# The temporary folder is created as soon as the module is loaded.
DATASET_NAME = 'CAMELYON16'
DIRNAME_RAW = 'raw'
DIRNAME_TMP = '_tmp'
DIR_RAW = Path(__file__).parent.absolute() / DIRNAME_RAW
DIR_TMP = Path(__file__).parent.absolute() / DIRNAME_TMP
DIR_TMP.mkdir(parents=True, exist_ok=True)

# ---- Filenames -------------------------------------------------------------
# The '{}_{:03d}' prefix is filled with (patient_type, patient_id).
EXT_TIF = '.tif'
POSTFIX_MASK = 'mask.tif'
POSTFIX_INFO_JSON = 'info.json'
FILENAME_IMAGES = '{}_{:03d}' + EXT_TIF
FILENAME_MASKS = '{}_{:03d}_' + POSTFIX_MASK
# Filled with (type, id, level, patch_width, patch_height, patch_pad, min_tissue_perc).
DESCRIPTOR_PATCH = '{}_{:03d}__Lvl{}__P{}x{}-pad{}__perc{:.2f}'
FILENAME_INFO = '__'.join([DESCRIPTOR_PATCH, POSTFIX_INFO_JSON])
FILETYPE_TRAIN_NORMAL = 'normal'
FILETYPE_TRAIN_TUMOR = 'tumor'
FILETYPE_TEST = 'test'

# ---- Mask classes ----------------------------------------------------------
CLASS_BACKGROUND, CLASS_NORMAL, CLASS_TUMOR = range(3)

# ---- Dictionary keys: patch points (stored in the per-slide .json) ----------
KEY_POINTS = 'points'
KEY_POINTS_TOTAL = 'points_total'
KEY_POINTS_TISSUE_TOTAL = 'points_tissue_total'
KEY_POINTS_TUMOR_TOTAL = 'points_tumor_total'
KEY_POINTS_TISSUE = 'points_tissue'
KEY_POINTS_TUMOR = 'points_tumor'

# ---- Dictionary keys: run modes ---------------------------------------------
KEY_TRAIN = 'train'
KEY_EVAL = 'eval'

# ---- Dictionary keys: patient / patch parameters ----------------------------
KEY_PATIENT_ID = 'patient_id'
KEY_PATIENT_TYPE = 'patient_type'
KEY_PATIENT_LEVEL = 'patient_level'
KEY_PATIENT_MINTISSUE_PERC = 'patient_tissue_perc'
KEY_PATCH_WIDTH = 'patch_width'
KEY_PATCH_HEIGHT = 'patch_height'
KEY_PATCH_PAD = 'patch_pad'
KEY_MIN_TISSUE_PERC = 'min_tissue_perc'
KEY_SAVE_IMGS = 'save_imgs'

# ---- Dictionary keys: slide metadata ----------------------------------------
KEY_TOTAL_LEVELS = 'total_levels'
KEY_MAX_IMG_W = 'max_img_w'
KEY_MAX_IMG_H = 'max_img_h'

# ---- Dictionary keys: sampling / dataset selection ---------------------------
KEY_MODE = 'mode'
KEY_PATCHES_TRAIN = 'key_patches_train'
KEY_TUMOR_PERC_TRAIN = 'tumor_perc_train'
KEY_DATASET_TYPE = 'dataset_type'
KEY_DATASET_TRAIN = 'dataset_train'
KEY_DATASET_TEST = 'dataset_test'
######### coordinate convention
# | (0,0)
# | ----------------------> (w)
# |
# |
# |
# V (h)
def parse_patient(params):
    """
    Grid one WSI (whole-slide-image) at a given level and cache the patch grid points in a .json file.

    For every grid point the mask patch is read; the point is recorded as "tissue" when the
    fraction of non-zero mask pixels is >= KEY_MIN_TISSUE_PERC, and additionally as "tumor"
    when the mask patch contains CLASS_TUMOR. The result is written to
    DIR_RAW/<FILENAME_INFO> together with the slide and patch settings.

    Keys read from `params`
     - KEY_PATIENT_TYPE, KEY_PATIENT_ID : select <type>_<id>.tif and <type>_<id>_mask.tif in DIR_RAW
     - KEY_PATIENT_LEVEL                : level passed to the ASAP reader
     - KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT: patch size requested from the reader
     - KEY_PATCH_PAD                    : overlap; the grid spacing is derived from (patch size - pad)
     - KEY_MIN_TISSUE_PERC              : minimum non-zero mask fraction for a patch to be kept
     - KEY_SAVE_IMGS                    : if truthy, save debug .png files (all tumor patches, ~5% of
                                          the other tissue patches) in a per-slide folder under DIR_TMP
                                          and print a short summary

    Returns None. Errors are printed together with their traceback instead of being raised.
    If the image or the mask file is missing, an error is printed and no .json is written.

    Points are stored as [w, h] in grid-scan order. They are NOT sorted: sorting inside each
    pair (np.sort(..., axis=1)) would swap w and h whenever w > h.
    """

    t0 = time.time()

    # Pre-bind so the error handler below can always format its message
    patient_type = patient_id = None

    try:

        # Step 0 - Params
        patient_id = params[KEY_PATIENT_ID]
        patient_type = params[KEY_PATIENT_TYPE]
        patient_level = params[KEY_PATIENT_LEVEL]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patch_pad = params[KEY_PATCH_PAD]
        total_patch_pixels = patch_width * patch_height
        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
        save_imgs = params[KEY_SAVE_IMGS]

        # Step 1 - Read Data
        path_img = Path(DIR_RAW).joinpath(FILENAME_IMAGES.format(patient_type, patient_id))
        path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
        if not (Path(path_img).exists() and Path(path_mask).exists()):
            print (' - \n [ERROR][parse_patient()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print (' -- path_img : ', path_img)
            print (' -- path_mask: ', path_mask)
            print ('')
            return

        wsi_img = reader.open(str(path_img))
        wsi_mask = reader.open(str(path_mask))
        ds_factor = wsi_mask.getLevelDownsample(patient_level)

        # Step 2 - Grid the WSI and save the (wmin, hmin) coords
        img_max_w, img_max_h = wsi_img.getLevelDimensions(patient_level)
        grid_count_w = int(img_max_w // (patch_width - patch_pad)) + 1
        grid_count_h = int(img_max_h // (patch_height - patch_pad)) + 1
        points_w, points_h = np.meshgrid(np.linspace(0, img_max_w, grid_count_w), np.linspace(0, img_max_h, grid_count_h))
        points_w, points_h = points_w.astype(int).flatten(), points_h.astype(int).flatten()
        patches_total = len(points_w)

        # Step 3 - Prep for saving
        points_tissue = []
        points_tumor = []
        DIR_TMP_PATIENT = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total))
        if save_imgs:
            # The debug folder is only needed when images are dumped; skipping it otherwise avoids empty folders in DIR_TMP
            Path(DIR_TMP_PATIENT).mkdir(exist_ok=True, parents=True)

        with tqdm.tqdm(total=patches_total, leave=False, desc=' - [{}] '.format(FILENAME_IMAGES.format(patient_type, patient_id))) as pbar_patient:
            for point_w, point_h in zip(points_w, points_h):

                # Step 3.1 - Get mask patch (the patch origin is scaled by the level's downsample factor)
                origin_w = int((point_w - patch_pad//2) * ds_factor)
                origin_h = int((point_h - patch_pad//2) * ds_factor)
                wsi_patch_mask = np.array(wsi_mask.getUCharPatch(origin_w, origin_h, patch_width, patch_height, patient_level))

                # Step 3.2 - Check if mask patch contains >= min_tissue_perc
                wsi_patch_mask_tumor_bool = CLASS_TUMOR in wsi_patch_mask
                wsi_patch_mask_tissueperc = np.count_nonzero(wsi_patch_mask) / total_patch_pixels
                if wsi_patch_mask_tissueperc >= min_tissue_perc:
                    points_tissue.append([int(point_w), int(point_h)])

                    if save_imgs:
                        # Tumor patches are always saved, other tissue patches with 5% probability
                        tumor_str = '__Tumor' if wsi_patch_mask_tumor_bool else ''
                        show_perc = 1.0 if wsi_patch_mask_tumor_bool else 0.05
                        if np.random.random() < show_perc:
                            wsi_patch_image = np.array(wsi_img.getUCharPatch(origin_w, origin_h, patch_width, patch_height, patient_level))
                            f, axarr = plt.subplots(1, 2)
                            axarr[0].imshow(wsi_patch_image)
                            op = axarr[1].imshow(wsi_patch_mask, cmap='magma', alpha=1.0, vmin=CLASS_BACKGROUND, vmax=CLASS_TUMOR)
                            plt.colorbar(op, ax=axarr.ravel().tolist())
                            plt.suptitle('Level={} \n (patch=({}, {})(pad={}) from img=({},{})) \n Clases=(Bgd={}, Normal={}, Tumor={})'.format(patient_level, patch_width, patch_height, patch_pad, img_max_w, img_max_h, CLASS_BACKGROUND, CLASS_NORMAL, CLASS_TUMOR))
                            plt.savefig(str(DIR_TMP_PATIENT.joinpath('{}-{:03d}__{:06d}-{:06d}__{:.3f}{}.png'.format(patient_type, patient_id, int(point_w), int(point_h), wsi_patch_mask_tissueperc, tumor_str))))
                            plt.close()

                    # NOTE(review): tumor points are recorded only for patches that passed the tissue threshold - confirm against the original layout
                    if wsi_patch_mask_tumor_bool:
                        points_tumor.append([int(point_w), int(point_h)])

                pbar_patient.update(1)

        # Step 4 - Finalize (reshape keeps an (N, 2) layout even when no point was found)
        points_tissue = np.array(points_tissue, dtype=int).reshape(-1, 2)
        points_tumor = np.array(points_tumor, dtype=int).reshape(-1, 2)

        if save_imgs:
            print (' - Total Patches = ', patches_total)
            print (' - Total patches(tissue) = ', points_tissue.shape)
            print (' - Total patches(tumor) = ', points_tumor.shape)
            print (' - Total time taken : {:.2f}'.format(time.time() - t0) )

            # Rename the debug folder so that it also carries the tissue/tumor patch counts
            DIR_TMP_PATIENT2 = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}-{}-{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total, len(points_tissue), len(points_tumor)))
            if Path(DIR_TMP_PATIENT2).exists():
                shutil.rmtree(DIR_TMP_PATIENT2)
            shutil.move(src=str(DIR_TMP_PATIENT), dst=str(DIR_TMP_PATIENT2))

        # Step 5 - Save level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        data_json = {
            KEY_PATIENT_TYPE          : patient_type
            , KEY_PATIENT_ID          : patient_id
            , KEY_PATIENT_LEVEL       : patient_level
            , KEY_TOTAL_LEVELS        : wsi_img.getNumberOfLevels()
            , KEY_MAX_IMG_W           : img_max_w
            , KEY_MAX_IMG_H           : img_max_h
            , KEY_PATCH_WIDTH         : patch_width
            , KEY_PATCH_HEIGHT        : patch_height
            , KEY_PATCH_PAD           : patch_pad
            , KEY_MIN_TISSUE_PERC     : min_tissue_perc
            , KEY_POINTS_TOTAL        : patches_total
            , KEY_POINTS_TISSUE_TOTAL : len(points_tissue)
            , KEY_POINTS_TUMOR_TOTAL  : len(points_tumor)
            , KEY_POINTS_TISSUE       : points_tissue.tolist()
            , KEY_POINTS_TUMOR        : points_tumor.tolist()
        }
        with open(str(path_json), 'w') as fp:
            json.dump(data_json, fp, indent=4)

    except Exception:
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
def get_patient_patches(params):
    """
    Return the [w, h] patch points of one patient, read from its cached level-based .json file.

    If the .json file does not exist yet, parse_patient(params) is called first to create it.

    Keys read from `params`
     - KEY_PATIENT_TYPE, KEY_PATIENT_ID, KEY_PATIENT_LEVEL, KEY_MIN_TISSUE_PERC,
       KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATCH_PAD : identify the .json file
     - KEY_MODE             : KEY_TRAIN or KEY_EVAL
     - KEY_PATCHES_TRAIN    : number of points to sample in KEY_TRAIN mode
     - KEY_TUMOR_PERC_TRAIN : probability of drawing a sample from the tumor points (used only
                              when the patient has tumor points)

    Returns
     - KEY_TRAIN : a list of KEY_PATCHES_TRAIN randomly drawn points, each shifted by a random
                   offset in [0, patch_pad) per axis (no shift when patch_pad is 0)
     - KEY_EVAL  : all tissue points stored in the .json file
     - []        : for any other mode, when the .json file is unavailable, when the patient has
                   no tissue points, or when an error occurred (errors are printed, not raised)
    """

    res = []

    # Pre-bind so the error handler below can always format its message
    patient_type = patient_id = None

    try:

        # Step 1 - Params - Patient
        patient_id = params[KEY_PATIENT_ID]
        patient_type = params[KEY_PATIENT_TYPE]
        patient_level = params[KEY_PATIENT_LEVEL]
        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patch_pad = params[KEY_PATCH_PAD]
        mode = params[KEY_MODE] # [KEY_TRAIN, KEY_EVAL]
        patches_train = params[KEY_PATCHES_TRAIN]
        patches_perc_tumor_train = params[KEY_TUMOR_PERC_TRAIN]

        # Step 2 - Read .json file (create it first if needed)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        if not Path(path_json).exists():
            parse_patient(params)

        if not Path(path_json).exists():
            print (' - \n [ERROR][get_patient_patches()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print (' -- path_json : ', path_json)
            print ('')
            return res

        with open(str(path_json), 'r') as fp:
            json_data = json.load(fp)

        # Step 2.2 - Extract vals from .json file
        points_tissue = json_data[KEY_POINTS_TISSUE]
        points_tumor = json_data[KEY_POINTS_TUMOR]

        # Step 2.3 - Extract random patches if in training mode
        if mode == KEY_TRAIN:
            # Nothing can be sampled from a slide without tissue points (randint(0, 0) would raise)
            if len(points_tissue):
                for _ in range(patches_train):
                    # Draw from the tumor points with the requested probability, else from all tissue points
                    pool = points_tissue
                    if len(points_tumor) and np.random.random() < patches_perc_tumor_train:
                        pool = points_tumor
                    points = np.array(pool[np.random.randint(0, len(pool))])

                    # Random shift within the pad; randint(0, 0) is invalid, so skip it when there is no pad
                    if patch_pad > 0:
                        points = points + np.random.randint(0, patch_pad, 2)
                    res.append(points.tolist())

        elif mode == KEY_EVAL:
            res = points_tissue

    except Exception:
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

    return res
def generator(params):
    """
    Yield (image patch, mask patch) pairs for every selected patch of every slide in DIR_RAW.

    Step 1 collects, per slide, the patch points returned by get_patient_patches().
    Step 2 opens each slide and its mask with the ASAP reader and yields one
    (wsi_patch_img, wsi_patch_mask) pair of numpy arrays per point.

    Keys read from `params`
     - KEY_DATASET_TYPE : KEY_DATASET_TRAIN selects the 'normal'/'tumor' slides, anything else the
                          'test' slides (KEY_TRAIN is accepted as an alias of KEY_DATASET_TRAIN)
     - KEY_PATCH_PAD, KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATIENT_LEVEL : patch geometry
     - plus every key needed by get_patient_patches()

    `params` itself is not modified; a per-slide copy carrying KEY_PATIENT_ID / KEY_PATIENT_TYPE
    is handed to get_patient_patches().

    Errors are printed with their traceback, after which the debugger is entered.
    Closing the generator early (e.g. `break` in the consuming loop) is not treated as an error.
    """

    try:

        # Step 0 - Init
        res = {}

        # Step 1 - Get (wmin,hmin) for patient patches
        # Step 1.1 - Get paths as per KEY_DATASET_TYPE (only .tif slides; masks, info .json and e.g. .xml annotations are skipped)
        dataset_type = params[KEY_DATASET_TYPE]
        patient_paths_imgs = sorted(
            each for each in Path(DIR_RAW).glob('*')
            if each.name.endswith(EXT_TIF) and POSTFIX_MASK not in each.name and POSTFIX_INFO_JSON not in each.name
        )
        if dataset_type in (KEY_DATASET_TRAIN, KEY_TRAIN):
            patient_paths_imgs = [each for each in patient_paths_imgs if (FILETYPE_TRAIN_NORMAL in each.name or FILETYPE_TRAIN_TUMOR in each.name)]
        else:
            patient_paths_imgs = [each for each in patient_paths_imgs if FILETYPE_TEST in each.name]

        # Step 1.2 - Loop over the paths and get (wmin,hmin) for patches
        with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
            for patient_path_img in patient_paths_imgs:
                try:
                    # Expected name: <type>_<id>.tif, e.g. tumor_032.tif
                    patient_type = patient_path_img.name.split('_')[0]
                    patient_id = int(patient_path_img.name.split('_')[1].split(EXT_TIF)[0])
                except (IndexError, ValueError):
                    print (' - [generator()] Skipping file with unexpected name: ', patient_path_img)
                    pbar.update(1)
                    continue

                # Work on a copy so that the caller's dict is left untouched
                patient_params = dict(params)
                patient_params[KEY_PATIENT_ID] = patient_id
                patient_params[KEY_PATIENT_TYPE] = patient_type

                patient_points = get_patient_patches(patient_params)
                res[FILENAME_IMAGES.format(patient_type, patient_id)] = {
                    KEY_POINTS: patient_points
                    , KEY_PATIENT_ID: patient_id
                    , KEY_PATIENT_TYPE: patient_type
                    , KEY_POINTS_TOTAL: len(patient_points)
                }
                pbar.update(1)

        # Step 2 - Loop over the patch points
        patch_pad = params[KEY_PATCH_PAD]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patient_level = params[KEY_PATIENT_LEVEL]
        SAMPLES_TOTAL = sum(patient_obj[KEY_POINTS_TOTAL] for patient_obj in res.values())

        with tqdm.tqdm(total=SAMPLES_TOTAL) as pbar_generator:
            for patient_obj in res.values():

                # Slides without any point (e.g. missing mask or .json) need not be opened
                if not patient_obj[KEY_POINTS]:
                    continue

                patient_id = patient_obj[KEY_PATIENT_ID]
                patient_type = patient_obj[KEY_PATIENT_TYPE]
                path_img = Path(DIR_RAW).joinpath(FILENAME_IMAGES.format(patient_type, patient_id))
                wsi_img = reader.open(str(path_img))
                path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
                wsi_mask = reader.open(str(path_mask))
                ds_factor = wsi_mask.getLevelDownsample(patient_level)

                for point in patient_obj[KEY_POINTS]:
                    # The patch origin is scaled by the level's downsample factor
                    origin_w = int((point[0] - patch_pad//2) * ds_factor)
                    origin_h = int((point[1] - patch_pad//2) * ds_factor)
                    wsi_patch_mask = np.array(wsi_mask.getUCharPatch(origin_w, origin_h, patch_width, patch_height, patient_level))
                    wsi_patch_img = np.array(wsi_img.getUCharPatch(origin_w, origin_h, patch_width, patch_height, patient_level))

                    pbar_generator.update(1)
                    yield (wsi_patch_img, wsi_patch_mask)

    except Exception:
        # `Exception` (not a bare except) so that GeneratorExit / KeyboardInterrupt pass through
        print ('\n - [ERROR][generator()] ')
        traceback.print_exc()
        pdb.set_trace()
if __name__ == "__main__":

    # Toggles for the two stages of this script
    RUN_STEP1_PARSE_ALL = False   # pre-compute the level-based .json file of every slide in DIR_RAW
    RUN_STEP2_GENERATOR = True    # loop over all samples of the generator

    try:

        params = {
            KEY_PATIENT_LEVEL: 2,
            KEY_MIN_TISSUE_PERC: 0.1,
            KEY_PATCH_WIDTH: 512,
            KEY_PATCH_HEIGHT: 512,
            KEY_PATCH_PAD: 32,
            KEY_SAVE_IMGS: False,
            # KEY_DATASET_TRAIN -> [KEY_TRAIN, KEY_EVAL], KEY_DATASET_TEST -> [KEY_EVAL]
            KEY_DATASET_TYPE: KEY_DATASET_TRAIN,
            # [KEY_TRAIN, KEY_EVAL]; in train we have 270 WSIs, in test we have 129 WSIs
            KEY_MODE: KEY_TRAIN,
            KEY_PATCHES_TRAIN: 1000,
            KEY_TUMOR_PERC_TRAIN: 0.5,
        }

        # Step 1 - Extract a level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        if RUN_STEP1_PARSE_ALL:
            patient_paths_imgs = []
            for each in Path(DIR_RAW).glob('*'):
                if POSTFIX_MASK in each.name or POSTFIX_INFO_JSON in each.name:
                    continue
                patient_paths_imgs.append(each)

            with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
                for patient_path_img in patient_paths_imgs:
                    name_parts = Path(patient_path_img).name.split('_')
                    params[KEY_PATIENT_ID] = int(name_parts[1].split(EXT_TIF)[0])
                    params[KEY_PATIENT_TYPE] = name_parts[0]
                    parse_patient(params)
                    pbar.update(1)

        # Step 2 - Use the level-based .json file and loop over the samples to understand the speed
        if RUN_STEP2_GENERATOR:
            for (X, Y) in generator(params):
                pass

    except:
        print ('\n - [__main__] ')
        traceback.print_exc()
        pdb.set_trace()

    # Drop into the debugger once the run has finished
    pdb.set_trace()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment