# prerakmody/camelyon16.py

## camelyon16.py
"""
CAMELYON 16 DATASET
- Whole Slide Images (WSI) containing histopathological information on breast cancer

1. Download
 - To view the list of AWS resources
    - Link: https://aws.amazon.com/marketplace/pp/prodview-exkvqrznup6vc?sr=0-1&ref_=beagle&applicationId=AWSMPContessa#resources
    - Click on Resources on AWS --> View Resources
 - Single Sample
   - aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/images/tumor_032.tif ./raw/tumor_032.tif
   - aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/tumor_032_mask.tif ./raw/tumor_032_mask.tif
   - aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/annotations/tumor_032.xml ./raw/tumor_032.xml
 - Full dataset
   - aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/images/ ./raw/ # 700GB
   - aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/ ./raw/  # 8.76GB

2. To view
 - Download ASAP: https://github.com/computationalpathologygroup/ASAP/releases
 - Ensure that it corresponds with your python version if you want to do programmatic access
 - Make sure that the ASAP bin path (e.g. C:\\Program Files\\ASAP 2.1\\bin) is in your sys.path
"""

# Import ASAP lib first!
import sys
sys.path.append('C:\\Program Files\\ASAP 2.1\\bin')
import multiresolutionimageinterface as mir
reader = mir.MultiResolutionImageReader()

# Import public libs
import pdb
import tqdm
import time
import json
import shutil
import traceback
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Init keys - paths
# Both directories are resolved relative to the folder containing this file.
DATASET_NAME = 'CAMELYON16'
DIRNAME_RAW  = 'raw'
DIRNAME_TMP  = '_tmp'
DIR_RAW = Path(__file__).parent.absolute().joinpath(DIRNAME_RAW) # WSI images/masks are read from here; the *__info.json files are written here
DIR_TMP = Path(__file__).parent.absolute().joinpath(DIRNAME_TMP) # per-patient folders for preview .png files
Path(DIR_TMP).mkdir(exist_ok=True, parents=True) # NOTE: side effect at import time - the tmp directory is created as soon as this module is loaded

# Init keys - filenames
# The '{}' placeholders are filled via str.format() with (patient_type, patient_id, ...)
EXT_TIF          = '.tif'
POSTFIX_MASK     = 'mask.tif'   # substring used to exclude mask files when listing DIR_RAW
POSTFIX_INFO_JSON = 'info.json' # substring used to exclude generated .json files when listing DIR_RAW
FILENAME_IMAGES  = '{}_{:03d}.tif'      # (patient_type, patient_id)
FILENAME_MASKS   = '{}_{:03d}_mask.tif' # (patient_type, patient_id)
DESCRIPTOR_PATCH = '{}_{:03d}__Lvl{}__P{}x{}-pad{}__perc{:.2f}' # (patient_type, patient_id, level, patch_width, patch_height, patch_pad, min_tissue_perc)
FILENAME_INFO    = DESCRIPTOR_PATCH + '__info.json'
FILETYPE_TRAIN_NORMAL = 'normal'
FILETYPE_TRAIN_TUMOR  = 'tumor'
FILETYPE_TEST         = 'test'

# Init keys - classes
# Values compared against / displayed for the mask patches
CLASS_BACKGROUND = 0
CLASS_NORMAL = 1
CLASS_TUMOR  = 2

# Init keys - miscellaneous
# String keys used for the params dict, the per-patient .json file and the generator's bookkeeping dict
KEY_POINTS                 = 'points'
KEY_POINTS_TOTAL           = 'points_total'
KEY_POINTS_TISSUE_TOTAL    = 'points_tissue_total'
KEY_POINTS_TUMOR_TOTAL     = 'points_tumor_total'
KEY_POINTS_TISSUE          = 'points_tissue'
KEY_POINTS_TUMOR           = 'points_tumor'
KEY_TRAIN                  = 'train' # value for KEY_MODE
KEY_EVAL                   = 'eval'  # value for KEY_MODE
KEY_PATIENT_ID             = 'patient_id'
KEY_PATIENT_TYPE           = 'patient_type'
KEY_PATIENT_LEVEL          = 'patient_level'
KEY_PATIENT_MINTISSUE_PERC = 'patient_tissue_perc' # NOTE(review): only mentioned in comments in the code visible here; KEY_MIN_TISSUE_PERC is the key actually read
KEY_PATCH_WIDTH            = 'patch_width'
KEY_PATCH_HEIGHT           = 'patch_height'
KEY_PATCH_PAD              = 'patch_pad'
KEY_MIN_TISSUE_PERC        = 'min_tissue_perc'
KEY_SAVE_IMGS              = 'save_imgs'
KEY_TOTAL_LEVELS           = 'total_levels'
KEY_MAX_IMG_W              = 'max_img_w'
KEY_MAX_IMG_H              = 'max_img_h'
KEY_MODE                   = 'mode'
KEY_PATCHES_TRAIN          = 'key_patches_train' # number of patch origins sampled per patient in train mode
KEY_TUMOR_PERC_TRAIN       = 'tumor_perc_train'  # probability of sampling from the tumor points in train mode
KEY_DATASET_TYPE           = 'dataset_type'
KEY_DATASET_TRAIN          = 'dataset_train' # value for KEY_DATASET_TYPE
KEY_DATASET_TEST           = 'dataset_test'  # value for KEY_DATASET_TYPE

######### coordinate convention
# | (0,0)
# | ----------------------> (w)
# |
# |
# |
# V (h)


def _sort_points(points):
    """
    Return `points` as an (N, 2) int array ordered by (w, h).

    Whole rows are reordered, so every (w, h) pair stays intact. An empty
    input gives an array of shape (0, 2), whose .tolist() is [].
    """
    points_arr = np.array(points, dtype=int).reshape(-1, 2)
    if len(points_arr):
        # np.lexsort treats the LAST key as the primary one -> order by w, then by h
        points_arr = points_arr[np.lexsort((points_arr[:, 1], points_arr[:, 0]))]
    return points_arr


def parse_patient(params):
    """
    Extract a level-based .json file of patch origins (wmin, hmin) for one WSI (whole-slide-image).

    The slide is gridded at params[KEY_PATIENT_LEVEL] with a step of roughly (patch size - patch pad).
    A grid point is stored under KEY_POINTS_TISSUE when the fraction of non-zero mask pixels of its
    patch is >= params[KEY_MIN_TISSUE_PERC], and under KEY_POINTS_TUMOR when its mask patch contains
    CLASS_TUMOR (independent of the tissue fraction). The result is written to DIR_RAW using the
    FILENAME_INFO naming scheme.

    Params (keys read from the `params` dict)
    -----------------------------------------
    KEY_PATIENT_ID, KEY_PATIENT_TYPE                 : select the image/mask files inside DIR_RAW
    KEY_PATIENT_LEVEL                                : WSI level the patches are read at
    KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATCH_PAD : patch geometry
    KEY_MIN_TISSUE_PERC                              : minimum tissue fraction for a tissue point
    KEY_SAVE_IMGS                                    : if truthy, save preview .png files in DIR_TMP and print stats

    Returns
    -------
    None. Missing files and unexpected errors are printed (with traceback) instead of being raised.
    """

    t0 = time.time()
    patient_id = patient_type = None # bound up-front so that the error handler can always report them

    try:

        # Step 0 - Params
        patient_id    = params[KEY_PATIENT_ID]
        patient_type  = params[KEY_PATIENT_TYPE]
        patient_level = params[KEY_PATIENT_LEVEL]

        patch_width   = params[KEY_PATCH_WIDTH]
        patch_height  = params[KEY_PATCH_HEIGHT]
        patch_pad     = params[KEY_PATCH_PAD]
        total_patch_pixels = patch_width * patch_height

        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
        save_imgs       = params[KEY_SAVE_IMGS]

        # Step 1 - Read Data
        filename_img = FILENAME_IMAGES.format(patient_type, patient_id)
        path_img     = Path(DIR_RAW).joinpath(filename_img)
        path_mask    = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))

        if not (path_img.exists() and path_mask.exists()):
            print (' - \n [ERROR][parse_patient()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print ('    -- path_img : ', path_img)
            print ('    -- path_mask: ', path_mask)
            print ('')
            return

        wsi_img   = reader.open(str(path_img))
        wsi_mask  = reader.open(str(path_mask))
        ds_factor = wsi_mask.getLevelDownsample(patient_level)

        # Step 2 - Grid the WSI and save the (wmin, hmin) coords
        img_max_w, img_max_h = wsi_img.getLevelDimensions(patient_level)
        grid_w = np.linspace(0, img_max_w, int(img_max_w//(patch_width - patch_pad))+1)
        grid_h = np.linspace(0, img_max_h, int(img_max_h//(patch_height - patch_pad))+1)
        points_w, points_h = np.meshgrid(grid_w, grid_h)
        points_w, points_h = points_w.astype(int).flatten(), points_h.astype(int).flatten()
        patches_total      = len(points_w)

        # Step 3 - Prep for saving
        points_tissue   = []
        points_tumor    = []
        DIR_TMP_PATIENT = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total))
        DIR_TMP_PATIENT.mkdir(exist_ok=True, parents=True)

        with tqdm.tqdm(total=patches_total, leave=False, desc='  - [{}] '.format(filename_img)) as pbar_patient:
            for point_w, point_h in zip(points_w, points_h):

                # Step 3.1 - Get mask patch (the grid point is shifted by half the pad and scaled by the level's downsample factor)
                patch_x = int((point_w - patch_pad//2) * ds_factor)
                patch_y = int((point_h - patch_pad//2) * ds_factor)
                wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))

                # Step 3.2 - Check if mask patch contains >= min_tissue_perc
                has_tumor   = CLASS_TUMOR in wsi_patch_mask
                tissue_perc = np.count_nonzero(wsi_patch_mask) / total_patch_pixels
                if tissue_perc >= min_tissue_perc:
                    points_tissue.append([point_w, point_h])

                    # Step 3.3 - Previews: every tumor patch, and a random ~5% of the remaining tissue patches
                    if save_imgs and np.random.random() < (1.0 if has_tumor else 0.05):
                        tumor_str = '__Tumor' if has_tumor else ''
                        wsi_patch_image = np.array(wsi_img.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))
                        f,axarr = plt.subplots(1,2)
                        axarr[0].imshow(wsi_patch_image)
                        op = axarr[1].imshow(wsi_patch_mask, cmap='magma', alpha=1.0, vmin=CLASS_BACKGROUND, vmax=CLASS_TUMOR)
                        plt.colorbar(op, ax=axarr.ravel().tolist())
                        plt.suptitle('Level={} \n (patch=({}, {})(pad={}) from img=({},{})) \n Clases=(Bgd={}, Normal={}, Tumor={})'.format(patient_level, patch_width, patch_height, patch_pad, img_max_w, img_max_h, CLASS_BACKGROUND, CLASS_NORMAL, CLASS_TUMOR))

                        plt.savefig(str(DIR_TMP_PATIENT.joinpath('{}-{:03d}__{:06d}-{:06d}__{:.3f}{}.png'.format(patient_type, patient_id, int(point_w), int(point_h), tissue_perc, tumor_str))))
                        plt.close()

                if has_tumor:
                    points_tumor.append([point_w, point_h])

                pbar_patient.update(1)

        # Step 4 - Finalize (sort whole rows; sorting inside a row would swap w and h)
        points_tissue = _sort_points(points_tissue)
        points_tumor  = _sort_points(points_tumor)
        if save_imgs:
            print (' - Total Patches         = ', patches_total)
            print (' - Total patches(tissue) = ', points_tissue.shape)
            print (' - Total patches(tumor)  = ', points_tumor.shape)
            print (' - Total time taken      : {:.2f}'.format(time.time() - t0) )

        # Rename the tmp folder so that its name also carries the tissue/tumor counts
        DIR_TMP_PATIENT2 = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}-{}-{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total, len(points_tissue), len(points_tumor)))
        if DIR_TMP_PATIENT2.exists():
            shutil.rmtree(DIR_TMP_PATIENT2)
        shutil.move(src=str(DIR_TMP_PATIENT), dst=str(DIR_TMP_PATIENT2))

        # Step 5 - Save level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        data_json = {
            KEY_PATIENT_TYPE          : patient_type,
            KEY_PATIENT_ID            : patient_id,
            KEY_PATIENT_LEVEL         : patient_level,
            KEY_TOTAL_LEVELS          : wsi_img.getNumberOfLevels(),
            KEY_MAX_IMG_W             : img_max_w,
            KEY_MAX_IMG_H             : img_max_h,
            KEY_PATCH_WIDTH           : patch_width,
            KEY_PATCH_HEIGHT          : patch_height,
            KEY_PATCH_PAD             : patch_pad,
            KEY_MIN_TISSUE_PERC       : min_tissue_perc,
            KEY_POINTS_TOTAL          : patches_total,
            KEY_POINTS_TISSUE_TOTAL   : len(points_tissue),
            KEY_POINTS_TUMOR_TOTAL    : len(points_tumor),
            KEY_POINTS_TISSUE         : points_tissue.tolist(),
            KEY_POINTS_TUMOR          : points_tumor.tolist(),
        }
        with open(str(path_json), 'w') as fp:
            json.dump(data_json, fp, indent=4)

    except Exception: # not a bare except: Ctrl-C must still be able to stop a long parsing run
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

def get_patient_patches(params):
    """
    Return patch origins (wmin, hmin) for one patient, based on its level-based .json file.

    If the .json file (FILENAME_INFO inside DIR_RAW) does not exist yet, parse_patient(params) is
    called first to create it.

    Params (keys read from the `params` dict, in addition to those read by parse_patient())
    ---------------------------------------------------------------------------------------
    KEY_MODE             : KEY_TRAIN -> params[KEY_PATCHES_TRAIN] randomly sampled points, each shifted
                                        by a random offset in [0, patch_pad) per axis
                           KEY_EVAL  -> all tissue points stored in the .json file
    KEY_PATCHES_TRAIN    : number of points to sample in train mode
    KEY_TUMOR_PERC_TRAIN : probability of sampling from the tumor points (when the patient has any)

    Returns
    -------
    list of [w, h] lists. Empty when the .json file is unavailable, when there is nothing to sample
    from, when the mode is unknown, or when an error occurred (errors are printed, not raised).
    """

    res = []
    patient_id = patient_type = None # bound up-front so that the error handler can always report them

    try:

        # Step 1 - Params - Patient
        patient_id      = params[KEY_PATIENT_ID]
        patient_type    = params[KEY_PATIENT_TYPE]
        patient_level   = params[KEY_PATIENT_LEVEL]
        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]

        patch_width   = params[KEY_PATCH_WIDTH]
        patch_height  = params[KEY_PATCH_HEIGHT]
        patch_pad     = params[KEY_PATCH_PAD]

        mode                     = params[KEY_MODE] # [KEY_TRAIN, KEY_EVAL]
        patches_train            = params[KEY_PATCHES_TRAIN]
        patches_perc_tumor_train = params[KEY_TUMOR_PERC_TRAIN]

        # Step 2 - Read .json file (create it on demand)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        if not path_json.exists():
            parse_patient(params)

        if not path_json.exists():
            print (' - \n [ERROR][get_patient_patches()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print ('    -- path_json : ', path_json)
            print ('')
            return res

        with open(str(path_json), 'r') as fp:
            json_data = json.load(fp)

        # Step 2.2 - Extract vals from .json file
        points_tissue = json_data[KEY_POINTS_TISSUE]
        points_tumor  = json_data[KEY_POINTS_TUMOR]

        # Step 2.3 - Extract random patches if in training mode
        if mode == KEY_TRAIN:

            for _ in range(patches_train):

                # Tumor points are used with probability patches_perc_tumor_train, or always when
                # there are no tissue points (np.random.randint(0, 0) would raise)
                pick_tumor = bool(points_tumor) and (not points_tissue or np.random.random() < patches_perc_tumor_train)
                if not pick_tumor and not points_tissue:
                    break # nothing to sample from

                pool  = points_tumor if pick_tumor else points_tissue
                point = np.array(pool[np.random.randint(0, len(pool))])

                # Random shift within the pad; skipped for patch_pad == 0 where randint(0, 0) would raise
                if patch_pad > 0:
                    point = point + np.random.randint(0, patch_pad, 2)
                res.append(point.tolist())

        elif mode == KEY_EVAL:
            res = points_tissue

    except Exception: # not a bare except: Ctrl-C must still be able to stop the caller's loop
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

    return res

def generator(params):
    """
    Yield (wsi_patch_img, wsi_patch_mask) numpy arrays for every selected patch of every WSI in DIR_RAW.

    Params (keys read from the `params` dict, in addition to those read by get_patient_patches())
    --------------------------------------------------------------------------------------------
    KEY_DATASET_TYPE : KEY_DATASET_TRAIN (KEY_TRAIN is accepted as well) selects the 'normal' and
                       'tumor' slides; any other value selects the 'test' slides
    KEY_PATCH_PAD, KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATIENT_LEVEL : patch geometry and WSI level

    The patch origins of all patients are collected first (via get_patient_patches()) and the
    patches are read afterwards, patient by patient. The caller's `params` dict is not modified.

    Errors are printed with a traceback and then a pdb session is opened; nothing is re-raised.
    """

    try:

        # Step 0 - Init
        res = {}

        # Step 1 - Get (wmin,hmin) for patient patches
        # Step 1.1 - Get paths as per KEY_DATASET_TYPE
        dataset_type = params[KEY_DATASET_TYPE]
        if dataset_type in (KEY_DATASET_TRAIN, KEY_TRAIN):
            filetypes = (FILETYPE_TRAIN_NORMAL, FILETYPE_TRAIN_TUMOR)
        else:
            filetypes = (FILETYPE_TEST,)

        # Only image .tif files: masks, generated .json files and e.g. .xml annotations are skipped
        # (the patient id cannot be parsed from their names)
        patient_paths_imgs = [
            each for each in sorted(Path(DIR_RAW).glob('*' + EXT_TIF))
            if POSTFIX_MASK not in each.name and any(filetype in each.name for filetype in filetypes)
        ]

        # Step 1.2 - Loop over the paths and get (wmin,hmin) for patches
        with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
            for patient_path_img in patient_paths_imgs:
                name_parts   = patient_path_img.name.split('_')
                patient_type = name_parts[0]
                patient_id   = int(name_parts[1].split(EXT_TIF)[0])

                # Work on a copy so that the caller's dict is left untouched
                patient_params = dict(params)
                patient_params[KEY_PATIENT_ID]   = patient_id
                patient_params[KEY_PATIENT_TYPE] = patient_type

                points = get_patient_patches(patient_params)
                res[FILENAME_IMAGES.format(patient_type, patient_id)] = {
                    KEY_POINTS         : points,
                    KEY_PATIENT_ID     : patient_id,
                    KEY_PATIENT_TYPE   : patient_type,
                    KEY_POINTS_TOTAL   : len(points),
                }
                pbar.update(1)

        # Step 2 - Loop over the patch points
        patch_pad     = params[KEY_PATCH_PAD]
        patch_width   = params[KEY_PATCH_WIDTH]
        patch_height  = params[KEY_PATCH_HEIGHT]
        patient_level = params[KEY_PATIENT_LEVEL]

        SAMPLES_TOTAL = sum(patient_obj[KEY_POINTS_TOTAL] for patient_obj in res.values())

        with tqdm.tqdm(total=SAMPLES_TOTAL) as pbar_generator:
            for patient_obj in res.values():

                # No points (e.g. missing mask or failed parsing) -> no reason to open the slide
                if not patient_obj[KEY_POINTS_TOTAL]:
                    continue

                patient_id   = patient_obj[KEY_PATIENT_ID]
                patient_type = patient_obj[KEY_PATIENT_TYPE]

                path_img  = Path(DIR_RAW).joinpath(FILENAME_IMAGES.format(patient_type, patient_id))
                wsi_img   = reader.open(str(path_img))
                path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
                wsi_mask  = reader.open(str(path_mask))
                ds_factor = wsi_mask.getLevelDownsample(patient_level)

                for point in patient_obj[KEY_POINTS]:

                    # Same origin for image and mask: shifted by half the pad, scaled by the level's downsample factor
                    patch_x = int((point[0] - patch_pad//2) * ds_factor)
                    patch_y = int((point[1] - patch_pad//2) * ds_factor)
                    wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))
                    wsi_patch_img  = np.array(wsi_img.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))

                    pbar_generator.update(1)
                    yield(wsi_patch_img, wsi_patch_mask)

    except Exception: # not a bare except: GeneratorExit (consumer stopped early) must not be reported as an error
        print ('\n - [ERROR][generator()] ')
        traceback.print_exc()
        pdb.set_trace()

if __name__ == "__main__":

    # Stage toggles of this script
    RUN_PARSE_ALL = False # Step 1 - (re)build the level-based .json file of every WSI in DIR_RAW
    RUN_GENERATOR = True  # Step 2 - iterate over the generator to get a feeling for its speed

    try:

        params = {
            KEY_PATIENT_LEVEL    : 2,
            KEY_MIN_TISSUE_PERC  : 0.1,
            KEY_PATCH_WIDTH      : 512,
            KEY_PATCH_HEIGHT     : 512,
            KEY_PATCH_PAD        : 32,
            KEY_SAVE_IMGS        : False,
            KEY_DATASET_TYPE     : KEY_DATASET_TRAIN, # [KEY_DATASET_TRAIN->[KEY_TRAIN, KEY_EVAL], KEY_DATASET_TEST->[KEY_EVAL]]
            KEY_MODE             : KEY_TRAIN,         # [KEY_TRAIN, KEY_EVAL] # in train we have 270 WSIs, in test we have 129WSIs
            KEY_PATCHES_TRAIN    : 1000,
            KEY_TUMOR_PERC_TRAIN : 0.5,
        }

        # Step 1 - Extract a level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        if RUN_PARSE_ALL:

            patient_paths_imgs = []
            for each in Path(DIR_RAW).glob('*'):
                filename = each.parts[-1]
                if POSTFIX_MASK in filename or POSTFIX_INFO_JSON in filename:
                    continue
                patient_paths_imgs.append(each)

            with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
                for patient_path_img in patient_paths_imgs:
                    filename_parts = Path(patient_path_img).parts[-1].split('_')
                    params[KEY_PATIENT_ID]   = int(filename_parts[1].split(EXT_TIF)[0])
                    params[KEY_PATIENT_TYPE] = filename_parts[0]

                    parse_patient(params)
                    pbar.update(1)

        # Step 2 - Use the level-based .json file and loop over the samples to understand the speed
        if RUN_GENERATOR:

            for (X,Y) in generator(params):
                pass

    except:
        print ('\n - [__main__] ')
        traceback.print_exc()
        pdb.set_trace()

    # Always end in the debugger so that the final state can be inspected
    pdb.set_trace()
	"""
	CAMELYON 16 DATASET
	- Whole Slide Images (WSI) containing histopathological information on breast cancer

	1. Download
	- To view the list of AWS resources
	- Link: https://aws.amazon.com/marketplace/pp/prodview-exkvqrznup6vc?sr=0-1&ref_=beagle&applicationId=AWSMPContessa#resources
	- Click on Resources on AWS --> View Resources
	- Single Sample
	- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/images/tumor_032.tif ./raw/tumor_032.tif
	- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/tumor_032_mask.tif ./raw/tumor_032_mask.tif
	- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/annotations/tumor_032.xml ./raw/tumor_032.xml
	- Full dataset
	- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/images/ ./raw/ # 700GB
	- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/ ./raw/ # 8.76GB

	2. To view
	- Download ASAP: https://github.com/computationalpathologygroup/ASAP/releases
	- Ensure that it corresponds with your python version if you want to do programmatic access
	- Make sure that the ASAP bin path (e.g. C:\\Program Files\\ASAP 2.1\\bin) is in your sys.path
	"""

	# Import ASAP lib first!
	import sys
	sys.path.append('C:\\Program Files\\ASAP 2.1\\bin')
	import multiresolutionimageinterface as mir
	reader = mir.MultiResolutionImageReader()

	# Import public libs
	import pdb
	import tqdm
	import time
	import json
	import shutil
	import traceback
	import numpy as np
	from pathlib import Path
	import matplotlib.pyplot as plt

	# Init keys - paths
	# Both directories are resolved relative to the folder containing this file.
	DATASET_NAME = 'CAMELYON16'
	DIRNAME_RAW = 'raw'
	DIRNAME_TMP = '_tmp'
	DIR_RAW = Path(__file__).parent.absolute().joinpath(DIRNAME_RAW) # WSI images/masks are read from here; the *__info.json files are written here
	DIR_TMP = Path(__file__).parent.absolute().joinpath(DIRNAME_TMP) # per-patient folders for preview .png files
	Path(DIR_TMP).mkdir(exist_ok=True, parents=True) # NOTE: side effect - the tmp directory is created as soon as this line runs

	# Init keys - filenames
	# The '{}' placeholders are filled via str.format() with (patient_type, patient_id, ...)
	EXT_TIF = '.tif'
	POSTFIX_MASK = 'mask.tif' # substring used to exclude mask files when listing DIR_RAW
	POSTFIX_INFO_JSON = 'info.json' # substring used to exclude generated .json files when listing DIR_RAW
	FILENAME_IMAGES = '{}_{:03d}.tif' # (patient_type, patient_id)
	FILENAME_MASKS = '{}_{:03d}_mask.tif' # (patient_type, patient_id)
	DESCRIPTOR_PATCH = '{}_{:03d}__Lvl{}__P{}x{}-pad{}__perc{:.2f}' # (patient_type, patient_id, level, patch_width, patch_height, patch_pad, min_tissue_perc)
	FILENAME_INFO = DESCRIPTOR_PATCH + '__info.json'
	FILETYPE_TRAIN_NORMAL = 'normal'
	FILETYPE_TRAIN_TUMOR = 'tumor'
	FILETYPE_TEST = 'test'

	# Init keys - classes
	# Values compared against / displayed for the mask patches
	CLASS_BACKGROUND = 0
	CLASS_NORMAL = 1
	CLASS_TUMOR = 2

	# Init keys - miscellaneous
	# String keys used for the params dict, the per-patient .json file and the generator's bookkeeping dict
	KEY_POINTS = 'points'
	KEY_POINTS_TOTAL = 'points_total'
	KEY_POINTS_TISSUE_TOTAL = 'points_tissue_total'
	KEY_POINTS_TUMOR_TOTAL = 'points_tumor_total'
	KEY_POINTS_TISSUE = 'points_tissue'
	KEY_POINTS_TUMOR = 'points_tumor'
	KEY_TRAIN = 'train' # value for KEY_MODE
	KEY_EVAL = 'eval' # value for KEY_MODE
	KEY_PATIENT_ID = 'patient_id'
	KEY_PATIENT_TYPE = 'patient_type'
	KEY_PATIENT_LEVEL = 'patient_level'
	KEY_PATIENT_MINTISSUE_PERC = 'patient_tissue_perc' # NOTE(review): only mentioned in comments in the code visible here; KEY_MIN_TISSUE_PERC is the key actually read
	KEY_PATCH_WIDTH = 'patch_width'
	KEY_PATCH_HEIGHT = 'patch_height'
	KEY_PATCH_PAD = 'patch_pad'
	KEY_MIN_TISSUE_PERC = 'min_tissue_perc'
	KEY_SAVE_IMGS = 'save_imgs'
	KEY_TOTAL_LEVELS = 'total_levels'
	KEY_MAX_IMG_W = 'max_img_w'
	KEY_MAX_IMG_H = 'max_img_h'
	KEY_MODE = 'mode'
	KEY_PATCHES_TRAIN = 'key_patches_train' # number of patch origins sampled per patient in train mode
	KEY_TUMOR_PERC_TRAIN = 'tumor_perc_train' # probability of sampling from the tumor points in train mode
	KEY_DATASET_TYPE = 'dataset_type'
	KEY_DATASET_TRAIN = 'dataset_train' # value for KEY_DATASET_TYPE
	KEY_DATASET_TEST = 'dataset_test' # value for KEY_DATASET_TYPE

	######### coordinate convention
	# | (0,0)
	# | ----------------------> (w)
	# |
	# |
	# |
	# V (h)



	def _sort_points(points):
		"""
		Return `points` as an (N, 2) int array ordered by (w, h).

		Whole rows are reordered, so every (w, h) pair stays intact. An empty
		input gives an array of shape (0, 2), whose .tolist() is [].
		"""
		points_arr = np.array(points, dtype=int).reshape(-1, 2)
		if len(points_arr):
			# np.lexsort treats the LAST key as the primary one -> order by w, then by h
			points_arr = points_arr[np.lexsort((points_arr[:, 1], points_arr[:, 0]))]
		return points_arr


	def parse_patient(params):
		"""
		Extract a level-based .json file of patch origins (wmin, hmin) for one WSI (whole-slide-image).

		The slide is gridded at params[KEY_PATIENT_LEVEL] with a step of roughly (patch size - patch pad).
		A grid point is stored under KEY_POINTS_TISSUE when the fraction of non-zero mask pixels of its
		patch is >= params[KEY_MIN_TISSUE_PERC], and under KEY_POINTS_TUMOR when its mask patch contains
		CLASS_TUMOR (independent of the tissue fraction). The result is written to DIR_RAW using the
		FILENAME_INFO naming scheme.

		Params (keys read from the `params` dict)
		-----------------------------------------
		KEY_PATIENT_ID, KEY_PATIENT_TYPE                 : select the image/mask files inside DIR_RAW
		KEY_PATIENT_LEVEL                                : WSI level the patches are read at
		KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATCH_PAD : patch geometry
		KEY_MIN_TISSUE_PERC                              : minimum tissue fraction for a tissue point
		KEY_SAVE_IMGS                                    : if truthy, save preview .png files in DIR_TMP and print stats

		Returns
		-------
		None. Missing files and unexpected errors are printed (with traceback) instead of being raised.
		"""

		t0 = time.time()
		patient_id = patient_type = None # bound up-front so that the error handler can always report them

		try:

			# Step 0 - Params
			patient_id    = params[KEY_PATIENT_ID]
			patient_type  = params[KEY_PATIENT_TYPE]
			patient_level = params[KEY_PATIENT_LEVEL]

			patch_width   = params[KEY_PATCH_WIDTH]
			patch_height  = params[KEY_PATCH_HEIGHT]
			patch_pad     = params[KEY_PATCH_PAD]
			total_patch_pixels = patch_width * patch_height

			min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
			save_imgs       = params[KEY_SAVE_IMGS]

			# Step 1 - Read Data
			filename_img = FILENAME_IMAGES.format(patient_type, patient_id)
			path_img     = Path(DIR_RAW).joinpath(filename_img)
			path_mask    = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))

			if not (path_img.exists() and path_mask.exists()):
				print (' - \n [ERROR][parse_patient()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
				print ('    -- path_img : ', path_img)
				print ('    -- path_mask: ', path_mask)
				print ('')
				return

			wsi_img   = reader.open(str(path_img))
			wsi_mask  = reader.open(str(path_mask))
			ds_factor = wsi_mask.getLevelDownsample(patient_level)

			# Step 2 - Grid the WSI and save the (wmin, hmin) coords
			img_max_w, img_max_h = wsi_img.getLevelDimensions(patient_level)
			grid_w = np.linspace(0, img_max_w, int(img_max_w//(patch_width - patch_pad))+1)
			grid_h = np.linspace(0, img_max_h, int(img_max_h//(patch_height - patch_pad))+1)
			points_w, points_h = np.meshgrid(grid_w, grid_h)
			points_w, points_h = points_w.astype(int).flatten(), points_h.astype(int).flatten()
			patches_total      = len(points_w)

			# Step 3 - Prep for saving
			points_tissue   = []
			points_tumor    = []
			DIR_TMP_PATIENT = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total))
			DIR_TMP_PATIENT.mkdir(exist_ok=True, parents=True)

			with tqdm.tqdm(total=patches_total, leave=False, desc='  - [{}] '.format(filename_img)) as pbar_patient:
				for point_w, point_h in zip(points_w, points_h):

					# Step 3.1 - Get mask patch (the grid point is shifted by half the pad and scaled by the level's downsample factor)
					patch_x = int((point_w - patch_pad//2) * ds_factor)
					patch_y = int((point_h - patch_pad//2) * ds_factor)
					wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))

					# Step 3.2 - Check if mask patch contains >= min_tissue_perc
					has_tumor   = CLASS_TUMOR in wsi_patch_mask
					tissue_perc = np.count_nonzero(wsi_patch_mask) / total_patch_pixels
					if tissue_perc >= min_tissue_perc:
						points_tissue.append([point_w, point_h])

						# Step 3.3 - Previews: every tumor patch, and a random ~5% of the remaining tissue patches
						if save_imgs and np.random.random() < (1.0 if has_tumor else 0.05):
							tumor_str = '__Tumor' if has_tumor else ''
							wsi_patch_image = np.array(wsi_img.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))
							f,axarr = plt.subplots(1,2)
							axarr[0].imshow(wsi_patch_image)
							op = axarr[1].imshow(wsi_patch_mask, cmap='magma', alpha=1.0, vmin=CLASS_BACKGROUND, vmax=CLASS_TUMOR)
							plt.colorbar(op, ax=axarr.ravel().tolist())
							plt.suptitle('Level={} \n (patch=({}, {})(pad={}) from img=({},{})) \n Clases=(Bgd={}, Normal={}, Tumor={})'.format(patient_level, patch_width, patch_height, patch_pad, img_max_w, img_max_h, CLASS_BACKGROUND, CLASS_NORMAL, CLASS_TUMOR))

							plt.savefig(str(DIR_TMP_PATIENT.joinpath('{}-{:03d}__{:06d}-{:06d}__{:.3f}{}.png'.format(patient_type, patient_id, int(point_w), int(point_h), tissue_perc, tumor_str))))
							plt.close()

					if has_tumor:
						points_tumor.append([point_w, point_h])

					pbar_patient.update(1)

			# Step 4 - Finalize (sort whole rows; sorting inside a row would swap w and h)
			points_tissue = _sort_points(points_tissue)
			points_tumor  = _sort_points(points_tumor)
			if save_imgs:
				print (' - Total Patches         = ', patches_total)
				print (' - Total patches(tissue) = ', points_tissue.shape)
				print (' - Total patches(tumor)  = ', points_tumor.shape)
				print (' - Total time taken      : {:.2f}'.format(time.time() - t0) )

			# Rename the tmp folder so that its name also carries the tissue/tumor counts
			DIR_TMP_PATIENT2 = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}-{}-{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total, len(points_tissue), len(points_tumor)))
			if DIR_TMP_PATIENT2.exists():
				shutil.rmtree(DIR_TMP_PATIENT2)
			shutil.move(src=str(DIR_TMP_PATIENT), dst=str(DIR_TMP_PATIENT2))

			# Step 5 - Save level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
			path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
			data_json = {
				KEY_PATIENT_TYPE          : patient_type,
				KEY_PATIENT_ID            : patient_id,
				KEY_PATIENT_LEVEL         : patient_level,
				KEY_TOTAL_LEVELS          : wsi_img.getNumberOfLevels(),
				KEY_MAX_IMG_W             : img_max_w,
				KEY_MAX_IMG_H             : img_max_h,
				KEY_PATCH_WIDTH           : patch_width,
				KEY_PATCH_HEIGHT          : patch_height,
				KEY_PATCH_PAD             : patch_pad,
				KEY_MIN_TISSUE_PERC       : min_tissue_perc,
				KEY_POINTS_TOTAL          : patches_total,
				KEY_POINTS_TISSUE_TOTAL   : len(points_tissue),
				KEY_POINTS_TUMOR_TOTAL    : len(points_tumor),
				KEY_POINTS_TISSUE         : points_tissue.tolist(),
				KEY_POINTS_TUMOR          : points_tumor.tolist(),
			}
			with open(str(path_json), 'w') as fp:
				json.dump(data_json, fp, indent=4)

		except Exception: # not a bare except: Ctrl-C must still be able to stop a long parsing run
			print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
			traceback.print_exc()
			print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

	def get_patient_patches(params):
		"""
		Return patch origins (wmin, hmin) for one patient, based on its level-based .json file.

		If the .json file (FILENAME_INFO inside DIR_RAW) does not exist yet, parse_patient(params) is
		called first to create it.

		Params (keys read from the `params` dict, in addition to those read by parse_patient())
		---------------------------------------------------------------------------------------
		KEY_MODE             : KEY_TRAIN -> params[KEY_PATCHES_TRAIN] randomly sampled points, each shifted
		                                    by a random offset in [0, patch_pad) per axis
		                       KEY_EVAL  -> all tissue points stored in the .json file
		KEY_PATCHES_TRAIN    : number of points to sample in train mode
		KEY_TUMOR_PERC_TRAIN : probability of sampling from the tumor points (when the patient has any)

		Returns
		-------
		list of [w, h] lists. Empty when the .json file is unavailable, when there is nothing to sample
		from, when the mode is unknown, or when an error occurred (errors are printed, not raised).
		"""

		res = []
		patient_id = patient_type = None # bound up-front so that the error handler can always report them

		try:

			# Step 1 - Params - Patient
			patient_id      = params[KEY_PATIENT_ID]
			patient_type    = params[KEY_PATIENT_TYPE]
			patient_level   = params[KEY_PATIENT_LEVEL]
			min_tissue_perc = params[KEY_MIN_TISSUE_PERC]

			patch_width   = params[KEY_PATCH_WIDTH]
			patch_height  = params[KEY_PATCH_HEIGHT]
			patch_pad     = params[KEY_PATCH_PAD]

			mode                     = params[KEY_MODE] # [KEY_TRAIN, KEY_EVAL]
			patches_train            = params[KEY_PATCHES_TRAIN]
			patches_perc_tumor_train = params[KEY_TUMOR_PERC_TRAIN]

			# Step 2 - Read .json file (create it on demand)
			path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
			if not path_json.exists():
				parse_patient(params)

			if not path_json.exists():
				print (' - \n [ERROR][get_patient_patches()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
				print ('    -- path_json : ', path_json)
				print ('')
				return res

			with open(str(path_json), 'r') as fp:
				json_data = json.load(fp)

			# Step 2.2 - Extract vals from .json file
			points_tissue = json_data[KEY_POINTS_TISSUE]
			points_tumor  = json_data[KEY_POINTS_TUMOR]

			# Step 2.3 - Extract random patches if in training mode
			if mode == KEY_TRAIN:

				for _ in range(patches_train):

					# Tumor points are used with probability patches_perc_tumor_train, or always when
					# there are no tissue points (np.random.randint(0, 0) would raise)
					pick_tumor = bool(points_tumor) and (not points_tissue or np.random.random() < patches_perc_tumor_train)
					if not pick_tumor and not points_tissue:
						break # nothing to sample from

					pool  = points_tumor if pick_tumor else points_tissue
					point = np.array(pool[np.random.randint(0, len(pool))])

					# Random shift within the pad; skipped for patch_pad == 0 where randint(0, 0) would raise
					if patch_pad > 0:
						point = point + np.random.randint(0, patch_pad, 2)
					res.append(point.tolist())

			elif mode == KEY_EVAL:
				res = points_tissue

		except Exception: # not a bare except: Ctrl-C must still be able to stop the caller's loop
			print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
			traceback.print_exc()
			print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

		return res

	def generator(params):
		"""
		Yield (wsi_patch_img, wsi_patch_mask) numpy arrays for every selected patch of every WSI in DIR_RAW.

		Params (keys read from the `params` dict, in addition to those read by get_patient_patches())
		--------------------------------------------------------------------------------------------
		KEY_DATASET_TYPE : KEY_DATASET_TRAIN (KEY_TRAIN is accepted as well) selects the 'normal' and
		                   'tumor' slides; any other value selects the 'test' slides
		KEY_PATCH_PAD, KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATIENT_LEVEL : patch geometry and WSI level

		The patch origins of all patients are collected first (via get_patient_patches()) and the
		patches are read afterwards, patient by patient. The caller's `params` dict is not modified.

		Errors are printed with a traceback and then a pdb session is opened; nothing is re-raised.
		"""

		try:

			# Step 0 - Init
			res = {}

			# Step 1 - Get (wmin,hmin) for patient patches
			# Step 1.1 - Get paths as per KEY_DATASET_TYPE
			dataset_type = params[KEY_DATASET_TYPE]
			if dataset_type in (KEY_DATASET_TRAIN, KEY_TRAIN):
				filetypes = (FILETYPE_TRAIN_NORMAL, FILETYPE_TRAIN_TUMOR)
			else:
				filetypes = (FILETYPE_TEST,)

			# Only image .tif files: masks, generated .json files and e.g. .xml annotations are skipped
			# (the patient id cannot be parsed from their names)
			patient_paths_imgs = [
				each for each in sorted(Path(DIR_RAW).glob('*' + EXT_TIF))
				if POSTFIX_MASK not in each.name and any(filetype in each.name for filetype in filetypes)
			]

			# Step 1.2 - Loop over the paths and get (wmin,hmin) for patches
			with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
				for patient_path_img in patient_paths_imgs:
					name_parts   = patient_path_img.name.split('_')
					patient_type = name_parts[0]
					patient_id   = int(name_parts[1].split(EXT_TIF)[0])

					# Work on a copy so that the caller's dict is left untouched
					patient_params = dict(params)
					patient_params[KEY_PATIENT_ID]   = patient_id
					patient_params[KEY_PATIENT_TYPE] = patient_type

					points = get_patient_patches(patient_params)
					res[FILENAME_IMAGES.format(patient_type, patient_id)] = {
						KEY_POINTS         : points,
						KEY_PATIENT_ID     : patient_id,
						KEY_PATIENT_TYPE   : patient_type,
						KEY_POINTS_TOTAL   : len(points),
					}
					pbar.update(1)

			# Step 2 - Loop over the patch points
			patch_pad     = params[KEY_PATCH_PAD]
			patch_width   = params[KEY_PATCH_WIDTH]
			patch_height  = params[KEY_PATCH_HEIGHT]
			patient_level = params[KEY_PATIENT_LEVEL]

			SAMPLES_TOTAL = sum(patient_obj[KEY_POINTS_TOTAL] for patient_obj in res.values())

			with tqdm.tqdm(total=SAMPLES_TOTAL) as pbar_generator:
				for patient_obj in res.values():

					# No points (e.g. missing mask or failed parsing) -> no reason to open the slide
					if not patient_obj[KEY_POINTS_TOTAL]:
						continue

					patient_id   = patient_obj[KEY_PATIENT_ID]
					patient_type = patient_obj[KEY_PATIENT_TYPE]

					path_img  = Path(DIR_RAW).joinpath(FILENAME_IMAGES.format(patient_type, patient_id))
					wsi_img   = reader.open(str(path_img))
					path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
					wsi_mask  = reader.open(str(path_mask))
					ds_factor = wsi_mask.getLevelDownsample(patient_level)

					for point in patient_obj[KEY_POINTS]:

						# Same origin for image and mask: shifted by half the pad, scaled by the level's downsample factor
						patch_x = int((point[0] - patch_pad//2) * ds_factor)
						patch_y = int((point[1] - patch_pad//2) * ds_factor)
						wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))
						wsi_patch_img  = np.array(wsi_img.getUCharPatch(patch_x, patch_y, patch_width, patch_height, patient_level))

						pbar_generator.update(1)
						yield(wsi_patch_img, wsi_patch_mask)

		except Exception: # not a bare except: GeneratorExit (consumer stopped early) must not be reported as an error
			print ('\n - [ERROR][generator()] ')
			traceback.print_exc()
			pdb.set_trace()

	if __name__ == "__main__":

		# Stage toggles of this script
		RUN_PARSE_ALL = False # Step 1 - (re)build the level-based .json file of every WSI in DIR_RAW
		RUN_GENERATOR = True  # Step 2 - iterate over the generator to get a feeling for its speed

		try:

			params = {
				KEY_PATIENT_LEVEL    : 2,
				KEY_MIN_TISSUE_PERC  : 0.1,
				KEY_PATCH_WIDTH      : 512,
				KEY_PATCH_HEIGHT     : 512,
				KEY_PATCH_PAD        : 32,
				KEY_SAVE_IMGS        : False,
				KEY_DATASET_TYPE     : KEY_DATASET_TRAIN, # [KEY_DATASET_TRAIN->[KEY_TRAIN, KEY_EVAL], KEY_DATASET_TEST->[KEY_EVAL]]
				KEY_MODE             : KEY_TRAIN,         # [KEY_TRAIN, KEY_EVAL] # in train we have 270 WSIs, in test we have 129WSIs
				KEY_PATCHES_TRAIN    : 1000,
				KEY_TUMOR_PERC_TRAIN : 0.5,
			}

			# Step 1 - Extract a level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
			if RUN_PARSE_ALL:

				patient_paths_imgs = []
				for each in Path(DIR_RAW).glob('*'):
					filename = each.parts[-1]
					if POSTFIX_MASK in filename or POSTFIX_INFO_JSON in filename:
						continue
					patient_paths_imgs.append(each)

				with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
					for patient_path_img in patient_paths_imgs:
						filename_parts = Path(patient_path_img).parts[-1].split('_')
						params[KEY_PATIENT_ID]   = int(filename_parts[1].split(EXT_TIF)[0])
						params[KEY_PATIENT_TYPE] = filename_parts[0]

						parse_patient(params)
						pbar.update(1)

			# Step 2 - Use the level-based .json file and loop over the samples to understand the speed
			if RUN_GENERATOR:

				for (X,Y) in generator(params):
					pass

		except:
			print ('\n - [__main__] ')
			traceback.print_exc()
			pdb.set_trace()

		# Always end in the debugger so that the final state can be inspected
		pdb.set_trace()