-
-
Save angadbajwa23/595c05a361077dab3b878a15a691c5d6 to your computer and use it in GitHub Desktop.
Data_Fetching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"colab_type": "code", | |
"id": "C8ygXqoQdq_H", | |
"outputId": "3be12076-eb35-4305-f06f-ede8656f22fa" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n" | |
] | |
} | |
], | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/gdrive')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"colab_type": "code", | |
"id": "CVFJ4U2GduEI", | |
"outputId": "0e7a79f5-fba3-49a0-bb98-ec191f9a4d42" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/content/gdrive/My Drive/2d\n" | |
] | |
} | |
], | |
"source": [ | |
"# absolute path so this cell can be re-run regardless of the current working directory\n", | |
"%cd /content/gdrive/My Drive/2d" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "nbAf6JWerVr6" | |
}, | |
"source": [ | |
"### Downloading the dataset from [figshare.com/articles/brain_tumor_dataset/1512427](https://figshare.com/articles/brain_tumor_dataset/1512427)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 202 | |
}, | |
"colab_type": "code", | |
"id": "Cmz08fU3eAFQ", | |
"outputId": "2ba55caf-72e6-446c-a25a-d31b66f7db5b" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--2020-05-18 06:28:44-- https://ndownloader.figshare.com/articles/1512427/versions/5\n", | |
"Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 34.251.18.86, 34.240.222.171, 34.252.153.30, ...\n", | |
"Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|34.251.18.86|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 879501695 (839M) [application/zip]\n", | |
"Saving to: ‘5’\n", | |
"\n", | |
"5 100%[===================>] 838.76M 23.7MB/s in 38s \n", | |
"\n", | |
"2020-05-18 06:29:22 (22.2 MB/s) - ‘5’ saved [879501695/879501695]\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"!wget https://ndownloader.figshare.com/articles/1512427/versions/5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 134 | |
}, | |
"colab_type": "code", | |
"id": "MbL09Q9neF9i", | |
"outputId": "31dac3f4-841a-4061-ea93-e75147e6a723" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Archive: 5\n", | |
" extracting: brainTumorDataPublic_1-766.zip \n", | |
" extracting: brainTumorDataPublic_1533-2298.zip \n", | |
" extracting: brainTumorDataPublic_767-1532.zip \n", | |
" extracting: brainTumorDataPublic_2299-3064.zip \n", | |
" extracting: cvind.mat \n", | |
" extracting: README.txt \n" | |
] | |
} | |
], | |
"source": [ | |
"!unzip 5 && rm 5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "eVF7_xNbeJFh" | |
}, | |
"outputs": [], | |
"source": [ | |
"!cat brainTumorDataPublic_* > brainTumorDataPublic_temp.zip\n", | |
"!zip -FF brainTumorDataPublic_temp.zip --out data.zip" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "Gh7oUYHMeMWU" | |
}, | |
"outputs": [], | |
"source": [ | |
"!rm brainTumorDataPublic_*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "8tcBi2kHeT2I" | |
}, | |
"outputs": [], | |
"source": [ | |
"!unzip data.zip -d data && rm data.zip" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"colab_type": "code", | |
"id": "K0IcaIDeeUVn", | |
"outputId": "74e02ae0-6a71-4ffe-fd11-13bf851b5432" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3064\n" | |
] | |
} | |
], | |
"source": [ | |
"!ls data | wc -l" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 151 | |
}, | |
"colab_type": "code", | |
"id": "BAMRX6SveW-0", | |
"outputId": "5bf6ec54-97e0-40a8-8a4a-5294f40f5c88" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Collecting hdf5storage\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/e0/5dd25068a231cd817265529368aca2f918049b290dcb2fd9b24ce136adf4/hdf5storage-0.1.15-py2.py3-none-any.whl (56kB)\n", | |
"\r", | |
"\u001b[K |█████▊ | 10kB 15.8MB/s eta 0:00:01\r", | |
"\u001b[K |███████████▌ | 20kB 1.7MB/s eta 0:00:01\r", | |
"\u001b[K |█████████████████▎ | 30kB 2.3MB/s eta 0:00:01\r", | |
"\u001b[K |███████████████████████ | 40kB 1.6MB/s eta 0:00:01\r", | |
"\u001b[K |████████████████████████████▉ | 51kB 2.0MB/s eta 0:00:01\r", | |
"\u001b[K |████████████████████████████████| 61kB 1.7MB/s \n", | |
"\u001b[?25hRequirement already satisfied: numpy; python_version >= \"3.4\" in /usr/local/lib/python3.6/dist-packages (from hdf5storage) (1.18.4)\n", | |
"Requirement already satisfied: h5py>=2.1; python_version >= \"3.3\" in /usr/local/lib/python3.6/dist-packages (from hdf5storage) (2.10.0)\n", | |
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from h5py>=2.1; python_version >= \"3.3\"->hdf5storage) (1.12.0)\n", | |
"Installing collected packages: hdf5storage\n", | |
"Successfully installed hdf5storage-0.1.15\n" | |
] | |
} | |
], | |
"source": [ | |
"# %pip installs into the running kernel's environment; version pinned for reproducibility\n", | |
"%pip install hdf5storage==0.1.15" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "HEdvXiYasKEC" | |
}, | |
"source": [ | |
"### Creating NumPy arrays of images, labels and masks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 101 | |
}, | |
"colab_type": "code", | |
"id": "ImwdQyMGeZAA", | |
"outputId": "e7346546-d58e-43b0-a968-06aeab112f5f" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[3064/3064] images loaded: 100.0 %\n", | |
"labels: (3064,)\n", | |
"images: (3064, 512, 512)\n", | |
"masks: (3064, 512, 512)\n", | |
"labels.npy, images.npy, masks.npy saved in /content/gdrive/My Drive/2d/\n" | |
] | |
} | |
], | |
"source": [ | |
"import os\n", | |
"import argparse\n", | |
"import sys\n", | |
"import numpy as np\n", | |
"import hdf5storage\n", | |
"import cv2\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"class NoDataFound(Exception):\n", | |
"    \"\"\"Signals that a directory exists but has no 'data' sub-folder.\"\"\"\n", | |
"\n", | |
"\n", | |
"def dir_path(path):\n", | |
"    \"\"\"Validate a dataset root directory (used as an argparse type).\n", | |
"\n", | |
"    Returns the given path with backslashes replaced by forward slashes\n", | |
"    when it contains a 'data' sub-directory.\n", | |
"\n", | |
"    Raises:\n", | |
"        NoDataFound: the path is a directory without a 'data' sub-directory.\n", | |
"        NotADirectoryError: the path is not an existing directory.\n", | |
"    \"\"\"\n", | |
"    # Windows users may hand in backslash-separated paths\n", | |
"    root = path.replace('\\\\', '/')\n", | |
"    data_root = os.path.join(root, 'data').replace('\\\\', '/')\n", | |
"\n", | |
"    if os.path.isdir(data_root):\n", | |
"        return root\n", | |
"    if not os.path.isdir(root):\n", | |
"        raise NotADirectoryError(root)\n", | |
"\n", | |
"    message = 'Could not find a \"data\" folder inside directory. {} does not exist.'\n", | |
"    raise NoDataFound(message.format(data_root))\n", | |
"\n", | |
"# Colab location of the project folder; dir_path requires it to contain a 'data' directory\n", | |
"path='/content/gdrive/My Drive/2d/'\n", | |
"parser = argparse.ArgumentParser()\n", | |
"parser.add_argument('path', help='path to the brain_tumor_dataset directory', type=dir_path)\n", | |
"parser.add_argument('--image-dimension', '-d', default=512, help='dimension of the image', type=int)\n", | |
"# the argument list is passed explicitly instead of letting argparse read sys.argv\n", | |
"args = parser.parse_args(args=[path])\n", | |
"\n", | |
"labels = []\n", | |
"images = []\n", | |
"masks = []\n", | |
"\n", | |
"data_dir = os.path.join(args.path, 'data').replace('\\\\', '/')\n", | |
"# os.listdir returns entries in arbitrary order; sort by file name so the\n", | |
"# saved arrays come out in the same order on every run\n", | |
"files = sorted(os.listdir(data_dir))\n", | |
"if not files:\n", | |
"    # fail with a clear message instead of a NameError on the progress counter below\n", | |
"    raise NoDataFound('No files found in {}'.format(data_dir))\n", | |
"\n", | |
"# every image and mask is resized to this square (width, height)\n", | |
"target_size = (args.image_dimension, args.image_dimension)\n", | |
"\n", | |
"for i, file in enumerate(files, start=1):\n", | |
"    if i % 10 == 0 or i == len(files):\n", | |
"        # print the percentage of images loaded (every 10th file and the last one)\n", | |
"        sys.stdout.write('\\r[{}/{}] images loaded: {:.1f} %'\n", | |
"                         .format(i, len(files), i / float(len(files)) * 100))\n", | |
"        sys.stdout.flush()\n", | |
"\n", | |
"    # load matlab file with hdf5storage as scipy.io.loadmat does not support v7.3 files\n", | |
"    # field 0 is used as the label, field 2 as the image and field 4 as the mask\n", | |
"    mat_file = hdf5storage.loadmat(os.path.join(data_dir, file))['cjdata'][0]\n", | |
"\n", | |
"    # resize image and mask to a unique size\n", | |
"    image = cv2.resize(mat_file[2], dsize=target_size, interpolation=cv2.INTER_CUBIC)\n", | |
"    # nearest-neighbour keeps the mask strictly binary; cubic interpolation would\n", | |
"    # blend values across the tumour border before the cast back to bool\n", | |
"    mask = cv2.resize(mat_file[4].astype('uint8'), dsize=target_size,\n", | |
"                      interpolation=cv2.INTER_NEAREST)\n", | |
"\n", | |
"    labels.append(int(mat_file[0]))\n", | |
"    images.append(image)\n", | |
"    masks.append(mask.astype(bool))\n", | |
"\n", | |
"labels = np.array(labels)\n", | |
"images = np.array(images)\n", | |
"masks = np.array(masks)\n", | |
"\n", | |
"print('\\nlabels:', labels.shape)\n", | |
"print('images:', images.shape)\n", | |
"print('masks:', masks.shape)\n", | |
"\n", | |
"# persist the three arrays next to the 'data' directory\n", | |
"np.save(os.path.join(args.path, 'labels.npy'), labels)\n", | |
"np.save(os.path.join(args.path, 'images.npy'), images)\n", | |
"np.save(os.path.join(args.path, 'masks.npy'), masks)\n", | |
"\n", | |
"print('labels.npy, images.npy, masks.npy saved in', args.path)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": {}, | |
"colab_type": "code", | |
"id": "g5nRl-jvefUt" | |
}, | |
"outputs": [], | |
"source": [ | |
"# display name for each tumor class; keys are the label digits as strings\n", | |
"integer_to_class = {\n", | |
"    '1': 'meningioma (1)',\n", | |
"    '2': 'glioma (2)',\n", | |
"    '3': 'pituitary tumor (3)',\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 0, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 67 | |
}, | |
"colab_type": "code", | |
"id": "4bYzy46Tekrd", | |
"outputId": "c2196313-227d-4689-c682-3e9e2d829424" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(3064,)\n", | |
"(3064, 512, 512)\n", | |
"(3064, 512, 512)\n" | |
] | |
} | |
], | |
"source": [ | |
"# read the three .npy files from the current working directory\n", | |
"labels, images, masks = (\n", | |
"    np.load(file_name) for file_name in ('labels.npy', 'images.npy', 'masks.npy')\n", | |
")\n", | |
"\n", | |
"# one shape per line, in the order labels, images, masks\n", | |
"for loaded in (labels, images, masks):\n", | |
"    print(loaded.shape)" | |
] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"collapsed_sections": [], | |
"name": "Data_exploration", | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment