epifanio/01_data_preparation.ipynb

## 01_data_preparation.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data preparation\n",
    "\n",
    "\n",
    "## File list\n",
    "The directory structure of the actual annotations is quite odd.\n",
    "Below is some path magic to extract the filenames of both the images and the xml annotations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path, PosixPath"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set the path to where the annotations are"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotations = '/home/epinux/annotate2/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The images sit two directory levels below `annotations`.\n",
    "# Sorting the full paths keeps the listing stable across runs and machines;\n",
    "# a bare glob yields entries in whatever order the filesystem returns them.\n",
    "jpeg_files = [str(path) for path in sorted(Path(annotations).glob('*/*/*.jpg'))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(jpeg_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The xml annotations sit two directory levels below `annotations`.\n",
    "# The seeded subsampling further down selects entries of this list by position,\n",
    "# so the list is sorted to make that selection reproducible (bare glob order is\n",
    "# filesystem-dependent).\n",
    "xml_files = [str(path) for path in sorted(Path(annotations).glob('*/*/*.xml'))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Subsampling\n",
    "\n",
    "Subsample the `xml` annotation files with a reduction factor $R_f$.\n",
    "Starting value:\n",
    "\n",
    "$$\n",
    "R_f=0.125 \\quad \\text{which will reduce the number of files to roughly 12.5% of the total}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Seed the global RNG first so the same subset is drawn on every run.\n",
    "np.random.seed(0)\n",
    "reduce_factor = 0.125\n",
    "\n",
    "# One uniform draw per file; a file is kept when its draw falls below the factor.\n",
    "msk = np.less(np.random.rand(len(xml_files)), reduce_factor)\n",
    "sample = list(np.asarray(xml_files)[msk])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Annotation Parsing\n",
    "\n",
    "Parse each `xml` file and store the results as a `pandas.DataFrame`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def xml_to_csv(xml_files):\n",
    "    \"\"\"Collect the object annotations of several XML files into one DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    xml_files : iterable of str or path-like\n",
    "        Annotation files to parse.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per `<object>` element with the columns `filename`, `width`,\n",
    "        `height`, `class`, `xmin`, `ymin`, `xmax` and `ymax`. The frame is\n",
    "        empty (columns only) when no object is found.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Children are looked up by tag instead of by position, so extra or\n",
    "    reordered elements cannot shift the values. The tag names (`name`,\n",
    "    `bndbox`, `width`, ...) assume Pascal VOC-style files, as the column\n",
    "    names suggest -- TODO confirm against the actual annotations.\n",
    "    \"\"\"\n",
    "    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']\n",
    "    xml_list = []\n",
    "    for xml_file in xml_files:\n",
    "        root = ET.parse(xml_file).getroot()\n",
    "        # Image-level elements are shared by every object in the file.\n",
    "        filename = root.findtext('filename')\n",
    "        size = root.find('size')\n",
    "        for member in root.findall('object'):\n",
    "            bndbox = member.find('bndbox')\n",
    "            xml_list.append((\n",
    "                filename,\n",
    "                int(size.findtext('width')),\n",
    "                int(size.findtext('height')),\n",
    "                member.findtext('name'),\n",
    "                int(bndbox.findtext('xmin')),\n",
    "                int(bndbox.findtext('ymin')),\n",
    "                int(bndbox.findtext('xmax')),\n",
    "                int(bndbox.findtext('ymax')),\n",
    "            ))\n",
    "    return pd.DataFrame(xml_list, columns=column_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "records = xml_to_csv(sample)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get an idea of which labels are in all the annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(records['class'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "records.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "records.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple statistical description of the sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.style.use('ggplot')\n",
    "\n",
    "# Bar chart of how many annotations each class label has.\n",
    "fig, ax = plt.subplots(figsize=(20, 10))\n",
    "records['class'].value_counts().plot(kind='bar', ax=ax)\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "plt.close(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import holoviews as hv\n",
    "# hv.extension('bokeh')\n",
    "# bars = hv.Bars(records['class'].value_counts(), hv.Dimension('index'), 'class').options(width=900, height=500, xrotation= 38)\n",
    "# bars"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Extract only the `sand dollar` annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "records = records[records['class']==\"sand dollar\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train & Test\n",
    "\n",
    "Split the sample into training ($70\\%$) and testing ($30\\%$) datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# One uniform draw per row; rows whose draw is below 0.7 go to the training set,\n",
    "# all remaining rows go to the test set.\n",
    "msk = np.less(np.random.rand(len(records)), 0.7)\n",
    "train, test = records[msk], records[np.logical_not(msk)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.to_csv('test.csv', index=False)\n",
    "train.to_csv('train.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TensorFlow records\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for converting the csv/pandas dataframe into TFRecord format\n",
    "# https://stackoverflow.com/questions/41402332/tensorflow-create-a-tfrecords-file-from-csv\n",
    "\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "\n",
    "def create_tf_example(features, label):\n",
    "    \"\"\"Pack one annotation into a `tf.train.Example`.\n",
    "\n",
    "    `features` is read by position: 0 is the filename, 1 and 2 the image\n",
    "    width and height, 3 to 6 the box corners xmin, ymin, xmax, ymax.\n",
    "    `label` is the class name. Text values are stored as UTF-8 bytes and\n",
    "    numeric values are passed through `int` before being stored.\n",
    "    \"\"\"\n",
    "\n",
    "    def _as_bytes(text):\n",
    "        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.encode('utf-8')]))\n",
    "\n",
    "    def _as_int64(number):\n",
    "        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(number)]))\n",
    "\n",
    "    feature_map = {\n",
    "        'filename': _as_bytes(features[0]),\n",
    "        'width': _as_int64(features[1]),\n",
    "        'height': _as_int64(features[2]),\n",
    "        'class': _as_bytes(label),\n",
    "        'xmin': _as_int64(features[3]),\n",
    "        'ymin': _as_int64(features[4]),\n",
    "        'xmax': _as_int64(features[5]),\n",
    "        'ymax': _as_int64(features[6]),\n",
    "    }\n",
    "    return tf.train.Example(features=tf.train.Features(feature=feature_map))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every row of the training dataframe into a tf.train.Example and write\n",
    "# them all to `dataset.tfrecords` (relative path, i.e. the working directory).\n",
    "# NOTE(review): `tf.python_io` looks like the TensorFlow 1.x spelling -- confirm the\n",
    "# installed TF version before upgrading.\n",
    "feature_columns = ['filename', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']\n",
    "with tf.python_io.TFRecordWriter(\"dataset.tfrecords\") as writer:\n",
    "    for _, row in train.iterrows():\n",
    "        # Select by column label: integer keys on a label-indexed Series are a\n",
    "        # deprecated positional fallback in recent pandas.\n",
    "        features = row[feature_columns].tolist()\n",
    "        label = row['class']\n",
    "        example = create_tf_example(features, label)\n",
    "        writer.write(example.SerializeToString())\n",
    "# No explicit writer.close(): leaving the `with` block already closes the writer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gist test.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gist train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!gist 01_data_preparation.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Data preparation\n",
	"\n",
	"\n",
	"## File list\n",
	"The directory structure of the actual annotations is quite odd.\n",
	"Below is some path magic to extract the filenames of both the images and the xml annotations."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from pathlib import Path, PosixPath"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import itertools"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# set the path to where the annotations are"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"annotations = '/home/epinux/annotate2/'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# The images sit two directory levels below `annotations`.\n",
	"# An empty glob pattern is rejected by pathlib, so the wildcards are spelled out.\n",
	"# Sorting the full paths keeps the listing stable across runs and machines.\n",
	"jpeg_files = [str(path) for path in sorted(Path(annotations).glob('*/*/*.jpg'))]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"len(jpeg_files)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# The xml annotations sit two directory levels below `annotations`.\n",
	"# An empty glob pattern is rejected by pathlib, so the wildcards are spelled out.\n",
	"# The seeded subsampling further down selects entries of this list by position,\n",
	"# so the list is sorted to make that selection reproducible.\n",
	"xml_files = [str(path) for path in sorted(Path(annotations).glob('*/*/*.xml'))]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Subsampling\n",
	"\n",
	"Subsample the `xml` annotation files with a reduction factor $R_f$.\n",
	"Starting value:\n",
	"\n",
	"$$\n",
	"R_f=0.125 \\quad \\text{which will reduce the number of files to roughly 12.5% of the total}\n",
	"$$"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"\n",
	"# Seed the global RNG first so the same subset is drawn on every run.\n",
	"np.random.seed(0)\n",
	"reduce_factor = 0.125\n",
	"\n",
	"# One uniform draw per file; a file is kept when its draw falls below the factor.\n",
	"msk = np.less(np.random.rand(len(xml_files)), reduce_factor)\n",
	"sample = list(np.asarray(xml_files)[msk])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"len(sample)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"sample[:5]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Annotation Parsing\n",
	"\n",
	"Parse each `xml` file and store the results as a `pandas.DataFrame`.\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import glob\n",
	"import pandas as pd\n",
	"import xml.etree.ElementTree as ET"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def xml_to_csv(xml_files):\n",
	"    \"\"\"Collect the object annotations of several XML files into one DataFrame.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    xml_files : iterable of str or path-like\n",
	"        Annotation files to parse.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    pandas.DataFrame\n",
	"        One row per `<object>` element with the columns `filename`, `width`,\n",
	"        `height`, `class`, `xmin`, `ymin`, `xmax` and `ymax`. The frame is\n",
	"        empty (columns only) when no object is found.\n",
	"\n",
	"    Notes\n",
	"    -----\n",
	"    Children are looked up by tag instead of by position, so extra or\n",
	"    reordered elements cannot shift the values. The tag names (`name`,\n",
	"    `bndbox`, `width`, ...) assume Pascal VOC-style files, as the column\n",
	"    names suggest -- TODO confirm against the actual annotations.\n",
	"    \"\"\"\n",
	"    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']\n",
	"    xml_list = []\n",
	"    for xml_file in xml_files:\n",
	"        root = ET.parse(xml_file).getroot()\n",
	"        # Image-level elements are shared by every object in the file.\n",
	"        filename = root.findtext('filename')\n",
	"        size = root.find('size')\n",
	"        for member in root.findall('object'):\n",
	"            bndbox = member.find('bndbox')\n",
	"            xml_list.append((\n",
	"                filename,\n",
	"                int(size.findtext('width')),\n",
	"                int(size.findtext('height')),\n",
	"                member.findtext('name'),\n",
	"                int(bndbox.findtext('xmin')),\n",
	"                int(bndbox.findtext('ymin')),\n",
	"                int(bndbox.findtext('xmax')),\n",
	"                int(bndbox.findtext('ymax')),\n",
	"            ))\n",
	"    return pd.DataFrame(xml_list, columns=column_name)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"records = xml_to_csv(sample)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Get an idea of which labels are in all the annotations"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"list(records['class'].unique())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"records.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"records.describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Simple statistical description of the sample"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import matplotlib.pyplot as plt\n",
	"\n",
	"plt.style.use('ggplot')\n",
	"\n",
	"# Bar chart of how many annotations each class label has.\n",
	"fig, ax = plt.subplots(figsize=(20, 10))\n",
	"records['class'].value_counts().plot(kind='bar', ax=ax)\n",
	"fig.tight_layout()\n",
	"plt.show()\n",
	"plt.close(fig)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# import holoviews as hv\n",
	"# hv.extension('bokeh')\n",
	"# bars = hv.Bars(records['class'].value_counts(), hv.Dimension('index'), 'class').options(width=900, height=500, xrotation= 38)\n",
	"# bars"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Extract only the `sand dollar` annotations"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"records = records[records['class']==\"sand dollar\"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Train & Test\n",
	"\n",
	"Split the sample into training ($70\\%$) and testing ($30\\%$) datasets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"\n",
	"# One uniform draw per row; rows whose draw is below 0.7 go to the training set,\n",
	"# all remaining rows go to the test set.\n",
	"msk = np.less(np.random.rand(len(records)), 0.7)\n",
	"train, test = records[msk], records[np.logical_not(msk)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"train.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"test.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"test.to_csv('test.csv', index=False)\n",
	"train.to_csv('train.csv', index=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## TensorFlow records\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# for converting the csv/pandas dataframe into TFRecord format\n",
	"# https://stackoverflow.com/questions/41402332/tensorflow-create-a-tfrecords-file-from-csv\n",
	"\n",
	"import pandas as pd\n",
	"import tensorflow as tf\n",
	"import numpy as np\n",
	"\n",
	"import warnings\n",
	"warnings.filterwarnings('ignore')\n",
	"\n",
	"\n",
	"def create_tf_example(features, label):\n",
	"    \"\"\"Pack one annotation into a `tf.train.Example`.\n",
	"\n",
	"    `features` is read by position: 0 is the filename, 1 and 2 the image\n",
	"    width and height, 3 to 6 the box corners xmin, ymin, xmax, ymax.\n",
	"    `label` is the class name. Text values are stored as UTF-8 bytes and\n",
	"    numeric values are passed through `int` before being stored.\n",
	"    \"\"\"\n",
	"\n",
	"    def _as_bytes(text):\n",
	"        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.encode('utf-8')]))\n",
	"\n",
	"    def _as_int64(number):\n",
	"        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(number)]))\n",
	"\n",
	"    feature_map = {\n",
	"        'filename': _as_bytes(features[0]),\n",
	"        'width': _as_int64(features[1]),\n",
	"        'height': _as_int64(features[2]),\n",
	"        'class': _as_bytes(label),\n",
	"        'xmin': _as_int64(features[3]),\n",
	"        'ymin': _as_int64(features[4]),\n",
	"        'xmax': _as_int64(features[5]),\n",
	"        'ymax': _as_int64(features[6]),\n",
	"    }\n",
	"    return tf.train.Example(features=tf.train.Features(feature=feature_map))\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Convert every row of the training dataframe into a tf.train.Example and write\n",
	"# them all to `dataset.tfrecords` (relative path, i.e. the working directory).\n",
	"# NOTE(review): `tf.python_io` looks like the TensorFlow 1.x spelling -- confirm the\n",
	"# installed TF version before upgrading.\n",
	"feature_columns = ['filename', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']\n",
	"with tf.python_io.TFRecordWriter(\"dataset.tfrecords\") as writer:\n",
	"    for _, row in train.iterrows():\n",
	"        # Select by column label: integer keys on a label-indexed Series are a\n",
	"        # deprecated positional fallback in recent pandas.\n",
	"        features = row[feature_columns].tolist()\n",
	"        label = row['class']\n",
	"        example = create_tf_example(features, label)\n",
	"        writer.write(example.SerializeToString())\n",
	"# No explicit writer.close(): leaving the `with` block already closes the writer."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"example"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!gist test.csv"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!gist train.csv"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!gist 01_data_preparation.ipynb"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}