artirj/Fisheries.ipynb

## Fisheries.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Fisheries kaggle competition"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 1. Examine data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.image as mpimg\n",
    "from glob import glob\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense,BatchNormalization,Convolution2D,Flatten,Dropout, MaxPooling2D\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.preprocessing.image import ImageDataGenerator\n",
    "from keras.optimizers import Adam\n",
    "from keras.callbacks import EarlyStopping\n",
    "def onehot(array):\n",
    "    return to_categorical(array)\n",
    "USE_SAMPLE=False\n",
    "if USE_SAMPLE:\n",
    "    path='data/sample/'\n",
    "else:\n",
    "    path='data/'\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# print(len(glob(path+'train_orig/**',recursive=True)))\n",
    "# print(len(glob(path+'test/**',recursive=True)))\n",
    "categories=8\n",
    "# print(categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "###  Plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#First, let's have a look at the data\n",
    "def plot_four(path):\n",
    "    imgs=np.random.permutation(glob(path+'*'))[:4]\n",
    "    plt.figure(figsize=(24,12))\n",
    "    \n",
    "    for i,img in enumerate(imgs):\n",
    "        plt.subplot(2,2,i+1,)\n",
    "        #plt.tight_layout()\n",
    "        plt.imshow(mpimg.imread(imgs[i]))\n",
    "        plt.gca().grid(False)\n",
    "        plt.gca().get_yaxis().set_ticks([])\n",
    "        plt.gca().get_xaxis().set_ticks([])\n",
    "        plt.title(imgs[i],fontdict={'fontsize':20})\n",
    "        plt.show()\n",
    "plot_four(path+'test/')\n",
    "plot_four(path+'train/ALB/')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Shape "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Now, let's see the size of the data. I only do test and assume other data is similar\n",
    "def get_data_shape(path):\n",
    "    imgs=np.random.permutation(glob(path+'*'))\n",
    "    lens=[]\n",
    "    for img in imgs:\n",
    "        lens.append(mpimg.imread(img).shape)\n",
    "    return np.array(lens)\n",
    "lens_test=get_data_shape(path+'test/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# def count_rows(unique_items,array):\n",
    "#     dic={}\n",
    "#     for item in unique_items:\n",
    "#         dic[str(item)]=(array==item).sum()\n",
    "#     return dic\n",
    "# def unique_rows(a, **kwargs):\n",
    "\n",
    "#     rowtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))\n",
    "#     b = np.ascontiguousarray(a).view(rowtype)\n",
    "#     return_index = kwargs.pop('return_index', False)\n",
    "#     out = np.unique(b, return_index=True, **kwargs)\n",
    "#     idx = out[1]\n",
    "#     uvals = a[idx]\n",
    "#     return uvals\n",
    "# shapes=count_rows(unique_rows(lens_test),lens_test)\n",
    "# from pprint import pprint\n",
    "# pprint(shapes)\n",
    "# plt.figure(figsize=(12,6))\n",
    "# plt.bar(range(len(shapes)),shapes.values());\n",
    "# plt.xticks(range(len(shapes)),[i[:9]+']' for i in shapes.keys()]);\n",
    "# # Would it make sense to use different nets? Maybe!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 2. Create sample "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "from sklearn.model_selection import train_test_split\n",
    "#path=os.path.realpath('')+'/'+path\n",
    "# cats/dogs\n",
    "pre_run=1\n",
    "samplesize=0.1\n",
    "prop_train=0.9\n",
    "\n",
    "\n",
    "if(pre_run==1):\n",
    "    shutil.rmtree(path+'sample',ignore_errors=1)\n",
    "    shutil.rmtree(path+'valid',ignore_errors=1)\n",
    "    shutil.rmtree(path+'train',ignore_errors=1)\n",
    "    os.mkdir(path+'sample')\n",
    "    os.mkdir(path+'valid')\n",
    "    os.mkdir(path+'train')\n",
    "    os.mkdir(path+'sample/train')\n",
    "    os.mkdir(path+'sample/test')\n",
    "    os.mkdir(path+'sample/valid')\n",
    "    dirs=glob(path+'train_orig/*')\n",
    "    for i in dirs:        \n",
    "        subdir=i.split('/')[-1]\n",
    "        os.mkdir(path+'sample/train/'+subdir)\n",
    "        os.mkdir(path+'train/'+subdir)\n",
    "        os.mkdir(path+'sample/valid/'+subdir)\n",
    "        os.mkdir(path+'valid/'+subdir)\n",
    "        train,valid=train_test_split(os.listdir(i),train_size=prop_train,random_state=42)\n",
    "        count=0\n",
    "        for j in valid:\n",
    "            if count<=samplesize*len(valid):\n",
    "                shutil.copy(i+'/'+j,path+'sample/valid/'+subdir+'/'+j)\n",
    "                count+=1\n",
    "            shutil.copy(i+'/'+j,path+'valid/'+subdir+'/'+j)\n",
    "        count=0\n",
    "        for j in train:\n",
    "            if count<=samplesize*len(train):\n",
    "                shutil.copy(i+'/'+j,path+'sample/train/'+subdir+'/'+j)\n",
    "                count+=1\n",
    "            shutil.copy(i+'/'+j,path+'train/'+subdir+'/'+j)\n",
    "    test_imgs=glob(path+'test/*')\n",
    "    np.random.seed(42)\n",
    "    sample_test=np.random.permutation(test_imgs)[:round(len(test_imgs)*samplesize)]    \n",
    "    for i in sample_test:\n",
    "        shutil.copy(i,path+'sample/test/'+i.split('/')[-1])\n",
    "        \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# pics=glob(path+'train/**',recursive=True)\n",
    "# print(len(pics))\n",
    "# pics=glob(path+'valid/**',recursive=True)\n",
    "# print(len(pics))\n",
    "# pics=glob(path+'sample/train/**',recursive=True)\n",
    "# print(len(pics))\n",
    "# pics=glob(path+'sample/valid/**',recursive=True)\n",
    "# print(len(pics))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "gen_train=ImageDataGenerator(rotation_range=10., width_shift_range=0.2, \n",
    "       height_shift_range=0.2, shear_range=0.4, zoom_range=0.2, \n",
    "       channel_shift_range=6.)\n",
    "gen_val=ImageDataGenerator()\n",
    "gen_train=ImageDataGenerator()\n",
    "batches_train=gen_train.flow_from_directory(path+'train',target_size=(224,224),\n",
    "                                            batch_size=64,class_mode='categorical')\n",
    "batches_val=gen_val.flow_from_directory(path+'valid',target_size=(224,224),\n",
    "                                        batch_size=64,class_mode='categorical',shuffle=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 3. Train simple neural net"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "categories=len(glob(path+'train_orig/*'))\n",
    "\n",
    "model=Sequential([\n",
    "    BatchNormalization(input_shape=(3,224,224)),\n",
    "    Flatten(),\n",
    "    Dense(512,activation='relu'),\n",
    "    BatchNormalization(),\n",
    "    Dropout(0.2),    \n",
    "    Dense(categories,activation='softmax')\n",
    "])\n",
    "stop=EarlyStopping(patience=2)\n",
    "model.compile(Adam(lr=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])\n",
    "\n",
    "\n",
    "#model.load_weights('basic_linear.dat')\n",
    "def long_train(n):\n",
    "    model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
    "                        nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
    "    model.optimizer.lr=1e-3\n",
    "    model.fit_generator(batches_train,nb_epoch=2,validation_data=batches_val,\n",
    "                        nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
    "    model.optimizer.lr=1e-2\n",
    "    model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
    "                        nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
    "    model.optimizer.lr=1e-4\n",
    "    model.fit_generator(batches_train,nb_epoch=n,validation_data=batches_val,\n",
    "                         nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
    "#long_train(20)\n",
    "#model.load_weights('basic_linear.dat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# model.optimizer.lr=0.5e-4\n",
    "# model.fit_generator(batches_train,nb_epoch=5,validation_data=batches_val,\n",
    "#                          nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 4. Train simple convnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "class miniNet():\n",
    "    def __init__(self):\n",
    "        self.lr=1e-4\n",
    "        self.stop=EarlyStopping(patience=2)\n",
    "        self.get_model()\n",
    "    def ConvBlock(self,n):\n",
    "        model=self.model\n",
    "        model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
    "        model.add(BatchNormalization(axis=1))\n",
    "        model.add(Dropout(0.1))\n",
    "        model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
    "        model.add(BatchNormalization(axis=1))\n",
    "        model.add(Dropout(0.1))\n",
    "        model.add(MaxPooling2D())                          \n",
    "    def FlatBlock(self):\n",
    "        model=self.model\n",
    "        model.add(Flatten())\n",
    "        model.add(BatchNormalization()) \n",
    "        model.add(Dense(512,activation='relu'))\n",
    "        model.add(Dropout(0.6))\n",
    "        model.add(BatchNormalization())\n",
    "        model.add(Dense(256,activation='relu'))\n",
    "        model.add(Dropout(0.6))\n",
    "        model.add(BatchNormalization())        \n",
    "        model.add(Dense(categories,activation='softmax'))\n",
    "    def get_model(self):\n",
    "        self.model=Sequential()\n",
    "        self.model.add(BatchNormalization(input_shape=(3,224,224),axis=1)),\n",
    "        self.ConvBlock(64)\n",
    "        self.ConvBlock(128)\n",
    "        self.FlatBlock()\n",
    "        self.compile()\n",
    "        return self.model\n",
    "    def fit(self,train_batch,val_batch,nb_epoch=1):\n",
    "       \n",
    "        self.model.fit_generator(train_batch,nb_epoch=nb_epoch,\n",
    "                                 validation_data=val_batch,\n",
    "                                 nb_val_samples=val_batch.N,\n",
    "                                samples_per_epoch=train_batch.N,verbose=2,callbacks=[self.stop])\n",
    "    def summary(self):\n",
    "        self.model.summary()\n",
    "    def set_lr(self,lr):\n",
    "        self.lr=lr\n",
    "        self.model.optimizer.lr=lr\n",
    "    def compile(self):\n",
    "        self.model.compile(Adam(lr=self.lr),loss='categorical_crossentropy',metrics=['accuracy'])\n",
    "     \n",
    "#CN=miniNet()\n",
    "#CN.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#CN.model.save_weights('mini_cnn_1.dat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# CN.set_lr(1e-4)\n",
    "# CN.fit(batches_train,batches_val,1)\n",
    "\n",
    "# CN.set_lr(1e-3)\n",
    "# CN.fit(batches_train,batches_val,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# CN.set_lr(1e-4)\n",
    "# CN.fit(batches_train,batches_val,10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 5. Train VGG16bn model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from vgg16bn import Vgg16BN\n",
    "vgg=Vgg16BN()\n",
    "vgg.superfinetune(batches_train)\n",
    "def get_mod(batches_train):\n",
    "    mod=Vgg16BN()\n",
    "    mod.superfinetune(batches_train)\n",
    "    return mod"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def train(mo):\n",
    "    mo.fit(batches_train,batches_val,1)\n",
    "    mo.model.optimizer.lr=1e-3\n",
    "    mo.fit(batches_train,batches_val,1)\n",
    "    mo.model.optimizer.lr=1e-4\n",
    "    mo.fit(batches_train,batches_val,5)\n",
    "    mo.model.optimizer.lr=0.25e-4\n",
    "    mo.fit(batches_train,batches_val,20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model=vgg.model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Ensembling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "models = [get_mod(batches_train) for i in range(2)]\n",
    "for i,m in enumerate(models):\n",
    "    m.model.load_weights(path+'vgg16-'+str(i)+'.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for m in models:\n",
    "    train(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "for i,m in enumerate(models):\n",
    "    m.model.save_weights(path+'vgg16-'+str(i)+'.pkl')\n",
    "gen_test=ImageDataGenerator()\n",
    "   \n",
    "batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
    "                                            batch_size=64,shuffle=False)\n",
    "preds=np.array([m.model.predict_generator(batches_test,val_samples=batches_test.N) for m in models])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "preds=preds.mean(axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 6. Test and submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#gen_test=ImageDataGenerator()\n",
    "#batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
    "#                                            batch_size=64,shuffle=False)\n",
    "#preds=model.predict_generator(batches_test,val_samples=batches_test.N)\n",
    "preds=preds.clip(0.01,0.99)\n",
    "def name_batch(i):\n",
    "    return batches_test.filenames[i].split('/')[-1]\n",
    "def string_array(a):\n",
    "    return ','.join(str(x) for x in a)\n",
    "name='sub_vgg2_001_noaugmentation.csv'\n",
    "with open(name,'w') as f:\n",
    "    f.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\\n')\n",
    "    for i in range(len(batches_test.filenames)):\n",
    "        st=name_batch(i)+','+string_array(preds[i])+'\\n'\n",
    "        f.write(st);\n",
    "\n",
    "        \n",
    "#os.system('kg submit '+name+'-m `Basic net, more iterations`')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import IPython.display as dp\n",
    "\n",
    "dp.display(dp.FileLink(name))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"# Fisheries kaggle competition"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 1. Examine data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true,
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"import matplotlib.pyplot as plt\n",
	"import matplotlib.image as mpimg\n",
	"from glob import glob\n",
	"import numpy as np\n",
	"import seaborn as sns\n",
	"import keras\n",
	"from keras.models import Sequential\n",
	"from keras.layers import Dense,BatchNormalization,Convolution2D,Flatten,Dropout, MaxPooling2D\n",
	"from keras.utils.np_utils import to_categorical\n",
	"from keras.preprocessing.image import ImageDataGenerator\n",
	"from keras.optimizers import Adam\n",
	"from keras.callbacks import EarlyStopping\n",
	"def onehot(array):\n",
	" return to_categorical(array)\n",
	"USE_SAMPLE=False\n",
	"if USE_SAMPLE:\n",
	" path='data/sample/'\n",
	"else:\n",
	" path='data/'\n",
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# print(len(glob(path+'train_orig/**',recursive=True)))\n",
	"# print(len(glob(path+'test/**',recursive=True)))\n",
	"categories=8\n",
	"# print(categories)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"### Plots"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"#First, let's have a look at the data\n",
	"def plot_four(path):\n",
	" imgs=np.random.permutation(glob(path+'*'))[:4]\n",
	" plt.figure(figsize=(24,12))\n",
	" \n",
	" for i,img in enumerate(imgs):\n",
	" plt.subplot(2,2,i+1,)\n",
	" #plt.tight_layout()\n",
	" plt.imshow(mpimg.imread(imgs[i]))\n",
	" plt.gca().grid(False)\n",
	" plt.gca().get_yaxis().set_ticks([])\n",
	" plt.gca().get_xaxis().set_ticks([])\n",
	" plt.title(imgs[i],fontdict={'fontsize':20})\n",
	" plt.show()\n",
	"plot_four(path+'test/')\n",
	"plot_four(path+'train/ALB/')\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"### Shape "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"#Now, let's see the size of the data. I only do test and assume other data is similar\n",
	"def get_data_shape(path):\n",
	" imgs=np.random.permutation(glob(path+'*'))\n",
	" lens=[]\n",
	" for img in imgs:\n",
	" lens.append(mpimg.imread(img).shape)\n",
	" return np.array(lens)\n",
	"lens_test=get_data_shape(path+'test/')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# def count_rows(unique_items,array):\n",
	"# dic={}\n",
	"# for item in unique_items:\n",
	"# dic[str(item)]=(array==item).sum()\n",
	"# return dic\n",
	"# def unique_rows(a, **kwargs):\n",
	"\n",
	"# rowtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))\n",
	"# b = np.ascontiguousarray(a).view(rowtype)\n",
	"# return_index = kwargs.pop('return_index', False)\n",
	"# out = np.unique(b, return_index=True, **kwargs)\n",
	"# idx = out[1]\n",
	"# uvals = a[idx]\n",
	"# return uvals\n",
	"# shapes=count_rows(unique_rows(lens_test),lens_test)\n",
	"# from pprint import pprint\n",
	"# pprint(shapes)\n",
	"# plt.figure(figsize=(12,6))\n",
	"# plt.bar(range(len(shapes)),shapes.values());\n",
	"# plt.xticks(range(len(shapes)),[i[:9]+']' for i in shapes.keys()]);\n",
	"# # Would it make sense to use different nets? Maybe!"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 2. Create sample "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"import os\n",
	"import shutil\n",
	"from sklearn.model_selection import train_test_split\n",
	"#path=os.path.realpath('')+'/'+path\n",
	"# cats/dogs\n",
	"pre_run=1\n",
	"samplesize=0.1\n",
	"prop_train=0.9\n",
	"\n",
	"\n",
	"if(pre_run==1):\n",
	" shutil.rmtree(path+'sample',ignore_errors=1)\n",
	" shutil.rmtree(path+'valid',ignore_errors=1)\n",
	" shutil.rmtree(path+'train',ignore_errors=1)\n",
	" os.mkdir(path+'sample')\n",
	" os.mkdir(path+'valid')\n",
	" os.mkdir(path+'train')\n",
	" os.mkdir(path+'sample/train')\n",
	" os.mkdir(path+'sample/test')\n",
	" os.mkdir(path+'sample/valid')\n",
	" dirs=glob(path+'train_orig/*')\n",
	" for i in dirs: \n",
	" subdir=i.split('/')[-1]\n",
	" os.mkdir(path+'sample/train/'+subdir)\n",
	" os.mkdir(path+'train/'+subdir)\n",
	" os.mkdir(path+'sample/valid/'+subdir)\n",
	" os.mkdir(path+'valid/'+subdir)\n",
	" train,valid=train_test_split(os.listdir(i),train_size=prop_train,random_state=42)\n",
	" count=0\n",
	" for j in valid:\n",
	" if count<=samplesize*len(valid):\n",
	" shutil.copy(i+'/'+j,path+'sample/valid/'+subdir+'/'+j)\n",
	" count+=1\n",
	" shutil.copy(i+'/'+j,path+'valid/'+subdir+'/'+j)\n",
	" count=0\n",
	" for j in train:\n",
	" if count<=samplesize*len(train):\n",
	" shutil.copy(i+'/'+j,path+'sample/train/'+subdir+'/'+j)\n",
	" count+=1\n",
	" shutil.copy(i+'/'+j,path+'train/'+subdir+'/'+j)\n",
	" test_imgs=glob(path+'test/*')\n",
	" np.random.seed(42)\n",
	" sample_test=np.random.permutation(test_imgs)[:round(len(test_imgs)*samplesize)] \n",
	" for i in sample_test:\n",
	" shutil.copy(i,path+'sample/test/'+i.split('/')[-1])\n",
	" \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# pics=glob(path+'train/**',recursive=True)\n",
	"# print(len(pics))\n",
	"# pics=glob(path+'valid/**',recursive=True)\n",
	"# print(len(pics))\n",
	"# pics=glob(path+'sample/train/**',recursive=True)\n",
	"# print(len(pics))\n",
	"# pics=glob(path+'sample/valid/**',recursive=True)\n",
	"# print(len(pics))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"gen_train=ImageDataGenerator(rotation_range=10., width_shift_range=0.2, \n",
	" height_shift_range=0.2, shear_range=0.4, zoom_range=0.2, \n",
	" channel_shift_range=6.)\n",
	"gen_val=ImageDataGenerator()\n",
	"gen_train=ImageDataGenerator()\n",
	"batches_train=gen_train.flow_from_directory(path+'train',target_size=(224,224),\n",
	" batch_size=64,class_mode='categorical')\n",
	"batches_val=gen_val.flow_from_directory(path+'valid',target_size=(224,224),\n",
	" batch_size=64,class_mode='categorical',shuffle=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 3. Train simple neural net"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"\n",
	"categories=len(glob(path+'train_orig/*'))\n",
	"\n",
	"model=Sequential([\n",
	" BatchNormalization(input_shape=(3,224,224)),\n",
	" Flatten(),\n",
	" Dense(512,activation='relu'),\n",
	" BatchNormalization(),\n",
	" Dropout(0.2), \n",
	" Dense(categories,activation='softmax')\n",
	"])\n",
	"stop=EarlyStopping(patience=2)\n",
	"model.compile(Adam(lr=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])\n",
	"\n",
	"\n",
	"#model.load_weights('basic_linear.dat')\n",
	"def long_train(n):\n",
	" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
	" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
	" model.optimizer.lr=1e-3\n",
	" model.fit_generator(batches_train,nb_epoch=2,validation_data=batches_val,\n",
	" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
	" model.optimizer.lr=1e-2\n",
	" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
	" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
	" model.optimizer.lr=1e-4\n",
	" model.fit_generator(batches_train,nb_epoch=n,validation_data=batches_val,\n",
	" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
	"#long_train(20)\n",
	"#model.load_weights('basic_linear.dat')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# model.optimizer.lr=0.5e-4\n",
	"# model.fit_generator(batches_train,nb_epoch=5,validation_data=batches_val,\n",
	"# nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 4. Train simple convnet"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"class miniNet():\n",
	" def __init__(self):\n",
	" self.lr=1e-4\n",
	" self.stop=EarlyStopping(patience=2)\n",
	" self.get_model()\n",
	" def ConvBlock(self,n):\n",
	" model=self.model\n",
	" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
	" model.add(BatchNormalization(axis=1))\n",
	" model.add(Dropout(0.1))\n",
	" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
	" model.add(BatchNormalization(axis=1))\n",
	" model.add(Dropout(0.1))\n",
	" model.add(MaxPooling2D()) \n",
	" def FlatBlock(self):\n",
	" model=self.model\n",
	" model.add(Flatten())\n",
	" model.add(BatchNormalization()) \n",
	" model.add(Dense(512,activation='relu'))\n",
	" model.add(Dropout(0.6))\n",
	" model.add(BatchNormalization())\n",
	" model.add(Dense(256,activation='relu'))\n",
	" model.add(Dropout(0.6))\n",
	" model.add(BatchNormalization()) \n",
	" model.add(Dense(categories,activation='softmax'))\n",
	" def get_model(self):\n",
	" self.model=Sequential()\n",
	" self.model.add(BatchNormalization(input_shape=(3,224,224),axis=1)),\n",
	" self.ConvBlock(64)\n",
	" self.ConvBlock(128)\n",
	" self.FlatBlock()\n",
	" self.compile()\n",
	" return self.model\n",
	" def fit(self,train_batch,val_batch,nb_epoch=1):\n",
	" \n",
	" self.model.fit_generator(train_batch,nb_epoch=nb_epoch,\n",
	" validation_data=val_batch,\n",
	" nb_val_samples=val_batch.N,\n",
	" samples_per_epoch=train_batch.N,verbose=2,callbacks=[self.stop])\n",
	" def summary(self):\n",
	" self.model.summary()\n",
	" def set_lr(self,lr):\n",
	" self.lr=lr\n",
	" self.model.optimizer.lr=lr\n",
	" def compile(self):\n",
	" self.model.compile(Adam(lr=self.lr),loss='categorical_crossentropy',metrics=['accuracy'])\n",
	" \n",
	"#CN=miniNet()\n",
	"#CN.summary()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"#CN.model.save_weights('mini_cnn_1.dat')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# CN.set_lr(1e-4)\n",
	"# CN.fit(batches_train,batches_val,1)\n",
	"\n",
	"# CN.set_lr(1e-3)\n",
	"# CN.fit(batches_train,batches_val,1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# CN.set_lr(1e-4)\n",
	"# CN.fit(batches_train,batches_val,10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 5. Train VGG16bn model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"from vgg16bn import Vgg16BN\n",
	"vgg=Vgg16BN()\n",
	"vgg.superfinetune(batches_train)\n",
	"def get_mod(batches_train):\n",
	" mod=Vgg16BN()\n",
	" mod.superfinetune(batches_train)\n",
	" return mod"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"def train(mo):\n",
	" mo.fit(batches_train,batches_val,1)\n",
	" mo.model.optimizer.lr=1e-3\n",
	" mo.fit(batches_train,batches_val,1)\n",
	" mo.model.optimizer.lr=1e-4\n",
	" mo.fit(batches_train,batches_val,5)\n",
	" mo.model.optimizer.lr=0.25e-4\n",
	" mo.fit(batches_train,batches_val,20)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"model=vgg.model"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"### Ensembling"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"models = [get_mod(batches_train) for i in range(2)]\n",
	"for i,m in enumerate(models):\n",
	" m.model.load_weights(path+'vgg16-'+str(i)+'.pkl')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"for m in models:\n",
	" train(m)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"for i,m in enumerate(models):\n",
	" m.model.save_weights(path+'vgg16-'+str(i)+'.pkl')\n",
	"gen_test=ImageDataGenerator()\n",
	" \n",
	"batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
	" batch_size=64,shuffle=False)\n",
	"preds=np.array([m.model.predict_generator(batches_test,val_samples=batches_test.N) for m in models])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"preds=preds.mean(axis=0)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## 6. Test and submit"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"#gen_test=ImageDataGenerator()\n",
	"#batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
	"# batch_size=64,shuffle=False)\n",
	"#preds=model.predict_generator(batches_test,val_samples=batches_test.N)\n",
	"preds=preds.clip(0.01,0.99)\n",
	"def name_batch(i):\n",
	" return batches_test.filenames[i].split('/')[-1]\n",
	"def string_array(a):\n",
	" return ','.join(str(x) for x in a)\n",
	"name='sub_vgg2_001_noaugmentation.csv'\n",
	"with open(name,'w') as f:\n",
	" f.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\\n')\n",
	" for i in range(len(batches_test.filenames)):\n",
	" st=name_batch(i)+','+string_array(preds[i])+'\\n'\n",
	" f.write(st);\n",
	"\n",
	" \n",
	"#os.system('kg submit '+name+'-m `Basic net, more iterations`')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"import IPython.display as dp\n",
	"\n",
	"dp.display(dp.FileLink(name))\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda root]",
	"language": "python",
	"name": "conda-root-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}