{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Fisheries kaggle competition"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 1. Examine data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": false
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib.image as mpimg\n",
"from glob import glob\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import keras\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense,BatchNormalization,Convolution2D,Flatten,Dropout, MaxPooling2D\n",
"from keras.utils.np_utils import to_categorical\n",
"from keras.preprocessing.image import ImageDataGenerator\n",
"from keras.optimizers import Adam\n",
"from keras.callbacks import EarlyStopping\n",
"def onehot(array):\n",
" return to_categorical(array)\n",
"USE_SAMPLE=False\n",
"if USE_SAMPLE:\n",
" path='data/sample/'\n",
"else:\n",
" path='data/'\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# print(len(glob(path+'train_orig/**',recursive=True)))\n",
"# print(len(glob(path+'test/**',recursive=True)))\n",
"categories=8\n",
"# print(categories)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Plots"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"#First, let's have a look at the data\n",
"def plot_four(path):\n",
" imgs=np.random.permutation(glob(path+'*'))[:4]\n",
" plt.figure(figsize=(24,12))\n",
" \n",
" for i,img in enumerate(imgs):\n",
" plt.subplot(2,2,i+1,)\n",
" #plt.tight_layout()\n",
" plt.imshow(mpimg.imread(imgs[i]))\n",
" plt.gca().grid(False)\n",
" plt.gca().get_yaxis().set_ticks([])\n",
" plt.gca().get_xaxis().set_ticks([])\n",
" plt.title(imgs[i],fontdict={'fontsize':20})\n",
" plt.show()\n",
"plot_four(path+'test/')\n",
"plot_four(path+'train/ALB/')\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Shape "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"#Now, let's see the size of the data. I only do test and assume other data is similar\n",
"def get_data_shape(path):\n",
" imgs=np.random.permutation(glob(path+'*'))\n",
" lens=[]\n",
" for img in imgs:\n",
" lens.append(mpimg.imread(img).shape)\n",
" return np.array(lens)\n",
"lens_test=get_data_shape(path+'test/')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# def count_rows(unique_items,array):\n",
"# dic={}\n",
"# for item in unique_items:\n",
"# dic[str(item)]=(array==item).sum()\n",
"# return dic\n",
"# def unique_rows(a, **kwargs):\n",
"\n",
"# rowtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))\n",
"# b = np.ascontiguousarray(a).view(rowtype)\n",
"# return_index = kwargs.pop('return_index', False)\n",
"# out = np.unique(b, return_index=True, **kwargs)\n",
"# idx = out[1]\n",
"# uvals = a[idx]\n",
"# return uvals\n",
"# shapes=count_rows(unique_rows(lens_test),lens_test)\n",
"# from pprint import pprint\n",
"# pprint(shapes)\n",
"# plt.figure(figsize=(12,6))\n",
"# plt.bar(range(len(shapes)),shapes.values());\n",
"# plt.xticks(range(len(shapes)),[i[:9]+']' for i in shapes.keys()]);\n",
"# # Would it make sense to use different nets? Maybe!"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 2. Create sample "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"from sklearn.model_selection import train_test_split\n",
"#path=os.path.realpath('')+'/'+path\n",
"# cats/dogs\n",
"pre_run=1\n",
"samplesize=0.1\n",
"prop_train=0.9\n",
"\n",
"\n",
"if(pre_run==1):\n",
" shutil.rmtree(path+'sample',ignore_errors=1)\n",
" shutil.rmtree(path+'valid',ignore_errors=1)\n",
" shutil.rmtree(path+'train',ignore_errors=1)\n",
" os.mkdir(path+'sample')\n",
" os.mkdir(path+'valid')\n",
" os.mkdir(path+'train')\n",
" os.mkdir(path+'sample/train')\n",
" os.mkdir(path+'sample/test')\n",
" os.mkdir(path+'sample/valid')\n",
" dirs=glob(path+'train_orig/*')\n",
" for i in dirs: \n",
" subdir=i.split('/')[-1]\n",
" os.mkdir(path+'sample/train/'+subdir)\n",
" os.mkdir(path+'train/'+subdir)\n",
" os.mkdir(path+'sample/valid/'+subdir)\n",
" os.mkdir(path+'valid/'+subdir)\n",
" train,valid=train_test_split(os.listdir(i),train_size=prop_train,random_state=42)\n",
" count=0\n",
" for j in valid:\n",
" if count<=samplesize*len(valid):\n",
" shutil.copy(i+'/'+j,path+'sample/valid/'+subdir+'/'+j)\n",
" count+=1\n",
" shutil.copy(i+'/'+j,path+'valid/'+subdir+'/'+j)\n",
" count=0\n",
" for j in train:\n",
" if count<=samplesize*len(train):\n",
" shutil.copy(i+'/'+j,path+'sample/train/'+subdir+'/'+j)\n",
" count+=1\n",
" shutil.copy(i+'/'+j,path+'train/'+subdir+'/'+j)\n",
" test_imgs=glob(path+'test/*')\n",
" np.random.seed(42)\n",
" sample_test=np.random.permutation(test_imgs)[:round(len(test_imgs)*samplesize)] \n",
" for i in sample_test:\n",
" shutil.copy(i,path+'sample/test/'+i.split('/')[-1])\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# pics=glob(path+'train/**',recursive=True)\n",
"# print(len(pics))\n",
"# pics=glob(path+'valid/**',recursive=True)\n",
"# print(len(pics))\n",
"# pics=glob(path+'sample/train/**',recursive=True)\n",
"# print(len(pics))\n",
"# pics=glob(path+'sample/valid/**',recursive=True)\n",
"# print(len(pics))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"gen_train=ImageDataGenerator(rotation_range=10., width_shift_range=0.2, \n",
" height_shift_range=0.2, shear_range=0.4, zoom_range=0.2, \n",
" channel_shift_range=6.)\n",
"gen_val=ImageDataGenerator()\n",
"gen_train=ImageDataGenerator()\n",
"batches_train=gen_train.flow_from_directory(path+'train',target_size=(224,224),\n",
" batch_size=64,class_mode='categorical')\n",
"batches_val=gen_val.flow_from_directory(path+'valid',target_size=(224,224),\n",
" batch_size=64,class_mode='categorical',shuffle=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 3. Train simple neural net"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [],
"source": [
"\n",
"categories=len(glob(path+'train_orig/*'))\n",
"\n",
"model=Sequential([\n",
" BatchNormalization(input_shape=(3,224,224)),\n",
" Flatten(),\n",
" Dense(512,activation='relu'),\n",
" BatchNormalization(),\n",
" Dropout(0.2), \n",
" Dense(categories,activation='softmax')\n",
"])\n",
"stop=EarlyStopping(patience=2)\n",
"model.compile(Adam(lr=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])\n",
"\n",
"\n",
"#model.load_weights('basic_linear.dat')\n",
"def long_train(n):\n",
" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
" model.optimizer.lr=1e-3\n",
" model.fit_generator(batches_train,nb_epoch=2,validation_data=batches_val,\n",
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
" model.optimizer.lr=1e-2\n",
" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n",
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
" model.optimizer.lr=1e-4\n",
" model.fit_generator(batches_train,nb_epoch=n,validation_data=batches_val,\n",
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n",
"#long_train(20)\n",
"#model.load_weights('basic_linear.dat')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# model.optimizer.lr=0.5e-4\n",
"# model.fit_generator(batches_train,nb_epoch=5,validation_data=batches_val,\n",
"# nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 4. Train simple convnet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"class miniNet():\n",
" def __init__(self):\n",
" self.lr=1e-4\n",
" self.stop=EarlyStopping(patience=2)\n",
" self.get_model()\n",
" def ConvBlock(self,n):\n",
" model=self.model\n",
" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
" model.add(BatchNormalization(axis=1))\n",
" model.add(Dropout(0.1))\n",
" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n",
" model.add(BatchNormalization(axis=1))\n",
" model.add(Dropout(0.1))\n",
" model.add(MaxPooling2D()) \n",
" def FlatBlock(self):\n",
" model=self.model\n",
" model.add(Flatten())\n",
" model.add(BatchNormalization()) \n",
" model.add(Dense(512,activation='relu'))\n",
" model.add(Dropout(0.6))\n",
" model.add(BatchNormalization())\n",
" model.add(Dense(256,activation='relu'))\n",
" model.add(Dropout(0.6))\n",
" model.add(BatchNormalization()) \n",
" model.add(Dense(categories,activation='softmax'))\n",
" def get_model(self):\n",
" self.model=Sequential()\n",
" self.model.add(BatchNormalization(input_shape=(3,224,224),axis=1)),\n",
" self.ConvBlock(64)\n",
" self.ConvBlock(128)\n",
" self.FlatBlock()\n",
" self.compile()\n",
" return self.model\n",
" def fit(self,train_batch,val_batch,nb_epoch=1):\n",
" \n",
" self.model.fit_generator(train_batch,nb_epoch=nb_epoch,\n",
" validation_data=val_batch,\n",
" nb_val_samples=val_batch.N,\n",
" samples_per_epoch=train_batch.N,verbose=2,callbacks=[self.stop])\n",
" def summary(self):\n",
" self.model.summary()\n",
" def set_lr(self,lr):\n",
" self.lr=lr\n",
" self.model.optimizer.lr=lr\n",
" def compile(self):\n",
" self.model.compile(Adam(lr=self.lr),loss='categorical_crossentropy',metrics=['accuracy'])\n",
" \n",
"#CN=miniNet()\n",
"#CN.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"#CN.model.save_weights('mini_cnn_1.dat')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# CN.set_lr(1e-4)\n",
"# CN.fit(batches_train,batches_val,1)\n",
"\n",
"# CN.set_lr(1e-3)\n",
"# CN.fit(batches_train,batches_val,1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# CN.set_lr(1e-4)\n",
"# CN.fit(batches_train,batches_val,10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 5. Train VGG16bn model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from vgg16bn import Vgg16BN\n",
"vgg=Vgg16BN()\n",
"vgg.superfinetune(batches_train)\n",
"def get_mod(batches_train):\n",
" mod=Vgg16BN()\n",
" mod.superfinetune(batches_train)\n",
" return mod"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [],
"source": [
"def train(mo):\n",
" mo.fit(batches_train,batches_val,1)\n",
" mo.model.optimizer.lr=1e-3\n",
" mo.fit(batches_train,batches_val,1)\n",
" mo.model.optimizer.lr=1e-4\n",
" mo.fit(batches_train,batches_val,5)\n",
" mo.model.optimizer.lr=0.25e-4\n",
" mo.fit(batches_train,batches_val,20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"model=vgg.model"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Ensembling"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"models = [get_mod(batches_train) for i in range(2)]\n",
"for i,m in enumerate(models):\n",
" m.model.load_weights(path+'vgg16-'+str(i)+'.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [],
"source": [
"for m in models:\n",
" train(m)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"for i,m in enumerate(models):\n",
" m.model.save_weights(path+'vgg16-'+str(i)+'.pkl')\n",
"gen_test=ImageDataGenerator()\n",
" \n",
"batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
" batch_size=64,shuffle=False)\n",
"preds=np.array([m.model.predict_generator(batches_test,val_samples=batches_test.N) for m in models])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"preds=preds.mean(axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## 6. Test and submit"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"#gen_test=ImageDataGenerator()\n",
"#batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n",
"# batch_size=64,shuffle=False)\n",
"#preds=model.predict_generator(batches_test,val_samples=batches_test.N)\n",
"preds=preds.clip(0.01,0.99)\n",
"def name_batch(i):\n",
" return batches_test.filenames[i].split('/')[-1]\n",
"def string_array(a):\n",
" return ','.join(str(x) for x in a)\n",
"name='sub_vgg2_001_noaugmentation.csv'\n",
"with open(name,'w') as f:\n",
" f.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\\n')\n",
" for i in range(len(batches_test.filenames)):\n",
" st=name_batch(i)+','+string_array(preds[i])+'\\n'\n",
" f.write(st);\n",
"\n",
" \n",
"#os.system('kg submit '+name+'-m `Basic net, more iterations`')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import IPython.display as dp\n",
"\n",
"dp.display(dp.FileLink(name))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}