Last active
March 4, 2017 18:22
-
-
Save artirj/254d8865bb70664244f1b3111d7518d4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"# Fisheries kaggle competition" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 1. Examine data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import matplotlib.image as mpimg\n", | |
"from glob import glob\n", | |
"import numpy as np\n", | |
"import seaborn as sns\n", | |
"import keras\n", | |
"from keras.models import Sequential\n", | |
"from keras.layers import Dense,BatchNormalization,Convolution2D,Flatten,Dropout, MaxPooling2D\n", | |
"from keras.utils.np_utils import to_categorical\n", | |
"from keras.preprocessing.image import ImageDataGenerator\n", | |
"from keras.optimizers import Adam\n", | |
"from keras.callbacks import EarlyStopping\n", | |
"def onehot(array):\n", | |
" return to_categorical(array)\n", | |
"USE_SAMPLE=False\n", | |
"if USE_SAMPLE:\n", | |
" path='data/sample/'\n", | |
"else:\n", | |
" path='data/'\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# print(len(glob(path+'train_orig/**',recursive=True)))\n", | |
"# print(len(glob(path+'test/**',recursive=True)))\n", | |
"categories=8\n", | |
"# print(categories)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"### Plots" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#First, let's have a look at the data\n", | |
"def plot_four(path):\n", | |
" imgs=np.random.permutation(glob(path+'*'))[:4]\n", | |
" plt.figure(figsize=(24,12))\n", | |
" \n", | |
" for i,img in enumerate(imgs):\n", | |
" plt.subplot(2,2,i+1,)\n", | |
" #plt.tight_layout()\n", | |
" plt.imshow(mpimg.imread(imgs[i]))\n", | |
" plt.gca().grid(False)\n", | |
" plt.gca().get_yaxis().set_ticks([])\n", | |
" plt.gca().get_xaxis().set_ticks([])\n", | |
" plt.title(imgs[i],fontdict={'fontsize':20})\n", | |
" plt.show()\n", | |
"plot_four(path+'test/')\n", | |
"plot_four(path+'train/ALB/')\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"### Shape " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Now, let's see the size of the data. I only do test and assume other data is similar\n", | |
"def get_data_shape(path):\n", | |
" imgs=np.random.permutation(glob(path+'*'))\n", | |
" lens=[]\n", | |
" for img in imgs:\n", | |
" lens.append(mpimg.imread(img).shape)\n", | |
" return np.array(lens)\n", | |
"lens_test=get_data_shape(path+'test/')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# def count_rows(unique_items,array):\n", | |
"# dic={}\n", | |
"# for item in unique_items:\n", | |
"# dic[str(item)]=(array==item).sum()\n", | |
"# return dic\n", | |
"# def unique_rows(a, **kwargs):\n", | |
"\n", | |
"# rowtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))\n", | |
"# b = np.ascontiguousarray(a).view(rowtype)\n", | |
"# return_index = kwargs.pop('return_index', False)\n", | |
"# out = np.unique(b, return_index=True, **kwargs)\n", | |
"# idx = out[1]\n", | |
"# uvals = a[idx]\n", | |
"# return uvals\n", | |
"# shapes=count_rows(unique_rows(lens_test),lens_test)\n", | |
"# from pprint import pprint\n", | |
"# pprint(shapes)\n", | |
"# plt.figure(figsize=(12,6))\n", | |
"# plt.bar(range(len(shapes)),shapes.values());\n", | |
"# plt.xticks(range(len(shapes)),[i[:9]+']' for i in shapes.keys()]);\n", | |
"# # Would it make sense to use different nets? Maybe!" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 2. Create sample " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import shutil\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"#path=os.path.realpath('')+'/'+path\n", | |
"# cats/dogs\n", | |
"pre_run=1\n", | |
"samplesize=0.1\n", | |
"prop_train=0.9\n", | |
"\n", | |
"\n", | |
"if(pre_run==1):\n", | |
" shutil.rmtree(path+'sample',ignore_errors=1)\n", | |
" shutil.rmtree(path+'valid',ignore_errors=1)\n", | |
" shutil.rmtree(path+'train',ignore_errors=1)\n", | |
" os.mkdir(path+'sample')\n", | |
" os.mkdir(path+'valid')\n", | |
" os.mkdir(path+'train')\n", | |
" os.mkdir(path+'sample/train')\n", | |
" os.mkdir(path+'sample/test')\n", | |
" os.mkdir(path+'sample/valid')\n", | |
" dirs=glob(path+'train_orig/*')\n", | |
" for i in dirs: \n", | |
" subdir=i.split('/')[-1]\n", | |
" os.mkdir(path+'sample/train/'+subdir)\n", | |
" os.mkdir(path+'train/'+subdir)\n", | |
" os.mkdir(path+'sample/valid/'+subdir)\n", | |
" os.mkdir(path+'valid/'+subdir)\n", | |
" train,valid=train_test_split(os.listdir(i),train_size=prop_train,random_state=42)\n", | |
" count=0\n", | |
" for j in valid:\n", | |
" if count<=samplesize*len(valid):\n", | |
" shutil.copy(i+'/'+j,path+'sample/valid/'+subdir+'/'+j)\n", | |
" count+=1\n", | |
" shutil.copy(i+'/'+j,path+'valid/'+subdir+'/'+j)\n", | |
" count=0\n", | |
" for j in train:\n", | |
" if count<=samplesize*len(train):\n", | |
" shutil.copy(i+'/'+j,path+'sample/train/'+subdir+'/'+j)\n", | |
" count+=1\n", | |
" shutil.copy(i+'/'+j,path+'train/'+subdir+'/'+j)\n", | |
" test_imgs=glob(path+'test/*')\n", | |
" np.random.seed(42)\n", | |
" sample_test=np.random.permutation(test_imgs)[:round(len(test_imgs)*samplesize)] \n", | |
" for i in sample_test:\n", | |
" shutil.copy(i,path+'sample/test/'+i.split('/')[-1])\n", | |
" \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# pics=glob(path+'train/**',recursive=True)\n", | |
"# print(len(pics))\n", | |
"# pics=glob(path+'valid/**',recursive=True)\n", | |
"# print(len(pics))\n", | |
"# pics=glob(path+'sample/train/**',recursive=True)\n", | |
"# print(len(pics))\n", | |
"# pics=glob(path+'sample/valid/**',recursive=True)\n", | |
"# print(len(pics))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"gen_train=ImageDataGenerator(rotation_range=10., width_shift_range=0.2, \n", | |
" height_shift_range=0.2, shear_range=0.4, zoom_range=0.2, \n", | |
" channel_shift_range=6.)\n", | |
"gen_val=ImageDataGenerator()\n", | |
"gen_train=ImageDataGenerator()\n", | |
"batches_train=gen_train.flow_from_directory(path+'train',target_size=(224,224),\n", | |
" batch_size=64,class_mode='categorical')\n", | |
"batches_val=gen_val.flow_from_directory(path+'valid',target_size=(224,224),\n", | |
" batch_size=64,class_mode='categorical',shuffle=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 3. Train simple neural net" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"categories=len(glob(path+'train_orig/*'))\n", | |
"\n", | |
"model=Sequential([\n", | |
" BatchNormalization(input_shape=(3,224,224)),\n", | |
" Flatten(),\n", | |
" Dense(512,activation='relu'),\n", | |
" BatchNormalization(),\n", | |
" Dropout(0.2), \n", | |
" Dense(categories,activation='softmax')\n", | |
"])\n", | |
"stop=EarlyStopping(patience=2)\n", | |
"model.compile(Adam(lr=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])\n", | |
"\n", | |
"\n", | |
"#model.load_weights('basic_linear.dat')\n", | |
"def long_train(n):\n", | |
" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n", | |
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n", | |
" model.optimizer.lr=1e-3\n", | |
" model.fit_generator(batches_train,nb_epoch=2,validation_data=batches_val,\n", | |
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n", | |
" model.optimizer.lr=1e-2\n", | |
" model.fit_generator(batches_train,nb_epoch=1,validation_data=batches_val,\n", | |
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n", | |
" model.optimizer.lr=1e-4\n", | |
" model.fit_generator(batches_train,nb_epoch=n,validation_data=batches_val,\n", | |
" nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n", | |
"#long_train(20)\n", | |
"#model.load_weights('basic_linear.dat')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# model.optimizer.lr=0.5e-4\n", | |
"# model.fit_generator(batches_train,nb_epoch=5,validation_data=batches_val,\n", | |
"# nb_val_samples=batches_val.N,samples_per_epoch=batches_train.N,callbacks=[stop],verbose=2)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 4. Train simple convnet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"class miniNet():\n", | |
" def __init__(self):\n", | |
" self.lr=1e-4\n", | |
" self.stop=EarlyStopping(patience=2)\n", | |
" self.get_model()\n", | |
" def ConvBlock(self,n):\n", | |
" model=self.model\n", | |
" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n", | |
" model.add(BatchNormalization(axis=1))\n", | |
" model.add(Dropout(0.1))\n", | |
" model.add(Convolution2D(n,3,3,activation='relu',border_mode='same'))\n", | |
" model.add(BatchNormalization(axis=1))\n", | |
" model.add(Dropout(0.1))\n", | |
" model.add(MaxPooling2D()) \n", | |
" def FlatBlock(self):\n", | |
" model=self.model\n", | |
" model.add(Flatten())\n", | |
" model.add(BatchNormalization()) \n", | |
" model.add(Dense(512,activation='relu'))\n", | |
" model.add(Dropout(0.6))\n", | |
" model.add(BatchNormalization())\n", | |
" model.add(Dense(256,activation='relu'))\n", | |
" model.add(Dropout(0.6))\n", | |
" model.add(BatchNormalization()) \n", | |
" model.add(Dense(categories,activation='softmax'))\n", | |
" def get_model(self):\n", | |
" self.model=Sequential()\n", | |
" self.model.add(BatchNormalization(input_shape=(3,224,224),axis=1)),\n", | |
" self.ConvBlock(64)\n", | |
" self.ConvBlock(128)\n", | |
" self.FlatBlock()\n", | |
" self.compile()\n", | |
" return self.model\n", | |
" def fit(self,train_batch,val_batch,nb_epoch=1):\n", | |
" \n", | |
" self.model.fit_generator(train_batch,nb_epoch=nb_epoch,\n", | |
" validation_data=val_batch,\n", | |
" nb_val_samples=val_batch.N,\n", | |
" samples_per_epoch=train_batch.N,verbose=2,callbacks=[self.stop])\n", | |
" def summary(self):\n", | |
" self.model.summary()\n", | |
" def set_lr(self,lr):\n", | |
" self.lr=lr\n", | |
" self.model.optimizer.lr=lr\n", | |
" def compile(self):\n", | |
" self.model.compile(Adam(lr=self.lr),loss='categorical_crossentropy',metrics=['accuracy'])\n", | |
" \n", | |
"#CN=miniNet()\n", | |
"#CN.summary()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#CN.model.save_weights('mini_cnn_1.dat')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# CN.set_lr(1e-4)\n", | |
"# CN.fit(batches_train,batches_val,1)\n", | |
"\n", | |
"# CN.set_lr(1e-3)\n", | |
"# CN.fit(batches_train,batches_val,1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# CN.set_lr(1e-4)\n", | |
"# CN.fit(batches_train,batches_val,10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 5. Train VGG16bn model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from vgg16bn import Vgg16BN\n", | |
"vgg=Vgg16BN()\n", | |
"vgg.superfinetune(batches_train)\n", | |
"def get_mod(batches_train):\n", | |
" mod=Vgg16BN()\n", | |
" mod.superfinetune(batches_train)\n", | |
" return mod" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def train(mo):\n", | |
" mo.fit(batches_train,batches_val,1)\n", | |
" mo.model.optimizer.lr=1e-3\n", | |
" mo.fit(batches_train,batches_val,1)\n", | |
" mo.model.optimizer.lr=1e-4\n", | |
" mo.fit(batches_train,batches_val,5)\n", | |
" mo.model.optimizer.lr=0.25e-4\n", | |
" mo.fit(batches_train,batches_val,20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"model=vgg.model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"### Ensembling" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"models = [get_mod(batches_train) for i in range(2)]\n", | |
"for i,m in enumerate(models):\n", | |
" m.model.load_weights(path+'vgg16-'+str(i)+'.pkl')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"for m in models:\n", | |
" train(m)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"for i,m in enumerate(models):\n", | |
" m.model.save_weights(path+'vgg16-'+str(i)+'.pkl')\n", | |
"gen_test=ImageDataGenerator()\n", | |
" \n", | |
"batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n", | |
" batch_size=64,shuffle=False)\n", | |
"preds=np.array([m.model.predict_generator(batches_test,val_samples=batches_test.N) for m in models])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"preds=preds.mean(axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"## 6. Test and submit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#gen_test=ImageDataGenerator()\n", | |
"#batches_test=gen_test.flow_from_directory(path+'pretest',target_size=(224,224),\n", | |
"# batch_size=64,shuffle=False)\n", | |
"#preds=model.predict_generator(batches_test,val_samples=batches_test.N)\n", | |
"preds=preds.clip(0.01,0.99)\n", | |
"def name_batch(i):\n", | |
" return batches_test.filenames[i].split('/')[-1]\n", | |
"def string_array(a):\n", | |
" return ','.join(str(x) for x in a)\n", | |
"name='sub_vgg2_001_noaugmentation.csv'\n", | |
"with open(name,'w') as f:\n", | |
" f.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\\n')\n", | |
" for i in range(len(batches_test.filenames)):\n", | |
" st=name_batch(i)+','+string_array(preds[i])+'\\n'\n", | |
" f.write(st);\n", | |
"\n", | |
" \n", | |
"#os.system('kg submit '+name+'-m `Basic net, more iterations`')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import IPython.display as dp\n", | |
"\n", | |
"dp.display(dp.FileLink(name))\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda root]", | |
"language": "python", | |
"name": "conda-root-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment