Skip to content

Instantly share code, notes, and snippets.

@kanhua
Created October 23, 2014 15:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kanhua/bf4a75884f3cb2934364 to your computer and use it in GitHub Desktop.
Save kanhua/bf4a75884f3cb2934364 to your computer and use it in GitHub Desktop.
Submit Kaggle Titanic results (not optimised!)
{
"metadata": {
"language": "Julia",
"name": "",
"signature": "sha256:999e48e1cfeb072e56d4089a89783a9483f6c014acc704990d74ba7079a57434"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"using Gadfly\n",
"using DataFrames\n",
"using DecisionTree"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Load a Titanic passenger CSV and prepare it for modelling.\n",
"#\n",
"# Returns a tuple `(features, ids)`:\n",
"#   features - DataFrame with columns Pclass, Age, Sex, SibSp, Parch, Fare, Embarked\n",
"#   ids      - one-column DataFrame holding PassengerId\n",
"# Missing Age and Fare values are replaced by the mean of the observed values in\n",
"# the same file; missing Embarked values are replaced by \"S\".\n",
"function cleanData(filename)\n",
"    passengers = readtable(filename)\n",
"\n",
"    # Store Sex and Pclass as pooled columns.\n",
"    pool!(passengers, [:Sex])\n",
"    pool!(passengers, [:Pclass])\n",
"\n",
"    # Age: fill NA entries with the mean of the known ages.\n",
"    ageKnown = !isna(passengers[:Age])\n",
"    meanAge = mean(passengers[ageKnown, :Age])\n",
"    passengers[:Age] = array(passengers[:Age], meanAge)\n",
"\n",
"    # Fare: same treatment, but only when the column actually contains NAs.\n",
"    fareMissing = isna(passengers[:Fare])\n",
"    if any(fareMissing)\n",
"        meanFare = mean(passengers[!fareMissing, :Fare])\n",
"        passengers[:Fare] = array(passengers[:Fare], meanFare)\n",
"    end\n",
"\n",
"    # Embarked: fill NA entries with \"S\" before pooling the column.\n",
"    passengers[:Embarked] = array(passengers[:Embarked], utf8(\"S\"))\n",
"    pool!(passengers, [:Embarked])\n",
"\n",
"    featureColumns = [:Pclass, :Age, :Sex, :SibSp, :Parch, :Fare, :Embarked]\n",
"    features = passengers[:, featureColumns]\n",
"    ids = passengers[:, [:PassengerId]]\n",
"    return features, ids\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"cleanData (generic function with 1 method)"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Read `filename` and return its Survived column after pooling it.\n",
"function readylabel(filename)\n",
"    labelled = readtable(filename)\n",
"    pool!(labelled, [:Survived])\n",
"    labelled[:Survived]\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"readylabel (generic function with 1 method)"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Cleaned features and passenger ids for both files; labels come from the training file.\n",
"(xTrain, idTrain) = cleanData(\"train.csv\")\n",
"(xTest, idTest) = cleanData(\"test.csv\")\n",
"yTrain = readylabel(\"train.csv\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"891-element PooledDataArray{Int64,Uint8,1}:\n",
" 0\n",
" 1\n",
" 1\n",
" 1\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 1\n",
" 1\n",
" 1\n",
" 1\n",
" 0\n",
" \u22ee\n",
" 1\n",
" 1\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 1\n",
" 0\n",
" 1\n",
" 0"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sanity check on the cleaned test features: every column should report zero NAs.\n",
"describe(xTest)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Pclass\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Min 1.0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"1st Qu. 1.0\n",
"Median 3.0\n",
"Mean 2.2655502392344498\n",
"3rd Qu. 3.0\n",
"Max 3.0\n",
"NAs 0\n",
"NA% 0.0%\n",
"\n",
"Age\n",
"Min 0.17\n",
"1st Qu. 23.0\n",
"Median 30.272590361445783\n",
"Mean 30.272590361445793\n",
"3rd Qu. 35.75\n",
"Max 76.0\n",
"NAs 0\n",
"NA% 0.0%\n",
"\n",
"Sex\n",
"Length 418"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Type Pooled UTF8String\n",
"NAs 0\n",
"NA% 0.0%\n",
"Unique 2\n",
"\n",
"SibSp\n",
"Min 0.0\n",
"1st Qu. 0.0\n",
"Median 0.0\n",
"Mean 0.4473684210526316\n",
"3rd Qu. 1.0\n",
"Max 8.0\n",
"NAs 0\n",
"NA% 0.0%\n",
"\n",
"Parch\n",
"Min 0.0\n",
"1st Qu. 0.0\n",
"Median 0.0\n",
"Mean 0.3923444976076555\n",
"3rd Qu. 0.0\n",
"Max 9.0\n",
"NAs 0\n",
"NA% 0.0%\n",
"\n",
"Fare\n",
"Min 0.0\n",
"1st Qu. 7.8958\n",
"Median 14.4542\n",
"Mean 35.627188489208635\n",
"3rd Qu. 31.5\n",
"Max 512.3292\n",
"NAs 0\n",
"NA% 0.0%\n",
"\n",
"Embarked\n",
"Length 418\n",
"Type Pooled UTF8String\n",
"NAs 0\n",
"NA% 0.0%\n",
"Unique 3\n",
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Unwrap the DataFrame / DataArray containers into plain Julia arrays before\n",
"# passing them to DecisionTree. Run once: the names are rebound to the arrays.\n",
"xTrain = array(xTrain)\n",
"yTrain = array(yTrain)\n",
"xTest = array(xTest)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"418x7 Array{Any,2}:\n",
" 3 34.5 \"male\" 0 0 7.8292 \"Q\"\n",
" 3 47.0 \"female\" 1 0 7.0 \"S\"\n",
" 2 62.0 \"male\" 0 0 9.6875 \"Q\"\n",
" 3 27.0 \"male\" 0 0 8.6625 \"S\"\n",
" 3 22.0 \"female\" 1 1 12.2875 \"S\"\n",
" 3 14.0 \"male\" 0 0 9.225 \"S\"\n",
" 3 30.0 \"female\" 0 0 7.6292 \"Q\"\n",
" 2 26.0 \"male\" 1 1 29.0 \"S\"\n",
" 3 18.0 \"female\" 0 0 7.2292 \"C\"\n",
" 3 21.0 \"male\" 2 0 24.15 \"S\"\n",
" 3 30.2726 \"male\" 0 0 7.8958 \"S\"\n",
" 1 46.0 \"male\" 0 0 26.0 \"S\"\n",
" 1 23.0 \"female\" 1 0 82.2667 \"S\"\n",
" \u22ee \u22ee \n",
" 2 23.0 \"male\" 1 0 10.5 \"S\"\n",
" 1 50.0 \"male\" 1 1 211.5 \"C\"\n",
" 3 30.2726 \"female\" 0 0 7.7208 \"Q\"\n",
" 3 3.0 \"female\" 1 1 13.775 \"S\"\n",
" 3 30.2726 \"female\" 0 0 7.75 \"Q\"\n",
" 1 37.0 \"female\" 1 0 90.0 \"Q\"\n",
" 3 28.0 \"female\" 0 0 7.775 \"S\"\n",
" 3 30.2726 \"male\" 0 0 8.05 \"S\"\n",
" 1 39.0 \"female\" 0 0 108.9 \"C\"\n",
" 3 38.5 \"male\" 0 0 7.25 \"S\"\n",
" 3 30.2726 \"male\" 0 0 8.05 \"S\"\n",
" 3 30.2726 \"male\" 1 1 22.3583 \"C\""
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Cross-validate a random forest. The settings are named here in the positional\n",
"# order nfoldCV_forest takes them: random features, trees, folds, and the portion\n",
"# of samples used per tree.\n",
"accuracy = let nSubFeatures = 5, nTrees = 20, nFolds = 4, sampleFraction = 0.7\n",
"    nfoldCV_forest(yTrain, xTrain, nSubFeatures, nTrees, nFolds, sampleFraction)\n",
"end;"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Fold "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1\n",
"Classes: {0,1}\n",
"Matrix: \n",
"["
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"115 14\n",
" 27 66]\n",
"Accuracy: 0.8153153153153153\n",
"Kappa: 0.6131089007906146\n",
"\n",
"Fold "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"2\n",
"Classes: {0,1}\n",
"Matrix: \n",
"[122 17\n",
" 26 57]\n",
"Accuracy: 0.8063063063063063\n",
"Kappa: 0.5770491803278688\n",
"\n",
"Fold "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"3\n",
"Classes: {0,1}\n",
"Matrix: \n",
"[115 18\n",
" 24 65]\n",
"Accuracy: 0.8108108108108109\n",
"Kappa: 0.6017086715079027\n",
"\n",
"Fold "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"4\n",
"Classes: {0,1}\n",
"Matrix: \n",
"[125 21\n",
" 20 56]\n",
"Accuracy: 0.8153153153153153\n",
"Kappa: 0.591141856077621\n",
"\n",
"Mean Accuracy: 0.8119369369369369\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Build random forest model"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Fit the forest on the full training set: random features, trees, and the portion\n",
"# of samples used per tree, in build_forest's positional order.\n",
"model = let nSubFeatures = 5, nTrees = 20, sampleFraction = 0.7\n",
"    build_forest(yTrain, xTrain, nSubFeatures, nTrees, sampleFraction)\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"Ensemble of Decision Trees\n",
"Trees: 20\n",
"Avg Leaves: 108.9\n",
"Avg Depth: 18.75"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Predict a Survived label for every row of the test features.\n",
"predy = apply_forest(model, xTest)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"418-element Array{Any,1}:\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 0\n",
" 1\n",
" 0\n",
" 1\n",
" 0\n",
" 0\n",
" 0\n",
" 1\n",
" \u22ee\n",
" 0\n",
" 0\n",
" 1\n",
" 1\n",
" 1\n",
" 1\n",
" 1\n",
" 0\n",
" 1\n",
" 0\n",
" 0\n",
" 1"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Wrap the predictions in a one-column DataFrame named Survived.\n",
"# `DataArray` has no keyword constructor (the original call raised a LoadError);\n",
"# `DataFrame` does. The comprehension narrows the Array{Any} predictions to Int.\n",
"predydf = DataFrame(Survived=Int[label for label in predy])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Pair every test PassengerId with its predicted label. The Survived column is\n",
"# built from `predy` directly, so this cell only needs the prediction cell to\n",
"# have run.\n",
"resultdf = hcat(idTest, DataFrame(Survived=Int[label for label in predy]))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Write the submission file. Kaggle expects a header row naming the two columns;\n",
"# the `open ... do` form closes the file even if writing fails.\n",
"open(\"output.csv\", \"w\") do io\n",
"    println(io, \"PassengerId,Survived\")\n",
"    writedlm(io, [array(idTest) predy], ',')\n",
"end"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"convert(Int64,\"3\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "LoadError",
"evalue": "`convert` has no method matching convert(::Type{Int64}, ::ASCIIString)\nwhile loading In[14], in expression starting on line 1",
"output_type": "pyerr",
"traceback": [
"`convert` has no method matching convert(::Type{Int64}, ::ASCIIString)\nwhile loading In[14], in expression starting on line 1",
"",
" in convert at base.jl:13"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Parsing, unlike `convert`, turns the string \"3\" into the integer 3.\n",
"parseint(\"3\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"3"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# A plain string literal is an ASCIIString (a concrete type, so `isa` is an exact test).\n",
"isa(\"3\", ASCIIString)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 18,
"text": [
"true"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment