Skip to content

Instantly share code, notes, and snippets.

@akhilstanis
Created September 20, 2016 22:16
Show Gist options
  • Save akhilstanis/67f87a13cbffcb2f9943410c97900f47 to your computer and use it in GitHub Desktop.
Save akhilstanis/67f87a13cbffcb2f9943410c97900f47 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "c6090d32-5b99-4774-8596-db77ec6af374"
}
},
"source": [
"# Titanic Dataset Assignment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "4941339c-affb-404a-8bc9-49c0c3c0ea94"
}
},
"outputs": [
{
"data": {
"text/plain": [
"891"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"require 'csv'\n",
"require 'open-uri'\n",
"\n",
"csv = open(\"https://gist.githubusercontent.com/akhilstanislavose/46023ee10af448b9bb6a9656624cd03c/raw/ce0be9103e55f401f895b153ae6fc293cb91c241/titanic.csv\").read\n",
"dataset = CSV.parse(csv, :headers => true)\n",
"\n",
"dataset.count"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "59b5d37d-f935-4930-bf6f-afefe72d77b2"
}
},
"outputs": [
{
"data": {
"text/plain": [
":purity"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def entropy(probablities)\n",
" -1 * probablities.reduce(0.0) { |sum,p| sum += p > 0 ? p * Math.log2(p) : 0 }\n",
"end\n",
"\n",
"def gini(probablities)\n",
" probablities.reduce(0.0) { |sum,p| sum += p * (1 - p) }\n",
"end\n",
"\n",
"def purity(mixtures, &block)\n",
" purity = mixtures.reduce(0.0) do |sum,m|\n",
" size = m.reduce(:+).to_f\n",
" sum += size > 0 ? size * yield(m.collect { |n| n/size }) : 0\n",
" end\n",
" purity / mixtures.flatten.reduce(:+)\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "bb14d4f6-d34e-4adb-b16d-5787f97757cf"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entropy of dataset = 0.9607079018756469\n",
"Gini of dataset = 0.4730129578614427\n"
]
}
],
"source": [
"died = dataset.select { |e| e['survived'] == '0' }.count\n",
"survived = dataset.select { |e| e['survived'] == '1' }.count\n",
"\n",
"dataset_entropy = purity([[died,survived]]) { |ps| entropy(ps) }\n",
"dataset_gini = purity([[died,survived]]) { |ps| gini(ps) }\n",
"\n",
"puts \"Entropy of dataset = #{dataset_entropy}\"\n",
"puts \"Gini of dataset = #{dataset_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "991dce32-a1e6-4965-b0c1-5c75ea66f289"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after gender split using Entropy = 0.2176601066606143\n",
"Information Gain after gender split using Gini = 0.13964795747285225\n"
]
}
],
"source": [
"survived_female = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' }.count\n",
"survived_male = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' }.count\n",
"\n",
"died_female = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' }.count\n",
"died_male = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' }.count\n",
"\n",
"gender_split = [[survived_female, died_female],[survived_male, died_male]]\n",
"\n",
"gender_split_entropy = purity(gender_split) { |ps| entropy(ps) }\n",
"gender_split_gini = purity(gender_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after gender split using Entropy = #{dataset_entropy - gender_split_entropy}\"\n",
"puts \"Information Gain after gender split using Gini = #{dataset_gini - gender_split_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "d721215f-7c99-4aae-ab13-fc2ce7dad9a5"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after pclass split using Entropy = 0.08383104529601149\n",
"Information Gain after pclass split using Gini = 0.05462157677138346\n"
]
}
],
"source": [
"survived_pclass_1 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '1' }.count\n",
"survived_pclass_2 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '2' }.count\n",
"survived_pclass_3 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '3' }.count\n",
"\n",
"died_pclass_1 = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '1' }.count\n",
"died_pclass_2 = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '2' }.count\n",
"died_pclass_3 = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '3' }.count\n",
"\n",
"pclass_split = [\n",
" [survived_pclass_1,died_pclass_1],\n",
" [survived_pclass_2,died_pclass_2],\n",
" [survived_pclass_3,died_pclass_3]\n",
"]\n",
"\n",
"pclass_split_entropy = purity(pclass_split) { |ps| entropy(ps) }\n",
"pclass_split_gini = purity(pclass_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after pclass split using Entropy = #{dataset_entropy - pclass_split_entropy}\"\n",
"puts \"Information Gain after pclass split using Gini = #{dataset_gini - pclass_split_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "d49f3a4d-33cd-41e6-8e9d-4a32ef245246"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embarked types = [\"S\", \"C\", \"Q\", \"NA\"]\n",
"Information Gain after embarked split using Entropy = 0.024047090707960517\n",
"Information Gain after embarked split using Gini = 0.015751498294317823\n"
]
}
],
"source": [
"embarked_types = dataset.collect { |e| e['embarked'] }.uniq\n",
"puts \"Embarked types = #{embarked_types}\"\n",
"\n",
"survived_embarked_S = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'S' }.count\n",
"survived_embarked_C = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'C' }.count\n",
"survived_embarked_Q = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'Q' }.count\n",
"survived_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'NA' }.count\n",
"\n",
"died_embarked_S = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'S' }.count\n",
"died_embarked_C = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'C' }.count\n",
"died_embarked_Q = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'Q' }.count\n",
"died_embarked_NA = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'NA' }.count\n",
"\n",
"embarked_split = [\n",
" [survived_embarked_S, died_embarked_S],\n",
" [survived_embarked_C, died_embarked_C],\n",
" [survived_embarked_Q, died_embarked_Q],\n",
" [survived_embarked_NA, died_embarked_NA]\n",
"]\n",
"\n",
"embarked_split_entropy = purity(embarked_split) { |ps| entropy(ps) }\n",
"embarked_split_gini = purity(embarked_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after embarked split using Entropy = #{dataset_entropy - embarked_split_entropy}\"\n",
"puts \"Information Gain after embarked split using Gini = #{dataset_gini - embarked_split_gini}\""
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "eb9152cb-1134-42d1-93e0-aa1bbc423097"
}
},
"source": [
"**IG(gender_split) > IG(pclass_split) > IG(embarked_split)**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "40583111-b20f-4929-a49e-8b2c475e7d3f"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after male-embarked split using Entropy = 0.05942132128706379\n",
"Information Gain after male-embarked split using Gini = 0.033595808771653746\n"
]
}
],
"source": [
"survived_male_embarked_S = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'S' }.count\n",
"survived_male_embarked_C = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'C' }.count\n",
"survived_male_embarked_Q = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'Q' }.count\n",
"survived_male_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'NA' }.count\n",
"\n",
"died_male_embarked_S = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'S' }.count\n",
"died_male_embarked_C = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'C' }.count\n",
"died_male_embarked_Q = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'Q' }.count\n",
"died_male_embarked_NA = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'NA' }.count\n",
"\n",
"male_embarked_split = [\n",
" [survived_male_embarked_S, died_male_embarked_S],\n",
" [survived_male_embarked_C, died_male_embarked_C],\n",
" [survived_male_embarked_Q, died_male_embarked_Q],\n",
" [survived_male_embarked_NA, died_male_embarked_NA]\n",
"]\n",
"\n",
"male_embarked_split_entropy = purity(male_embarked_split) { |ps| entropy(ps) }\n",
"male_embarked_split_gini = purity(male_embarked_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after male-embarked split using Entropy = #{gender_split_entropy - male_embarked_split_entropy}\"\n",
"puts \"Information Gain after male-embarked split using Gini = #{gender_split_gini - male_embarked_split_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "95b5f7ff-b45d-4dbd-975c-7dcbca8f7442"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after male-pclass split using Entropy = 0.08056051348939974\n",
"Information Gain after male-pclass split using Gini = 0.04442316613547043\n"
]
}
],
"source": [
"survived_male_pclass_1 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '1' }.count\n",
"survived_male_pclass_2 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '2' }.count\n",
"survived_male_pclass_3 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '3' }.count\n",
"\n",
"died_male_pclass_1 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '1' }.count\n",
"died_male_pclass_2 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '2' }.count\n",
"died_male_pclass_3 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '3' }.count\n",
"\n",
"male_pclass_split = [\n",
" [survived_male_pclass_1,died_male_pclass_1],\n",
" [survived_male_pclass_2,died_male_pclass_2],\n",
" [survived_male_pclass_3,died_male_pclass_3]\n",
"]\n",
"\n",
"male_pclass_split_entropy = purity(male_pclass_split) { |ps| entropy(ps) }\n",
"male_pclass_split_gini = purity(male_pclass_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after male-pclass split using Entropy = #{gender_split_entropy - male_pclass_split_entropy}\"\n",
"puts \"Information Gain after male-pclass split using Gini = #{gender_split_gini - male_pclass_split_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "72cd60e6-d68c-4676-9e14-b2480668fa42"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after female-embarked split using Entropy = -0.052903841414234765\n",
"Information Gain after female-embarked split using Gini = -0.0366266354137858\n"
]
}
],
"source": [
"survived_female_embarked_S = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'S' }.count\n",
"survived_female_embarked_C = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'C' }.count\n",
"survived_female_embarked_Q = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'Q' }.count\n",
"survived_female_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'NA' }.count\n",
"\n",
"died_female_embarked_S = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'S' }.count\n",
"died_female_embarked_C = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'C' }.count\n",
"died_female_embarked_Q = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'Q' }.count\n",
"died_female_embarked_NA = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'NA' }.count\n",
"\n",
"female_embarked_split = [\n",
" [survived_female_embarked_S, died_female_embarked_S],\n",
" [survived_female_embarked_C, died_female_embarked_C],\n",
" [survived_female_embarked_Q, died_female_embarked_Q],\n",
" [survived_female_embarked_NA, died_female_embarked_NA]\n",
"]\n",
"\n",
"female_embarked_split_entropy = purity(female_embarked_split) { |ps| entropy(ps) }\n",
"female_embarked_split_gini = purity(female_embarked_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after female-embarked split using Entropy = #{gender_split_entropy - female_embarked_split_entropy}\"\n",
"puts \"Information Gain after female-embarked split using Gini = #{gender_split_gini - female_embarked_split_gini}\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"nbpresent": {
"id": "47bdbb1f-6df0-4030-b568-605093a81ad6"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information Gain after female-pclass split using Entropy = 0.12696481072875554\n",
"Information Gain after female-pclass split using Gini = 0.05036773218080309\n"
]
}
],
"source": [
"survived_female_pclass_1 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '1' }.count\n",
"survived_female_pclass_2 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '2' }.count\n",
"survived_female_pclass_3 = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '3' }.count\n",
"\n",
"died_female_pclass_1 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '1' }.count\n",
"died_female_pclass_2 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '2' }.count\n",
"died_female_pclass_3 = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '3' }.count\n",
"\n",
"female_pclass_split = [\n",
" [survived_female_pclass_1,died_female_pclass_1],\n",
" [survived_female_pclass_2,died_female_pclass_2],\n",
" [survived_female_pclass_3,died_female_pclass_3]\n",
"]\n",
"\n",
"female_pclass_split_entropy = purity(female_pclass_split) { |ps| entropy(ps) }\n",
"female_pclass_split_gini = purity(female_pclass_split) { |ps| gini(ps) }\n",
"\n",
"puts \"Information Gain after female-pclass split using Entropy = #{gender_split_entropy - female_pclass_split_entropy}\"\n",
"puts \"Information Gain after female-pclass split using Gini = #{gender_split_gini - female_pclass_split_gini}\""
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Ruby 2.3.0",
"language": "ruby",
"name": "ruby"
},
"language_info": {
"file_extension": ".rb",
"mimetype": "application/x-ruby",
"name": "ruby",
"version": "2.3.0"
},
"nbpresent": {
"slides": {},
"themes": {
"default": "386e519f-a510-4ddd-a686-393bd47e3404",
"theme": {}
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment