Skip to content

Instantly share code, notes, and snippets.

@decisionstats
Created November 3, 2016 10:16
Show Gist options
  • Save decisionstats/0a752d23e94708c6ddbaea478ecd9a81 to your computer and use it in GitHub Desktop.
Save decisionstats/0a752d23e94708c6ddbaea478ecd9a81 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://s3.amazonaws.com/quandl-static-content/Documents/Quandl+-+Pandas,+SciPy,+NumPy+Cheat+Sheet.pdf  \n",
"Quandl cheat sheet for pandas, SciPy, and NumPy."
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import scipy as sp\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n",
" 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.arange(30) # arange returns an array of the integers from 0 up to (but not including) the given stop value"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"a1=np.arange(30)"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"435"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sum(a1) #sum of array"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 1, 2, 3, 4],\n",
" [ 5, 6, 7, 8, 9],\n",
" [10, 11, 12, 13, 14],\n",
" [15, 16, 17, 18, 19],\n",
" [20, 21, 22, 23, 24],\n",
" [25, 26, 27, 28, 29]])"
]
},
"execution_count": 147,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = a1.reshape(6,5) #reshapes array in rows,columns format\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[[ 0, 1, 2, 3, 4],\n",
" [ 5, 6, 7, 8, 9]],\n",
"\n",
" [[10, 11, 12, 13, 14],\n",
" [15, 16, 17, 18, 19]],\n",
"\n",
" [[20, 21, 22, 23, 24],\n",
" [25, 26, 27, 28, 29]]])"
]
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a1.reshape(3,2,5)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[[ 0, 10, 20],\n",
" [ 5, 15, 25]],\n",
"\n",
" [[ 1, 11, 21],\n",
" [ 6, 16, 26]],\n",
"\n",
" [[ 2, 12, 22],\n",
" [ 7, 17, 27]],\n",
"\n",
" [[ 3, 13, 23],\n",
" [ 8, 18, 28]],\n",
"\n",
" [[ 4, 14, 24],\n",
" [ 9, 19, 29]]])"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b = a1.reshape(3,2,5).swapaxes(0,2) # swap axes 0 and 2: shape (3, 2, 5) becomes (5, 2, 3)\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[[ 0, 10, 20],\n",
" [ 1, 11, 21],\n",
" [ 2, 12, 22],\n",
" [ 3, 13, 23],\n",
" [ 4, 14, 24]],\n",
"\n",
" [[ 5, 15, 25],\n",
" [ 6, 16, 26],\n",
" [ 7, 17, 27],\n",
" [ 8, 18, 28],\n",
" [ 9, 19, 29]]])"
]
},
"execution_count": 151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b.swapaxes(0,1)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"14.5"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.mean(a)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"8.6554414483991895"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std(a)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24\n",
" 25 26 27 28 29]\n",
"[0 1 2 3 4 5 6 7 8 9]\n"
]
}
],
"source": [
"f=np.arange(30)\n",
"g=np.arange(10)\n",
"print(f)\n",
"print(g)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html  \n",
"Convolution is a mathematical operation on two functions (f and g); it produces a third function that is typically viewed as a modified version of one of the original functions, giving the integral of the pointwise multiplication of the two functions as a function of the amount by which one of the original functions is translated. For finite sequences such as the arrays `f` and `g` above, `np.convolve` computes the discrete analogue, and its default `full` mode returns `len(f) + len(g) - 1` values (30 + 10 - 1 = 39 here)."
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0 0 1 4 10 20 35 56 84 120 165 210 255 300 345\n",
" 390 435 480 525 570 615 660 705 750 795 840 885 930 975 1020\n",
" 1065 1080 1064 1016 935 820 670 484 261]\n"
]
}
],
"source": [
"c=np.convolve(f,g)\n",
"print(c)"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(39,)"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://docs.scipy.org/doc/numpy/reference/generated/numpy.dot.html  \n",
"`np.dot(a, b)`: for 2-D arrays it is equivalent to matrix multiplication, and for 1-D arrays to the inner product of vectors (without complex conjugation). For N-dimensional arrays it is a sum product over the last axis of `a` and the second-to-last axis of `b`."
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n",
" 17, 18, 19],\n",
" [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,\n",
" 37, 38, 39],\n",
" [40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,\n",
" 57, 58, 59],\n",
" [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,\n",
" 77, 78, 79],\n",
" [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,\n",
" 97, 98, 99]])"
]
},
"execution_count": 179,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = np.arange(100).reshape((5,20))\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0 1 2 3 4 5 6 7 8 9]\n",
" [ 10 11 12 13 14 15 16 17 18 19]\n",
" [ 20 21 22 23 24 25 26 27 28 29]\n",
" [ 30 31 32 33 34 35 36 37 38 39]\n",
" [ 40 41 42 43 44 45 46 47 48 49]\n",
" [ 50 51 52 53 54 55 56 57 58 59]\n",
" [ 60 61 62 63 64 65 66 67 68 69]\n",
" [ 70 71 72 73 74 75 76 77 78 79]\n",
" [ 80 81 82 83 84 85 86 87 88 89]\n",
" [ 90 91 92 93 94 95 96 97 98 99]\n",
" [100 101 102 103 104 105 106 107 108 109]\n",
" [110 111 112 113 114 115 116 117 118 119]\n",
" [120 121 122 123 124 125 126 127 128 129]\n",
" [130 131 132 133 134 135 136 137 138 139]\n",
" [140 141 142 143 144 145 146 147 148 149]\n",
" [150 151 152 153 154 155 156 157 158 159]\n",
" [160 161 162 163 164 165 166 167 168 169]\n",
" [170 171 172 173 174 175 176 177 178 179]\n",
" [180 181 182 183 184 185 186 187 188 189]\n",
" [190 191 192 193 194 195 196 197 198 199]]\n"
]
}
],
"source": [
"b = np.arange(200).reshape(20,10)\n",
"print(b)"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 24700, 24890, 25080, 25270, 25460, 25650, 25840, 26030,\n",
" 26220, 26410],\n",
" [ 62700, 63290, 63880, 64470, 65060, 65650, 66240, 66830,\n",
" 67420, 68010],\n",
" [100700, 101690, 102680, 103670, 104660, 105650, 106640, 107630,\n",
" 108620, 109610],\n",
" [138700, 140090, 141480, 142870, 144260, 145650, 147040, 148430,\n",
" 149820, 151210],\n",
" [176700, 178490, 180280, 182070, 183860, 185650, 187440, 189230,\n",
" 191020, 192810]])"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.dot(a, b)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PANDAS"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load the ggplot2 diamonds dataset from the Rdatasets CSV mirror.\n",
"# The URL must not carry stray whitespace; the original literal ended in a space.\n",
"diamonds = pd.read_csv(\"https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.frame.DataFrame"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(diamonds)"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 0.23, 'Ideal', ..., 3.95, 3.98, 2.43],\n",
" [2, 0.21, 'Premium', ..., 3.89, 3.84, 2.31],\n",
" [3, 0.23, 'Good', ..., 4.05, 4.07, 2.31],\n",
" ..., \n",
" [53938, 0.7, 'Very Good', ..., 5.66, 5.68, 3.56],\n",
" [53939, 0.86, 'Premium', ..., 6.15, 6.12, 3.74],\n",
" [53940, 0.75, 'Ideal', ..., 5.83, 5.87, 3.64]], dtype=object)"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds.values"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=53940, step=1)"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds.index"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
" 'price', 'x', 'y', 'z'],\n",
" dtype='object')"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds.columns"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',\n",
" 'z'],\n",
" dtype='object')"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds.columns[1:]"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"caratseries=pd.Series(diamonds[\"carat\"],index=diamonds.index)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(caratseries)"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0.23\n",
"1 0.21\n",
"2 0.23\n",
"3 0.29\n",
"4 0.31\n",
"5 0.24\n",
"6 0.24\n",
"7 0.26\n",
"8 0.22\n",
"9 0.23\n",
"10 0.30\n",
"11 0.23\n",
"12 0.22\n",
"13 0.31\n",
"14 0.20\n",
"15 0.32\n",
"16 0.30\n",
"17 0.30\n",
"18 0.30\n",
"19 0.30\n",
"20 0.30\n",
"21 0.23\n",
"22 0.23\n",
"23 0.31\n",
"24 0.31\n",
"25 0.23\n",
"26 0.24\n",
"27 0.30\n",
"28 0.23\n",
"29 0.23\n",
" ... \n",
"53910 0.70\n",
"53911 0.57\n",
"53912 0.61\n",
"53913 0.80\n",
"53914 0.84\n",
"53915 0.77\n",
"53916 0.74\n",
"53917 0.90\n",
"53918 0.76\n",
"53919 0.76\n",
"53920 0.70\n",
"53921 0.70\n",
"53922 0.70\n",
"53923 0.73\n",
"53924 0.73\n",
"53925 0.79\n",
"53926 0.71\n",
"53927 0.79\n",
"53928 0.79\n",
"53929 0.71\n",
"53930 0.71\n",
"53931 0.71\n",
"53932 0.70\n",
"53933 0.70\n",
"53934 0.72\n",
"53935 0.72\n",
"53936 0.72\n",
"53937 0.70\n",
"53938 0.86\n",
"53939 0.75\n",
"Name: carat, dtype: float64"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"caratseries"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#p=pd.Panel(matrix, items=0, major_axis=1, minor_axis=2)"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 0.23\n",
"1 0.21\n",
"2 0.23\n",
"3 0.29\n",
"4 0.31\n",
"5 0.24\n",
"6 0.24\n",
"7 0.26\n",
"8 0.22\n",
"9 0.23\n",
"10 0.30\n",
"11 0.23\n",
"12 0.22\n",
"13 0.31\n",
"14 0.20\n",
"15 0.32\n",
"16 0.30\n",
"17 0.30\n",
"18 0.30\n",
"19 0.30\n",
"20 0.30\n",
"21 0.23\n",
"22 0.23\n",
"23 0.31\n",
"24 0.31\n",
"25 0.23\n",
"26 0.24\n",
"27 0.30\n",
"28 0.23\n",
"29 0.23\n",
" ... \n",
"53910 0.70\n",
"53911 0.57\n",
"53912 0.61\n",
"53913 0.80\n",
"53914 0.84\n",
"53915 0.77\n",
"53916 0.74\n",
"53917 0.90\n",
"53918 0.76\n",
"53919 0.76\n",
"53920 0.70\n",
"53921 0.70\n",
"53922 0.70\n",
"53923 0.73\n",
"53924 0.73\n",
"53925 0.79\n",
"53926 0.71\n",
"53927 0.79\n",
"53928 0.79\n",
"53929 0.71\n",
"53930 0.71\n",
"53931 0.71\n",
"53932 0.70\n",
"53933 0.70\n",
"53934 0.72\n",
"53935 0.72\n",
"53936 0.72\n",
"53937 0.70\n",
"53938 0.86\n",
"53939 0.75\n",
"Name: carat, dtype: float64"
]
},
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds['carat']"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0.\n",
"# Select every column except the first ('Unnamed: 0') and take .values instead,\n",
"# which yields the same NumPy array and matches the other cells in this notebook.\n",
"diamonds2 = diamonds[diamonds.columns[1:]].values"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"numpy.ndarray"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(diamonds2)"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.23, 'Ideal', 'E', ..., 3.95, 3.98, 2.43],\n",
" [0.21, 'Premium', 'E', ..., 3.89, 3.84, 2.31],\n",
" [0.23, 'Good', 'E', ..., 4.05, 4.07, 2.31],\n",
" ..., \n",
" [0.7, 'Very Good', 'D', ..., 5.66, 5.68, 3.56],\n",
" [0.86, 'Premium', 'H', ..., 6.15, 6.12, 3.74],\n",
" [0.75, 'Ideal', 'D', ..., 5.83, 5.87, 3.64]], dtype=object)"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds2"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(53940, 10)"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds2.shape"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"53940"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(diamonds2)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"diamonds3=pd.DataFrame(data=diamonds2, # values\n",
"                       index=diamonds.index, # reuse the original row index\n",
"                       columns=diamonds.columns[1:]) # every column label except the first ('Unnamed: 0')"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 53940 entries, 0 to 53939\n",
"Data columns (total 10 columns):\n",
"carat 53940 non-null object\n",
"cut 53940 non-null object\n",
"color 53940 non-null object\n",
"clarity 53940 non-null object\n",
"depth 53940 non-null object\n",
"table 53940 non-null object\n",
"price 53940 non-null object\n",
"x 53940 non-null object\n",
"y 53940 non-null object\n",
"z 53940 non-null object\n",
"dtypes: object(10)\n",
"memory usage: 4.1+ MB\n"
]
}
],
"source": [
"diamonds3.info()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"<class 'numpy.ndarray'>\n"
]
}
],
"source": [
"a=diamonds.iloc[:,1:]\n",
"b=diamonds.iloc[:,1:].values\n",
"\n",
"print(type(diamonds))\n",
"print(type(a))\n",
"print(type(b))"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=53940, step=1)"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.index"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',\n",
" 'z'],\n",
" dtype='object')"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.columns"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.23, 'Ideal', 'E', ..., 3.95, 3.98, 2.43],\n",
" [0.21, 'Premium', 'E', ..., 3.89, 3.84, 2.31],\n",
" [0.23, 'Good', 'E', ..., 4.05, 4.07, 2.31],\n",
" ..., \n",
" [0.7, 'Very Good', 'D', ..., 5.66, 5.68, 3.56],\n",
" [0.86, 'Premium', 'H', ..., 6.15, 6.12, 3.74],\n",
" [0.75, 'Ideal', 'D', ..., 5.83, 5.87, 3.64]], dtype=object)"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.array(a)"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"darray=np.array(a)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"diamonds4=pd.DataFrame(data=darray , # values\n",
"                       index=a.index, # reuse the row index of a\n",
"                       columns=a.columns) # reuse the column labels of a"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58</td>\n",
" <td>334</td>\n",
" <td>4.2</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"0 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43\n",
"1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31\n",
"2 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31\n",
"3 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63\n",
"4 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diamonds4.head()"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import datasets  # provides load_iris, used in the next cell"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"iris = datasets.load_iris()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sklearn.datasets.base.Bunch"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(iris)"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(iris)"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['__class__',\n",
" '__contains__',\n",
" '__delattr__',\n",
" '__delitem__',\n",
" '__dict__',\n",
" '__dir__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattr__',\n",
" '__getattribute__',\n",
" '__getitem__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__iter__',\n",
" '__le__',\n",
" '__len__',\n",
" '__lt__',\n",
" '__module__',\n",
" '__ne__',\n",
" '__new__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__setitem__',\n",
" '__setstate__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" 'clear',\n",
" 'copy',\n",
" 'fromkeys',\n",
" 'get',\n",
" 'items',\n",
" 'keys',\n",
" 'pop',\n",
" 'popitem',\n",
" 'setdefault',\n",
" 'update',\n",
" 'values']"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(iris)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_values(['Iris Plants Database\\n\\nNotes\\n-----\\nData Set Characteristics:\\n :Number of Instances: 150 (50 in each of three classes)\\n :Number of Attributes: 4 numeric, predictive attributes and the class\\n :Attribute Information:\\n - sepal length in cm\\n - sepal width in cm\\n - petal length in cm\\n - petal width in cm\\n - class:\\n - Iris-Setosa\\n - Iris-Versicolour\\n - Iris-Virginica\\n :Summary Statistics:\\n\\n ============== ==== ==== ======= ===== ====================\\n Min Max Mean SD Class Correlation\\n ============== ==== ==== ======= ===== ====================\\n sepal length: 4.3 7.9 5.84 0.83 0.7826\\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\\n ============== ==== ==== ======= ===== ====================\\n\\n :Missing Attribute Values: None\\n :Class Distribution: 33.3% for each of 3 classes.\\n :Creator: R.A. Fisher\\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\\n :Date: July, 1988\\n\\nThis is a copy of UCI ML iris datasets.\\nhttp://archive.ics.uci.edu/ml/datasets/Iris\\n\\nThe famous Iris database, first used by Sir R.A Fisher\\n\\nThis is perhaps the best known database to be found in the\\npattern recognition literature. Fisher\\'s paper is a classic in the field and\\nis referenced frequently to this day. (See Duda & Hart, for example.) The\\ndata set contains 3 classes of 50 instances each, where each class refers to a\\ntype of iris plant. One class is linearly separable from the other 2; the\\nlatter are NOT linearly separable from each other.\\n\\nReferences\\n----------\\n - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\\n Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\\n Mathematical Statistics\" (John Wiley, NY, 1950).\\n - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. 
See page 218.\\n - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\\n Structure and Classification Rule for Recognition in Partially Exposed\\n Environments\". IEEE Transactions on Pattern Analysis and Machine\\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\\n - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\\n on Information Theory, May 1972, 431-433.\\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\\n conceptual clustering system finds 3 classes in the data.\\n - Many, many more ...\\n', array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
" 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
" 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], array(['setosa', 'versicolor', 'virginica'], \n",
" dtype='<U10'), array([[ 5.1, 3.5, 1.4, 0.2],\n",
" [ 4.9, 3. , 1.4, 0.2],\n",
" [ 4.7, 3.2, 1.3, 0.2],\n",
" [ 4.6, 3.1, 1.5, 0.2],\n",
" [ 5. , 3.6, 1.4, 0.2],\n",
" [ 5.4, 3.9, 1.7, 0.4],\n",
" [ 4.6, 3.4, 1.4, 0.3],\n",
" [ 5. , 3.4, 1.5, 0.2],\n",
" [ 4.4, 2.9, 1.4, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 5.4, 3.7, 1.5, 0.2],\n",
" [ 4.8, 3.4, 1.6, 0.2],\n",
" [ 4.8, 3. , 1.4, 0.1],\n",
" [ 4.3, 3. , 1.1, 0.1],\n",
" [ 5.8, 4. , 1.2, 0.2],\n",
" [ 5.7, 4.4, 1.5, 0.4],\n",
" [ 5.4, 3.9, 1.3, 0.4],\n",
" [ 5.1, 3.5, 1.4, 0.3],\n",
" [ 5.7, 3.8, 1.7, 0.3],\n",
" [ 5.1, 3.8, 1.5, 0.3],\n",
" [ 5.4, 3.4, 1.7, 0.2],\n",
" [ 5.1, 3.7, 1.5, 0.4],\n",
" [ 4.6, 3.6, 1. , 0.2],\n",
" [ 5.1, 3.3, 1.7, 0.5],\n",
" [ 4.8, 3.4, 1.9, 0.2],\n",
" [ 5. , 3. , 1.6, 0.2],\n",
" [ 5. , 3.4, 1.6, 0.4],\n",
" [ 5.2, 3.5, 1.5, 0.2],\n",
" [ 5.2, 3.4, 1.4, 0.2],\n",
" [ 4.7, 3.2, 1.6, 0.2],\n",
" [ 4.8, 3.1, 1.6, 0.2],\n",
" [ 5.4, 3.4, 1.5, 0.4],\n",
" [ 5.2, 4.1, 1.5, 0.1],\n",
" [ 5.5, 4.2, 1.4, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 5. , 3.2, 1.2, 0.2],\n",
" [ 5.5, 3.5, 1.3, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 4.4, 3. , 1.3, 0.2],\n",
" [ 5.1, 3.4, 1.5, 0.2],\n",
" [ 5. , 3.5, 1.3, 0.3],\n",
" [ 4.5, 2.3, 1.3, 0.3],\n",
" [ 4.4, 3.2, 1.3, 0.2],\n",
" [ 5. , 3.5, 1.6, 0.6],\n",
" [ 5.1, 3.8, 1.9, 0.4],\n",
" [ 4.8, 3. , 1.4, 0.3],\n",
" [ 5.1, 3.8, 1.6, 0.2],\n",
" [ 4.6, 3.2, 1.4, 0.2],\n",
" [ 5.3, 3.7, 1.5, 0.2],\n",
" [ 5. , 3.3, 1.4, 0.2],\n",
" [ 7. , 3.2, 4.7, 1.4],\n",
" [ 6.4, 3.2, 4.5, 1.5],\n",
" [ 6.9, 3.1, 4.9, 1.5],\n",
" [ 5.5, 2.3, 4. , 1.3],\n",
" [ 6.5, 2.8, 4.6, 1.5],\n",
" [ 5.7, 2.8, 4.5, 1.3],\n",
" [ 6.3, 3.3, 4.7, 1.6],\n",
" [ 4.9, 2.4, 3.3, 1. ],\n",
" [ 6.6, 2.9, 4.6, 1.3],\n",
" [ 5.2, 2.7, 3.9, 1.4],\n",
" [ 5. , 2. , 3.5, 1. ],\n",
" [ 5.9, 3. , 4.2, 1.5],\n",
" [ 6. , 2.2, 4. , 1. ],\n",
" [ 6.1, 2.9, 4.7, 1.4],\n",
" [ 5.6, 2.9, 3.6, 1.3],\n",
" [ 6.7, 3.1, 4.4, 1.4],\n",
" [ 5.6, 3. , 4.5, 1.5],\n",
" [ 5.8, 2.7, 4.1, 1. ],\n",
" [ 6.2, 2.2, 4.5, 1.5],\n",
" [ 5.6, 2.5, 3.9, 1.1],\n",
" [ 5.9, 3.2, 4.8, 1.8],\n",
" [ 6.1, 2.8, 4. , 1.3],\n",
" [ 6.3, 2.5, 4.9, 1.5],\n",
" [ 6.1, 2.8, 4.7, 1.2],\n",
" [ 6.4, 2.9, 4.3, 1.3],\n",
" [ 6.6, 3. , 4.4, 1.4],\n",
" [ 6.8, 2.8, 4.8, 1.4],\n",
" [ 6.7, 3. , 5. , 1.7],\n",
" [ 6. , 2.9, 4.5, 1.5],\n",
" [ 5.7, 2.6, 3.5, 1. ],\n",
" [ 5.5, 2.4, 3.8, 1.1],\n",
" [ 5.5, 2.4, 3.7, 1. ],\n",
" [ 5.8, 2.7, 3.9, 1.2],\n",
" [ 6. , 2.7, 5.1, 1.6],\n",
" [ 5.4, 3. , 4.5, 1.5],\n",
" [ 6. , 3.4, 4.5, 1.6],\n",
" [ 6.7, 3.1, 4.7, 1.5],\n",
" [ 6.3, 2.3, 4.4, 1.3],\n",
" [ 5.6, 3. , 4.1, 1.3],\n",
" [ 5.5, 2.5, 4. , 1.3],\n",
" [ 5.5, 2.6, 4.4, 1.2],\n",
" [ 6.1, 3. , 4.6, 1.4],\n",
" [ 5.8, 2.6, 4. , 1.2],\n",
" [ 5. , 2.3, 3.3, 1. ],\n",
" [ 5.6, 2.7, 4.2, 1.3],\n",
" [ 5.7, 3. , 4.2, 1.2],\n",
" [ 5.7, 2.9, 4.2, 1.3],\n",
" [ 6.2, 2.9, 4.3, 1.3],\n",
" [ 5.1, 2.5, 3. , 1.1],\n",
" [ 5.7, 2.8, 4.1, 1.3],\n",
" [ 6.3, 3.3, 6. , 2.5],\n",
" [ 5.8, 2.7, 5.1, 1.9],\n",
" [ 7.1, 3. , 5.9, 2.1],\n",
" [ 6.3, 2.9, 5.6, 1.8],\n",
" [ 6.5, 3. , 5.8, 2.2],\n",
" [ 7.6, 3. , 6.6, 2.1],\n",
" [ 4.9, 2.5, 4.5, 1.7],\n",
" [ 7.3, 2.9, 6.3, 1.8],\n",
" [ 6.7, 2.5, 5.8, 1.8],\n",
" [ 7.2, 3.6, 6.1, 2.5],\n",
" [ 6.5, 3.2, 5.1, 2. ],\n",
" [ 6.4, 2.7, 5.3, 1.9],\n",
" [ 6.8, 3. , 5.5, 2.1],\n",
" [ 5.7, 2.5, 5. , 2. ],\n",
" [ 5.8, 2.8, 5.1, 2.4],\n",
" [ 6.4, 3.2, 5.3, 2.3],\n",
" [ 6.5, 3. , 5.5, 1.8],\n",
" [ 7.7, 3.8, 6.7, 2.2],\n",
" [ 7.7, 2.6, 6.9, 2.3],\n",
" [ 6. , 2.2, 5. , 1.5],\n",
" [ 6.9, 3.2, 5.7, 2.3],\n",
" [ 5.6, 2.8, 4.9, 2. ],\n",
" [ 7.7, 2.8, 6.7, 2. ],\n",
" [ 6.3, 2.7, 4.9, 1.8],\n",
" [ 6.7, 3.3, 5.7, 2.1],\n",
" [ 7.2, 3.2, 6. , 1.8],\n",
" [ 6.2, 2.8, 4.8, 1.8],\n",
" [ 6.1, 3. , 4.9, 1.8],\n",
" [ 6.4, 2.8, 5.6, 2.1],\n",
" [ 7.2, 3. , 5.8, 1.6],\n",
" [ 7.4, 2.8, 6.1, 1.9],\n",
" [ 7.9, 3.8, 6.4, 2. ],\n",
" [ 6.4, 2.8, 5.6, 2.2],\n",
" [ 6.3, 2.8, 5.1, 1.5],\n",
" [ 6.1, 2.6, 5.6, 1.4],\n",
" [ 7.7, 3. , 6.1, 2.3],\n",
" [ 6.3, 3.4, 5.6, 2.4],\n",
" [ 6.4, 3.1, 5.5, 1.8],\n",
" [ 6. , 3. , 4.8, 1.8],\n",
" [ 6.9, 3.1, 5.4, 2.1],\n",
" [ 6.7, 3.1, 5.6, 2.4],\n",
" [ 6.9, 3.1, 5.1, 2.3],\n",
" [ 5.8, 2.7, 5.1, 1.9],\n",
" [ 6.8, 3.2, 5.9, 2.3],\n",
" [ 6.7, 3.3, 5.7, 2.5],\n",
" [ 6.7, 3. , 5.2, 2.3],\n",
" [ 6.3, 2.5, 5. , 1.9],\n",
" [ 6.5, 3. , 5.2, 2. ],\n",
" [ 6.2, 3.4, 5.4, 2.3],\n",
" [ 5.9, 3. , 5.1, 1.8]])])"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.values()"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_items([('DESCR', 'Iris Plants Database\\n\\nNotes\\n-----\\nData Set Characteristics:\\n :Number of Instances: 150 (50 in each of three classes)\\n :Number of Attributes: 4 numeric, predictive attributes and the class\\n :Attribute Information:\\n - sepal length in cm\\n - sepal width in cm\\n - petal length in cm\\n - petal width in cm\\n - class:\\n - Iris-Setosa\\n - Iris-Versicolour\\n - Iris-Virginica\\n :Summary Statistics:\\n\\n ============== ==== ==== ======= ===== ====================\\n Min Max Mean SD Class Correlation\\n ============== ==== ==== ======= ===== ====================\\n sepal length: 4.3 7.9 5.84 0.83 0.7826\\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\\n ============== ==== ==== ======= ===== ====================\\n\\n :Missing Attribute Values: None\\n :Class Distribution: 33.3% for each of 3 classes.\\n :Creator: R.A. Fisher\\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\\n :Date: July, 1988\\n\\nThis is a copy of UCI ML iris datasets.\\nhttp://archive.ics.uci.edu/ml/datasets/Iris\\n\\nThe famous Iris database, first used by Sir R.A Fisher\\n\\nThis is perhaps the best known database to be found in the\\npattern recognition literature. Fisher\\'s paper is a classic in the field and\\nis referenced frequently to this day. (See Duda & Hart, for example.) The\\ndata set contains 3 classes of 50 instances each, where each class refers to a\\ntype of iris plant. One class is linearly separable from the other 2; the\\nlatter are NOT linearly separable from each other.\\n\\nReferences\\n----------\\n - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\\n Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\\n Mathematical Statistics\" (John Wiley, NY, 1950).\\n - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. 
See page 218.\\n - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\\n Structure and Classification Rule for Recognition in Partially Exposed\\n Environments\". IEEE Transactions on Pattern Analysis and Machine\\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\\n - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\\n on Information Theory, May 1972, 431-433.\\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\\n conceptual clustering system finds 3 classes in the data.\\n - Many, many more ...\\n'), ('target', array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
" 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
" 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])), ('feature_names', ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']), ('target_names', array(['setosa', 'versicolor', 'virginica'], \n",
" dtype='<U10')), ('data', array([[ 5.1, 3.5, 1.4, 0.2],\n",
" [ 4.9, 3. , 1.4, 0.2],\n",
" [ 4.7, 3.2, 1.3, 0.2],\n",
" [ 4.6, 3.1, 1.5, 0.2],\n",
" [ 5. , 3.6, 1.4, 0.2],\n",
" [ 5.4, 3.9, 1.7, 0.4],\n",
" [ 4.6, 3.4, 1.4, 0.3],\n",
" [ 5. , 3.4, 1.5, 0.2],\n",
" [ 4.4, 2.9, 1.4, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 5.4, 3.7, 1.5, 0.2],\n",
" [ 4.8, 3.4, 1.6, 0.2],\n",
" [ 4.8, 3. , 1.4, 0.1],\n",
" [ 4.3, 3. , 1.1, 0.1],\n",
" [ 5.8, 4. , 1.2, 0.2],\n",
" [ 5.7, 4.4, 1.5, 0.4],\n",
" [ 5.4, 3.9, 1.3, 0.4],\n",
" [ 5.1, 3.5, 1.4, 0.3],\n",
" [ 5.7, 3.8, 1.7, 0.3],\n",
" [ 5.1, 3.8, 1.5, 0.3],\n",
" [ 5.4, 3.4, 1.7, 0.2],\n",
" [ 5.1, 3.7, 1.5, 0.4],\n",
" [ 4.6, 3.6, 1. , 0.2],\n",
" [ 5.1, 3.3, 1.7, 0.5],\n",
" [ 4.8, 3.4, 1.9, 0.2],\n",
" [ 5. , 3. , 1.6, 0.2],\n",
" [ 5. , 3.4, 1.6, 0.4],\n",
" [ 5.2, 3.5, 1.5, 0.2],\n",
" [ 5.2, 3.4, 1.4, 0.2],\n",
" [ 4.7, 3.2, 1.6, 0.2],\n",
" [ 4.8, 3.1, 1.6, 0.2],\n",
" [ 5.4, 3.4, 1.5, 0.4],\n",
" [ 5.2, 4.1, 1.5, 0.1],\n",
" [ 5.5, 4.2, 1.4, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 5. , 3.2, 1.2, 0.2],\n",
" [ 5.5, 3.5, 1.3, 0.2],\n",
" [ 4.9, 3.1, 1.5, 0.1],\n",
" [ 4.4, 3. , 1.3, 0.2],\n",
" [ 5.1, 3.4, 1.5, 0.2],\n",
" [ 5. , 3.5, 1.3, 0.3],\n",
" [ 4.5, 2.3, 1.3, 0.3],\n",
" [ 4.4, 3.2, 1.3, 0.2],\n",
" [ 5. , 3.5, 1.6, 0.6],\n",
" [ 5.1, 3.8, 1.9, 0.4],\n",
" [ 4.8, 3. , 1.4, 0.3],\n",
" [ 5.1, 3.8, 1.6, 0.2],\n",
" [ 4.6, 3.2, 1.4, 0.2],\n",
" [ 5.3, 3.7, 1.5, 0.2],\n",
" [ 5. , 3.3, 1.4, 0.2],\n",
" [ 7. , 3.2, 4.7, 1.4],\n",
" [ 6.4, 3.2, 4.5, 1.5],\n",
" [ 6.9, 3.1, 4.9, 1.5],\n",
" [ 5.5, 2.3, 4. , 1.3],\n",
" [ 6.5, 2.8, 4.6, 1.5],\n",
" [ 5.7, 2.8, 4.5, 1.3],\n",
" [ 6.3, 3.3, 4.7, 1.6],\n",
" [ 4.9, 2.4, 3.3, 1. ],\n",
" [ 6.6, 2.9, 4.6, 1.3],\n",
" [ 5.2, 2.7, 3.9, 1.4],\n",
" [ 5. , 2. , 3.5, 1. ],\n",
" [ 5.9, 3. , 4.2, 1.5],\n",
" [ 6. , 2.2, 4. , 1. ],\n",
" [ 6.1, 2.9, 4.7, 1.4],\n",
" [ 5.6, 2.9, 3.6, 1.3],\n",
" [ 6.7, 3.1, 4.4, 1.4],\n",
" [ 5.6, 3. , 4.5, 1.5],\n",
" [ 5.8, 2.7, 4.1, 1. ],\n",
" [ 6.2, 2.2, 4.5, 1.5],\n",
" [ 5.6, 2.5, 3.9, 1.1],\n",
" [ 5.9, 3.2, 4.8, 1.8],\n",
" [ 6.1, 2.8, 4. , 1.3],\n",
" [ 6.3, 2.5, 4.9, 1.5],\n",
" [ 6.1, 2.8, 4.7, 1.2],\n",
" [ 6.4, 2.9, 4.3, 1.3],\n",
" [ 6.6, 3. , 4.4, 1.4],\n",
" [ 6.8, 2.8, 4.8, 1.4],\n",
" [ 6.7, 3. , 5. , 1.7],\n",
" [ 6. , 2.9, 4.5, 1.5],\n",
" [ 5.7, 2.6, 3.5, 1. ],\n",
" [ 5.5, 2.4, 3.8, 1.1],\n",
" [ 5.5, 2.4, 3.7, 1. ],\n",
" [ 5.8, 2.7, 3.9, 1.2],\n",
" [ 6. , 2.7, 5.1, 1.6],\n",
" [ 5.4, 3. , 4.5, 1.5],\n",
" [ 6. , 3.4, 4.5, 1.6],\n",
" [ 6.7, 3.1, 4.7, 1.5],\n",
" [ 6.3, 2.3, 4.4, 1.3],\n",
" [ 5.6, 3. , 4.1, 1.3],\n",
" [ 5.5, 2.5, 4. , 1.3],\n",
" [ 5.5, 2.6, 4.4, 1.2],\n",
" [ 6.1, 3. , 4.6, 1.4],\n",
" [ 5.8, 2.6, 4. , 1.2],\n",
" [ 5. , 2.3, 3.3, 1. ],\n",
" [ 5.6, 2.7, 4.2, 1.3],\n",
" [ 5.7, 3. , 4.2, 1.2],\n",
" [ 5.7, 2.9, 4.2, 1.3],\n",
" [ 6.2, 2.9, 4.3, 1.3],\n",
" [ 5.1, 2.5, 3. , 1.1],\n",
" [ 5.7, 2.8, 4.1, 1.3],\n",
" [ 6.3, 3.3, 6. , 2.5],\n",
" [ 5.8, 2.7, 5.1, 1.9],\n",
" [ 7.1, 3. , 5.9, 2.1],\n",
" [ 6.3, 2.9, 5.6, 1.8],\n",
" [ 6.5, 3. , 5.8, 2.2],\n",
" [ 7.6, 3. , 6.6, 2.1],\n",
" [ 4.9, 2.5, 4.5, 1.7],\n",
" [ 7.3, 2.9, 6.3, 1.8],\n",
" [ 6.7, 2.5, 5.8, 1.8],\n",
" [ 7.2, 3.6, 6.1, 2.5],\n",
" [ 6.5, 3.2, 5.1, 2. ],\n",
" [ 6.4, 2.7, 5.3, 1.9],\n",
" [ 6.8, 3. , 5.5, 2.1],\n",
" [ 5.7, 2.5, 5. , 2. ],\n",
" [ 5.8, 2.8, 5.1, 2.4],\n",
" [ 6.4, 3.2, 5.3, 2.3],\n",
" [ 6.5, 3. , 5.5, 1.8],\n",
" [ 7.7, 3.8, 6.7, 2.2],\n",
" [ 7.7, 2.6, 6.9, 2.3],\n",
" [ 6. , 2.2, 5. , 1.5],\n",
" [ 6.9, 3.2, 5.7, 2.3],\n",
" [ 5.6, 2.8, 4.9, 2. ],\n",
" [ 7.7, 2.8, 6.7, 2. ],\n",
" [ 6.3, 2.7, 4.9, 1.8],\n",
" [ 6.7, 3.3, 5.7, 2.1],\n",
" [ 7.2, 3.2, 6. , 1.8],\n",
" [ 6.2, 2.8, 4.8, 1.8],\n",
" [ 6.1, 3. , 4.9, 1.8],\n",
" [ 6.4, 2.8, 5.6, 2.1],\n",
" [ 7.2, 3. , 5.8, 1.6],\n",
" [ 7.4, 2.8, 6.1, 1.9],\n",
" [ 7.9, 3.8, 6.4, 2. ],\n",
" [ 6.4, 2.8, 5.6, 2.2],\n",
" [ 6.3, 2.8, 5.1, 1.5],\n",
" [ 6.1, 2.6, 5.6, 1.4],\n",
" [ 7.7, 3. , 6.1, 2.3],\n",
" [ 6.3, 3.4, 5.6, 2.4],\n",
" [ 6.4, 3.1, 5.5, 1.8],\n",
" [ 6. , 3. , 4.8, 1.8],\n",
" [ 6.9, 3.1, 5.4, 2.1],\n",
" [ 6.7, 3.1, 5.6, 2.4],\n",
" [ 6.9, 3.1, 5.1, 2.3],\n",
" [ 5.8, 2.7, 5.1, 1.9],\n",
" [ 6.8, 3.2, 5.9, 2.3],\n",
" [ 6.7, 3.3, 5.7, 2.5],\n",
" [ 6.7, 3. , 5.2, 2.3],\n",
" [ 6.3, 2.5, 5. , 1.9],\n",
" [ 6.5, 3. , 5.2, 2. ],\n",
" [ 6.2, 3.4, 5.4, 2.3],\n",
" [ 5.9, 3. , 5.1, 1.8]]))])"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.items()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# See also: the Pandas and Heterogeneous Data Modeling notebook from ogrisel's parallel_ml_tutorial: http://nbviewer.jupyter.org/github/ogrisel/parallel_ml_tutorial/blob/master/rendered_notebooks/04%20-%20Pandas%20and%20Heterogeneous%20Data%20Modeling.ipynb"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment