Skip to content

Instantly share code, notes, and snippets.

Created October 12, 2015 07:18
Show Gist options
  • Save willard-yuan/c827db48b9c414716b90 to your computer and use it in GitHub Desktop.
Save willard-yuan/c827db48b9c414716b90 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
"cells": [
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
"outputs": [
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>class</th>\n",
" <th>petal_length</th>\n",
" <th>petal_width</th>\n",
" <th>sepal_length</th>\n",
" <th>sepal_width</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.5</td>\n",
" <td>1.8</td>\n",
" <td>6.4</td>\n",
" <td>3.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.9</td>\n",
" <td>2.3</td>\n",
" <td>6.8</td>\n",
" <td>3.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.4</td>\n",
" <td>2.3</td>\n",
" <td>6.2</td>\n",
" <td>3.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Iris-virginica</td>\n",
" <td>4.8</td>\n",
" <td>1.8</td>\n",
" <td>6.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.1</td>\n",
" <td>2.3</td>\n",
" <td>6.9</td>\n",
" <td>3.1</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain": [
" class petal_length petal_width sepal_length sepal_width\n",
"0 Iris-virginica 5.5 1.8 6.4 3.1\n",
"1 Iris-virginica 5.9 2.3 6.8 3.2\n",
"2 Iris-virginica 5.4 2.3 6.2 3.4\n",
"3 Iris-virginica 4.8 1.8 6.0 3.0\n",
"4 Iris-virginica 5.1 2.3 6.9 3.1"
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
"source": [
"# First let's import the dataset, using Pandas.\n",
"import pandas as pd\n",
"train = pd.read_csv(\"train.csv\") # make sure you're in the right directory if using iPython!\n",
"test = pd.read_csv(\"test.csv\") \n",
"train.head() # ignore the first column, it's how I split the data."
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
"outputs": [
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/alexwoods/Downloads/ipython-3.2.0/IPython/kernel/ DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n"
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"# however, are data has to be in a numpy array in order for the random forest algorithm to except it!\n",
"cols = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']\n",
"colsRes = ['class']\n",
"trainArr = train.as_matrix(cols) # training array\n",
"trainRes = train.as_matrix(colsRes) # training results\n",
"## Training!\n",
"rf = RandomForestClassifier(n_estimators=100) # 100 decision trees is a good enough number\n",
"rf.fit(trainArr, trainRes) # finally, we fit the data to the algorithm!!! :)\n",
"# note - you might get an warning saying you entered a 2 column vector..ignore it."
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
"outputs": [
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>class</th>\n",
" <th>petal_length</th>\n",
" <th>petal_width</th>\n",
" <th>sepal_length</th>\n",
" <th>sepal_width</th>\n",
" <th>predictions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Iris-virginica</td>\n",
" <td>6.6</td>\n",
" <td>2.1</td>\n",
" <td>7.6</td>\n",
" <td>3.0</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Iris-virginica</td>\n",
" <td>6.3</td>\n",
" <td>1.8</td>\n",
" <td>7.3</td>\n",
" <td>2.9</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.5</td>\n",
" <td>2.1</td>\n",
" <td>6.8</td>\n",
" <td>3.0</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.1</td>\n",
" <td>2.4</td>\n",
" <td>5.8</td>\n",
" <td>2.8</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Iris-virginica</td>\n",
" <td>5.3</td>\n",
" <td>2.3</td>\n",
" <td>6.4</td>\n",
" <td>3.2</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain": [
" class petal_length petal_width sepal_length sepal_width \\\n",
"0 Iris-virginica 6.6 2.1 7.6 3.0 \n",
"1 Iris-virginica 6.3 1.8 7.3 2.9 \n",
"2 Iris-virginica 5.5 2.1 6.8 3.0 \n",
"3 Iris-virginica 5.1 2.4 5.8 2.8 \n",
"4 Iris-virginica 5.3 2.3 6.4 3.2 \n",
" predictions \n",
"0 Iris-virginica \n",
"1 Iris-virginica \n",
"2 Iris-virginica \n",
"3 Iris-virginica \n",
"4 Iris-virginica "
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
"source": [
"## Testing!\n",
"# put the test results in the same format!\n",
"testArr = test.as_matrix(cols)\n",
"results = rf.predict(testArr)\n",
"# something I like to do is to add it back to the dataframe, so I can compare side-by-side\n",
"test['predictions'] = results\n",
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.1"
"nbformat": 4,
"nbformat_minor": 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment