Last active
April 30, 2018 07:18
-
-
Save drmingle/e1132803170d4a22244ed70d93f048c0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"---\n", | |
"title: \"Make Simulated Data For Classification\"\n", | |
"author: \"Damian Mingle\"\n", | |
"date: 04/30/2018\n", | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preliminaries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.datasets import make_classification\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create Simulated Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create a simulated feature matrix with output vector using 1000 samples,\n", | |
"features, output = make_classification(n_samples = 1000,\n", | |
" # eight features\n", | |
" n_features = 8,\n", | |
" # five features that actually predict the output's classes\n", | |
" n_informative = 5,\n", | |
" # three redundant features (random linear combinations of the informative features)\n", | |
" n_redundant = 3,\n", | |
" # four classes output\n", | |
" n_classes = 4,\n", | |
" # with 15% of observations in the 1st class, 5% in the 2nd class, \n", | |
" # 30% in the 3rd class, and 50% in the 4th class. \n", | |
" weights = [.15,.05, .3, .5])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## View Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>-1.329618</td>\n", | |
" <td>0.177556</td>\n", | |
" <td>-1.006781</td>\n", | |
" <td>1.066200</td>\n", | |
" <td>2.693715</td>\n", | |
" <td>1.771836</td>\n", | |
" <td>1.409192</td>\n", | |
" <td>0.735523</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>-0.044946</td>\n", | |
" <td>-0.744279</td>\n", | |
" <td>0.759978</td>\n", | |
" <td>-0.399401</td>\n", | |
" <td>0.488662</td>\n", | |
" <td>-0.039110</td>\n", | |
" <td>-0.329099</td>\n", | |
" <td>0.556787</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>-0.969531</td>\n", | |
" <td>-0.348288</td>\n", | |
" <td>-0.080486</td>\n", | |
" <td>1.121009</td>\n", | |
" <td>2.568738</td>\n", | |
" <td>1.786202</td>\n", | |
" <td>1.220312</td>\n", | |
" <td>1.432734</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.422715</td>\n", | |
" <td>-0.389804</td>\n", | |
" <td>0.795004</td>\n", | |
" <td>-0.680532</td>\n", | |
" <td>1.211953</td>\n", | |
" <td>0.651004</td>\n", | |
" <td>0.376306</td>\n", | |
" <td>-0.318542</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.394632</td>\n", | |
" <td>-0.091617</td>\n", | |
" <td>0.364400</td>\n", | |
" <td>-0.663502</td>\n", | |
" <td>-0.141405</td>\n", | |
" <td>-0.111323</td>\n", | |
" <td>0.170971</td>\n", | |
" <td>0.519450</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3 4 5 6 \\\n", | |
"0 -1.329618 0.177556 -1.006781 1.066200 2.693715 1.771836 1.409192 \n", | |
"1 -0.044946 -0.744279 0.759978 -0.399401 0.488662 -0.039110 -0.329099 \n", | |
"2 -0.969531 -0.348288 -0.080486 1.121009 2.568738 1.786202 1.220312 \n", | |
"3 0.422715 -0.389804 0.795004 -0.680532 1.211953 0.651004 0.376306 \n", | |
"4 0.394632 -0.091617 0.364400 -0.663502 -0.141405 -0.111323 0.170971 \n", | |
"\n", | |
" 7 \n", | |
"0 0.735523 \n", | |
"1 0.556787 \n", | |
"2 1.432734 \n", | |
"3 -0.318542 \n", | |
"4 0.519450 " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# View the first five observations and their 8 features\n", | |
"pd.DataFrame(features).head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0\n", | |
"0 2\n", | |
"1 2\n", | |
"2 3\n", | |
"3 2\n", | |
"4 2" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# View the first five observations' classes\n", | |
"pd.DataFrame(output).head()" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment