Created
July 9, 2015 12:20
-
-
Save fccoelho/2f13f0624cd75a1a255a to your computer and use it in GitHub Desktop.
PySpark Logistic regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<pyspark.context.SparkContext at 0x7f3c7c2e52e8>" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sc" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.mllib.regression import LabeledPoint\n", | |
"from pyspark.mllib.classification import LogisticRegressionWithSGD\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" <th>8</th>\n", | |
" <th>9</th>\n", | |
" <th>10</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>-4.19290</td>\n", | |
" <td>2.517100</td>\n", | |
" <td>0.473780</td>\n", | |
" <td>-0.628430</td>\n", | |
" <td>-2.687400</td>\n", | |
" <td>0.28546</td>\n", | |
" <td>0.191070</td>\n", | |
" <td>-0.127260</td>\n", | |
" <td>0.630760</td>\n", | |
" <td>-1.673800</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>-6.68060</td>\n", | |
" <td>2.969700</td>\n", | |
" <td>0.990980</td>\n", | |
" <td>2.466200</td>\n", | |
" <td>2.115500</td>\n", | |
" <td>-0.27363</td>\n", | |
" <td>-0.371340</td>\n", | |
" <td>-0.212700</td>\n", | |
" <td>1.974800</td>\n", | |
" <td>1.557300</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>-0.25967</td>\n", | |
" <td>0.423830</td>\n", | |
" <td>0.089746</td>\n", | |
" <td>0.103350</td>\n", | |
" <td>-0.210260</td>\n", | |
" <td>-0.31418</td>\n", | |
" <td>0.092722</td>\n", | |
" <td>0.262780</td>\n", | |
" <td>0.141460</td>\n", | |
" <td>0.009772</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>-1.15640</td>\n", | |
" <td>0.848080</td>\n", | |
" <td>-0.259460</td>\n", | |
" <td>-0.411420</td>\n", | |
" <td>-1.437400</td>\n", | |
" <td>-0.43879</td>\n", | |
" <td>0.607720</td>\n", | |
" <td>0.121940</td>\n", | |
" <td>0.441890</td>\n", | |
" <td>0.005098</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>-0.66666</td>\n", | |
" <td>0.214390</td>\n", | |
" <td>0.041861</td>\n", | |
" <td>-0.129960</td>\n", | |
" <td>-0.363050</td>\n", | |
" <td>-0.39517</td>\n", | |
" <td>0.053495</td>\n", | |
" <td>0.076691</td>\n", | |
" <td>0.398830</td>\n", | |
" <td>-0.373070</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>-0.17517</td>\n", | |
" <td>0.043968</td>\n", | |
" <td>-0.316620</td>\n", | |
" <td>-0.042977</td>\n", | |
" <td>0.122490</td>\n", | |
" <td>-0.28860</td>\n", | |
" <td>-0.040541</td>\n", | |
" <td>0.000970</td>\n", | |
" <td>-0.246240</td>\n", | |
" <td>-0.068102</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>-1.34120</td>\n", | |
" <td>-1.162900</td>\n", | |
" <td>-0.029398</td>\n", | |
" <td>-0.135130</td>\n", | |
" <td>0.497580</td>\n", | |
" <td>-1.47260</td>\n", | |
" <td>-0.007152</td>\n", | |
" <td>-0.220570</td>\n", | |
" <td>0.101510</td>\n", | |
" <td>0.074106</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>-0.66152</td>\n", | |
" <td>0.849280</td>\n", | |
" <td>0.281970</td>\n", | |
" <td>0.030402</td>\n", | |
" <td>-0.294750</td>\n", | |
" <td>-0.43598</td>\n", | |
" <td>0.037590</td>\n", | |
" <td>0.289210</td>\n", | |
" <td>0.316870</td>\n", | |
" <td>-0.276840</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>-1.74920</td>\n", | |
" <td>1.705500</td>\n", | |
" <td>0.545230</td>\n", | |
" <td>0.648500</td>\n", | |
" <td>-0.044507</td>\n", | |
" <td>-0.95528</td>\n", | |
" <td>-1.418200</td>\n", | |
" <td>0.743170</td>\n", | |
" <td>-0.007359</td>\n", | |
" <td>-0.074117</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>-1.04130</td>\n", | |
" <td>0.246810</td>\n", | |
" <td>0.140320</td>\n", | |
" <td>-0.858140</td>\n", | |
" <td>0.474890</td>\n", | |
" <td>-0.45504</td>\n", | |
" <td>0.168170</td>\n", | |
" <td>-0.232990</td>\n", | |
" <td>0.061892</td>\n", | |
" <td>-0.133630</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3 4 5 6 \\\n", | |
"0 -4.19290 2.517100 0.473780 -0.628430 -2.687400 0.28546 0.191070 \n", | |
"1 -6.68060 2.969700 0.990980 2.466200 2.115500 -0.27363 -0.371340 \n", | |
"2 -0.25967 0.423830 0.089746 0.103350 -0.210260 -0.31418 0.092722 \n", | |
"3 -1.15640 0.848080 -0.259460 -0.411420 -1.437400 -0.43879 0.607720 \n", | |
"4 -0.66666 0.214390 0.041861 -0.129960 -0.363050 -0.39517 0.053495 \n", | |
"5 -0.17517 0.043968 -0.316620 -0.042977 0.122490 -0.28860 -0.040541 \n", | |
"6 -1.34120 -1.162900 -0.029398 -0.135130 0.497580 -1.47260 -0.007152 \n", | |
"7 -0.66152 0.849280 0.281970 0.030402 -0.294750 -0.43598 0.037590 \n", | |
"8 -1.74920 1.705500 0.545230 0.648500 -0.044507 -0.95528 -1.418200 \n", | |
"9 -1.04130 0.246810 0.140320 -0.858140 0.474890 -0.45504 0.168170 \n", | |
"\n", | |
" 7 8 9 10 \n", | |
"0 -0.127260 0.630760 -1.673800 0 \n", | |
"1 -0.212700 1.974800 1.557300 0 \n", | |
"2 0.262780 0.141460 0.009772 0 \n", | |
"3 0.121940 0.441890 0.005098 0 \n", | |
"4 0.076691 0.398830 -0.373070 0 \n", | |
"5 0.000970 -0.246240 -0.068102 0 \n", | |
"6 -0.220570 0.101510 0.074106 0 \n", | |
"7 0.289210 0.316870 -0.276840 0 \n", | |
"8 0.743170 -0.007359 -0.074117 1 \n", | |
"9 -0.232990 0.061892 -0.133630 0 " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Read the headerless CSV (pandas numbers the columns itself) and preview the first 10 rows.\n", | |
"df = pd.read_csv(\n", | |
"    \"ds1.10.csv\",\n", | |
"    header=None\n", | |
")\n", | |
"df.head(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def gen_points(df):\n", | |
"    \"\"\"Yield one LabeledPoint per row of ``df``.\n", | |
"\n", | |
"    The last column of each row is used as the label and every\n", | |
"    preceding column is used as a feature.\n", | |
"    \"\"\"\n", | |
"    for _, series in df.iterrows():\n", | |
"        values = list(series)\n", | |
"        # Slice from position 0: the earlier values[1:-1] silently dropped the first feature column.\n", | |
"        yield LabeledPoint(values[-1], values[:-1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Materialise the generator so `points` can still be inspected or reused after training;\n", | |
"# a bare generator is exhausted by its first consumer.\n", | |
"points = list(gen_points(df))\n", | |
"iterations = 100\n", | |
"# Fit logistic regression with SGD on an RDD built from the labelled points.\n", | |
"model = LogisticRegressionWithSGD.train(sc.parallelize(points), iterations=iterations)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Final weights: [-0.0669379287979,0.0168760757513,0.1144983594,-0.00741260878604,0.199330983325,0.103499320816,-0.00584475280362,-0.134381329648,0.169482551779]\n", | |
"Final intercept: 0.0\n" | |
] | |
} | |
], | |
"source": [ | |
"# Report the coefficients and intercept of the trained model.\n", | |
"print(\"Final weights: \", model.weights, sep=\"\")\n", | |
"print(\"Final intercept: \", model.intercept, sep=\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment