Skip to content

Instantly share code, notes, and snippets.

@fccoelho
Created July 9, 2015 12:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fccoelho/2f13f0624cd75a1a255a to your computer and use it in GitHub Desktop.
Save fccoelho/2f13f0624cd75a1a255a to your computer and use it in GitHub Desktop.
PySpark Logistic regression
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<pyspark.context.SparkContext at 0x7f3c7c2e52e8>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.mllib.regression import LabeledPoint\n",
"from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-4.19290</td>\n",
" <td>2.517100</td>\n",
" <td>0.473780</td>\n",
" <td>-0.628430</td>\n",
" <td>-2.687400</td>\n",
" <td>0.28546</td>\n",
" <td>0.191070</td>\n",
" <td>-0.127260</td>\n",
" <td>0.630760</td>\n",
" <td>-1.673800</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-6.68060</td>\n",
" <td>2.969700</td>\n",
" <td>0.990980</td>\n",
" <td>2.466200</td>\n",
" <td>2.115500</td>\n",
" <td>-0.27363</td>\n",
" <td>-0.371340</td>\n",
" <td>-0.212700</td>\n",
" <td>1.974800</td>\n",
" <td>1.557300</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.25967</td>\n",
" <td>0.423830</td>\n",
" <td>0.089746</td>\n",
" <td>0.103350</td>\n",
" <td>-0.210260</td>\n",
" <td>-0.31418</td>\n",
" <td>0.092722</td>\n",
" <td>0.262780</td>\n",
" <td>0.141460</td>\n",
" <td>0.009772</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-1.15640</td>\n",
" <td>0.848080</td>\n",
" <td>-0.259460</td>\n",
" <td>-0.411420</td>\n",
" <td>-1.437400</td>\n",
" <td>-0.43879</td>\n",
" <td>0.607720</td>\n",
" <td>0.121940</td>\n",
" <td>0.441890</td>\n",
" <td>0.005098</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.66666</td>\n",
" <td>0.214390</td>\n",
" <td>0.041861</td>\n",
" <td>-0.129960</td>\n",
" <td>-0.363050</td>\n",
" <td>-0.39517</td>\n",
" <td>0.053495</td>\n",
" <td>0.076691</td>\n",
" <td>0.398830</td>\n",
" <td>-0.373070</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>-0.17517</td>\n",
" <td>0.043968</td>\n",
" <td>-0.316620</td>\n",
" <td>-0.042977</td>\n",
" <td>0.122490</td>\n",
" <td>-0.28860</td>\n",
" <td>-0.040541</td>\n",
" <td>0.000970</td>\n",
" <td>-0.246240</td>\n",
" <td>-0.068102</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>-1.34120</td>\n",
" <td>-1.162900</td>\n",
" <td>-0.029398</td>\n",
" <td>-0.135130</td>\n",
" <td>0.497580</td>\n",
" <td>-1.47260</td>\n",
" <td>-0.007152</td>\n",
" <td>-0.220570</td>\n",
" <td>0.101510</td>\n",
" <td>0.074106</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>-0.66152</td>\n",
" <td>0.849280</td>\n",
" <td>0.281970</td>\n",
" <td>0.030402</td>\n",
" <td>-0.294750</td>\n",
" <td>-0.43598</td>\n",
" <td>0.037590</td>\n",
" <td>0.289210</td>\n",
" <td>0.316870</td>\n",
" <td>-0.276840</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>-1.74920</td>\n",
" <td>1.705500</td>\n",
" <td>0.545230</td>\n",
" <td>0.648500</td>\n",
" <td>-0.044507</td>\n",
" <td>-0.95528</td>\n",
" <td>-1.418200</td>\n",
" <td>0.743170</td>\n",
" <td>-0.007359</td>\n",
" <td>-0.074117</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>-1.04130</td>\n",
" <td>0.246810</td>\n",
" <td>0.140320</td>\n",
" <td>-0.858140</td>\n",
" <td>0.474890</td>\n",
" <td>-0.45504</td>\n",
" <td>0.168170</td>\n",
" <td>-0.232990</td>\n",
" <td>0.061892</td>\n",
" <td>-0.133630</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 -4.19290 2.517100 0.473780 -0.628430 -2.687400 0.28546 0.191070 \n",
"1 -6.68060 2.969700 0.990980 2.466200 2.115500 -0.27363 -0.371340 \n",
"2 -0.25967 0.423830 0.089746 0.103350 -0.210260 -0.31418 0.092722 \n",
"3 -1.15640 0.848080 -0.259460 -0.411420 -1.437400 -0.43879 0.607720 \n",
"4 -0.66666 0.214390 0.041861 -0.129960 -0.363050 -0.39517 0.053495 \n",
"5 -0.17517 0.043968 -0.316620 -0.042977 0.122490 -0.28860 -0.040541 \n",
"6 -1.34120 -1.162900 -0.029398 -0.135130 0.497580 -1.47260 -0.007152 \n",
"7 -0.66152 0.849280 0.281970 0.030402 -0.294750 -0.43598 0.037590 \n",
"8 -1.74920 1.705500 0.545230 0.648500 -0.044507 -0.95528 -1.418200 \n",
"9 -1.04130 0.246810 0.140320 -0.858140 0.474890 -0.45504 0.168170 \n",
"\n",
" 7 8 9 10 \n",
"0 -0.127260 0.630760 -1.673800 0 \n",
"1 -0.212700 1.974800 1.557300 0 \n",
"2 0.262780 0.141460 0.009772 0 \n",
"3 0.121940 0.441890 0.005098 0 \n",
"4 0.076691 0.398830 -0.373070 0 \n",
"5 0.000970 -0.246240 -0.068102 0 \n",
"6 -0.220570 0.101510 0.074106 0 \n",
"7 0.289210 0.316870 -0.276840 0 \n",
"8 0.743170 -0.007359 -0.074117 1 \n",
"9 -0.232990 0.061892 -0.133630 0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"ds1.10.csv\", header=None)\n",
"df[:10]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def gen_points(df):\n",
"    \"\"\"Yield one LabeledPoint per row of ``df``.\n",
"\n",
"    The last element of each row is used as the label and every preceding\n",
"    element, starting with the first column, as a feature.\n",
"    \"\"\"\n",
"    for _, series in df.iterrows():\n",
"        values = list(series)\n",
"        # Slice from the start of the row: ``values[1:-1]`` would drop the\n",
"        # first feature column and train on one feature too few.\n",
"        yield LabeledPoint(values[-1], values[:-1])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"iterations = 100\n",
"points = gen_points(df)\n",
"# gen_points is a generator function, so ``points`` can only be iterated once:\n",
"# inspecting it first (e.g. list(points)[:10]) would leave nothing to train on.\n",
"model = LogisticRegressionWithSGD.train(\n",
"    sc.parallelize(points),\n",
"    iterations=iterations\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final weights: [-0.0669379287979,0.0168760757513,0.1144983594,-0.00741260878604,0.199330983325,0.103499320816,-0.00584475280362,-0.134381329648,0.169482551779]\n",
"Final intercept: 0.0\n"
]
}
],
"source": [
"# Report the weights and intercept stored on the trained model.\n",
"# sep=\"\" keeps the label and the value adjacent, exactly as string\n",
"# concatenation with str() would.\n",
"print(\"Final weights: \", model.weights, sep=\"\")\n",
"print(\"Final intercept: \", model.intercept, sep=\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment