Skip to content

Instantly share code, notes, and snippets.

@kr-shiveshwar
Last active January 30, 2018 00:20
Show Gist options
  • Save kr-shiveshwar/ad1bc3e68daa0509f1edba75c81a83af to your computer and use it in GitHub Desktop.
Save kr-shiveshwar/ad1bc3e68daa0509f1edba75c81a83af to your computer and use it in GitHub Desktop.
Churn_Prediction_Telecom
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.sql import SQLContext\n",
"from pyspark.sql.types import *\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Reuse the SparkContext injected by a PySpark-launched kernel when there is\n",
"# one; otherwise create it, so this cell does not fail with a NameError on\n",
"# `sc` when the notebook is run from a fresh, plain Python kernel.\n",
"from pyspark import SparkContext\n",
"\n",
"sc = SparkContext.getOrCreate()\n",
"sqlContext = SQLContext(sc)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Importing the data into PySpark"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Location of the telecom churn CSV on the local filesystem.\n",
"csv_path = '/home/shiveshwar/Programs/Telecom_churn/TelcomData.csv'\n",
"\n",
"# Read through the spark-csv data source, passing the header and\n",
"# inferSchema options so column names and types come from the file itself.\n",
"csv_reader = sqlContext.read.format('com.databricks.spark.csv')\n",
"churn_data = csv_reader.load(csv_path, header='true', inferSchema='true')"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>State</th>\n",
" <th>VMail_Plan</th>\n",
" <th>International_Plan</th>\n",
" <th>Total_Day_Minutes</th>\n",
" <th>Total_Day_Calls</th>\n",
" <th>Total_Day_Charge</th>\n",
" <th>Total_Evening_Minutes</th>\n",
" <th>Total_Evening_Calls</th>\n",
" <th>Total_Evening_Charge</th>\n",
" <th>Total_Night_Minutes</th>\n",
" <th>Total_Night_Calls</th>\n",
" <th>Total_Night_Charge</th>\n",
" <th>Total_International_Minutes</th>\n",
" <th>Total_International_Calls</th>\n",
" <th>Total_International_Charge</th>\n",
" <th>Customer_Service_Calls</th>\n",
" <th>Number_VMail_Messages</th>\n",
" <th>Churn</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OH</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>752.85</td>\n",
" <td>83</td>\n",
" <td>45.1710</td>\n",
" <td>1027.74</td>\n",
" <td>99</td>\n",
" <td>30.8322</td>\n",
" <td>851.61</td>\n",
" <td>85</td>\n",
" <td>17.0322</td>\n",
" <td>900.08</td>\n",
" <td>76</td>\n",
" <td>270.024</td>\n",
" <td>55</td>\n",
" <td>88</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>OK</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>862.92</td>\n",
" <td>84</td>\n",
" <td>51.7752</td>\n",
" <td>827.74</td>\n",
" <td>79</td>\n",
" <td>24.8322</td>\n",
" <td>932.64</td>\n",
" <td>86</td>\n",
" <td>18.6528</td>\n",
" <td>972.73</td>\n",
" <td>93</td>\n",
" <td>291.819</td>\n",
" <td>47</td>\n",
" <td>103</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>468.03</td>\n",
" <td>45</td>\n",
" <td>28.0818</td>\n",
" <td>588.86</td>\n",
" <td>57</td>\n",
" <td>17.6658</td>\n",
" <td>662.87</td>\n",
" <td>55</td>\n",
" <td>13.2574</td>\n",
" <td>513.69</td>\n",
" <td>62</td>\n",
" <td>154.107</td>\n",
" <td>33</td>\n",
" <td>64</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MA</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>743.11</td>\n",
" <td>64</td>\n",
" <td>44.5866</td>\n",
" <td>582.09</td>\n",
" <td>46</td>\n",
" <td>17.4627</td>\n",
" <td>601.45</td>\n",
" <td>50</td>\n",
" <td>12.0290</td>\n",
" <td>699.86</td>\n",
" <td>55</td>\n",
" <td>209.958</td>\n",
" <td>37</td>\n",
" <td>60</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MO</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>333.45</td>\n",
" <td>29</td>\n",
" <td>20.0070</td>\n",
" <td>439.42</td>\n",
" <td>42</td>\n",
" <td>13.1826</td>\n",
" <td>366.06</td>\n",
" <td>35</td>\n",
" <td>7.3212</td>\n",
" <td>367.30</td>\n",
" <td>34</td>\n",
" <td>110.190</td>\n",
" <td>22</td>\n",
" <td>34</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" State VMail_Plan International_Plan Total_Day_Minutes Total_Day_Calls \\\n",
"0 OH Yes Yes 752.85 83 \n",
"1 OK No Yes 862.92 84 \n",
"2 AL Yes Yes 468.03 45 \n",
"3 MA Yes Yes 743.11 64 \n",
"4 MO Yes No 333.45 29 \n",
"\n",
" Total_Day_Charge Total_Evening_Minutes Total_Evening_Calls \\\n",
"0 45.1710 1027.74 99 \n",
"1 51.7752 827.74 79 \n",
"2 28.0818 588.86 57 \n",
"3 44.5866 582.09 46 \n",
"4 20.0070 439.42 42 \n",
"\n",
" Total_Evening_Charge Total_Night_Minutes Total_Night_Calls \\\n",
"0 30.8322 851.61 85 \n",
"1 24.8322 932.64 86 \n",
"2 17.6658 662.87 55 \n",
"3 17.4627 601.45 50 \n",
"4 13.1826 366.06 35 \n",
"\n",
" Total_Night_Charge Total_International_Minutes Total_International_Calls \\\n",
"0 17.0322 900.08 76 \n",
"1 18.6528 972.73 93 \n",
"2 13.2574 513.69 62 \n",
"3 12.0290 699.86 55 \n",
"4 7.3212 367.30 34 \n",
"\n",
" Total_International_Charge Customer_Service_Calls Number_VMail_Messages \\\n",
"0 270.024 55 88 \n",
"1 291.819 47 103 \n",
"2 154.107 33 64 \n",
"3 209.958 37 60 \n",
"4 110.190 22 34 \n",
"\n",
" Churn \n",
"0 No \n",
"1 No \n",
"2 Yes \n",
"3 No \n",
"4 No "
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bring only the first five rows to the driver and wrap them in a pandas\n",
"# frame so the notebook renders them as a table.\n",
"preview_rows = churn_data.take(5)\n",
"pd.DataFrame(preview_rows, columns=churn_data.columns)"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"At this point the data still contains string-valued columns, which must be converted to numeric values before the data can be passed to the model."
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.sql.types import DoubleType\n",
"from pyspark.sql.functions import UserDefinedFunction"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Text flags and the numeric value each one stands for.\n",
"binary_map = {'Yes': 1.0, 'No': 0.0, 'True': 1.0, 'False': 0.0}\n",
"\n",
"\n",
"def _flag_to_double(flag):\n",
"    \"\"\"Map a Yes/No or True/False flag to 1.0 / 0.0.\n",
"\n",
"    A null input yields None, so the output column holds a null instead of\n",
"    the job failing on binary_map[None]. Values are looked up by their\n",
"    str() form, so a boolean True/False (which schema inference may produce\n",
"    for a True/False column) matches the 'True'/'False' keys. Any other\n",
"    value still raises KeyError, so unexpected data is not silently hidden.\n",
"    \"\"\"\n",
"    if flag is None:\n",
"        return None\n",
"    return binary_map[str(flag)]\n",
"\n",
"\n",
"# Spark UDF applying the mapping above; produces a DoubleType column.\n",
"toNum = UserDefinedFunction(_flag_to_double, DoubleType())"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Columns removed before modelling, in the order they are dropped.\n",
"dropped_cols = ['State', 'Total_Day_Charge', 'Total_Evening_Charge',\n",
"                'Total_Night_Charge', 'Total_International_Charge']\n",
"# Yes/No columns that are replaced in place by their numeric encoding.\n",
"binary_cols = ['Churn', 'VMail_Plan', 'International_Plan']\n",
"\n",
"churn_numeric = churn_data\n",
"for col_name in dropped_cols:\n",
"    churn_numeric = churn_numeric.drop(col_name)\n",
"for col_name in binary_cols:\n",
"    # The column is read from the frame as it was before this cell ran.\n",
"    churn_numeric = churn_numeric.withColumn(col_name, toNum(churn_data[col_name]))\n",
"\n",
"# Rebind the notebook-wide name to the converted frame and mark it for caching.\n",
"churn_data = churn_numeric.cache()"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>VMail_Plan</th>\n",
" <th>International_Plan</th>\n",
" <th>Total_Day_Minutes</th>\n",
" <th>Total_Day_Calls</th>\n",
" <th>Total_Evening_Minutes</th>\n",
" <th>Total_Evening_Calls</th>\n",
" <th>Total_Night_Minutes</th>\n",
" <th>Total_Night_Calls</th>\n",
" <th>Total_International_Minutes</th>\n",
" <th>Total_International_Calls</th>\n",
" <th>Customer_Service_Calls</th>\n",
" <th>Number_VMail_Messages</th>\n",
" <th>Churn</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>752.85</td>\n",
" <td>83</td>\n",
" <td>1027.74</td>\n",
" <td>99</td>\n",
" <td>851.61</td>\n",
" <td>85</td>\n",
" <td>900.08</td>\n",
" <td>76</td>\n",
" <td>55</td>\n",
" <td>88</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>862.92</td>\n",
" <td>84</td>\n",
" <td>827.74</td>\n",
" <td>79</td>\n",
" <td>932.64</td>\n",
" <td>86</td>\n",
" <td>972.73</td>\n",
" <td>93</td>\n",
" <td>47</td>\n",
" <td>103</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>468.03</td>\n",
" <td>45</td>\n",
" <td>588.86</td>\n",
" <td>57</td>\n",
" <td>662.87</td>\n",
" <td>55</td>\n",
" <td>513.69</td>\n",
" <td>62</td>\n",
" <td>33</td>\n",
" <td>64</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>743.11</td>\n",
" <td>64</td>\n",
" <td>582.09</td>\n",
" <td>46</td>\n",
" <td>601.45</td>\n",
" <td>50</td>\n",
" <td>699.86</td>\n",
" <td>55</td>\n",
" <td>37</td>\n",
" <td>60</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>333.45</td>\n",
" <td>29</td>\n",
" <td>439.42</td>\n",
" <td>42</td>\n",
" <td>366.06</td>\n",
" <td>35</td>\n",
" <td>367.30</td>\n",
" <td>34</td>\n",
" <td>22</td>\n",
" <td>34</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" VMail_Plan International_Plan Total_Day_Minutes Total_Day_Calls \\\n",
"0 1.0 1.0 752.85 83 \n",
"1 0.0 1.0 862.92 84 \n",
"2 1.0 1.0 468.03 45 \n",
"3 1.0 1.0 743.11 64 \n",
"4 1.0 0.0 333.45 29 \n",
"\n",
" Total_Evening_Minutes Total_Evening_Calls Total_Night_Minutes \\\n",
"0 1027.74 99 851.61 \n",
"1 827.74 79 932.64 \n",
"2 588.86 57 662.87 \n",
"3 582.09 46 601.45 \n",
"4 439.42 42 366.06 \n",
"\n",
" Total_Night_Calls Total_International_Minutes Total_International_Calls \\\n",
"0 85 900.08 76 \n",
"1 86 972.73 93 \n",
"2 55 513.69 62 \n",
"3 50 699.86 55 \n",
"4 35 367.30 34 \n",
"\n",
" Customer_Service_Calls Number_VMail_Messages Churn \n",
"0 55 88 0.0 \n",
"1 47 103 0.0 \n",
"2 33 64 1.0 \n",
"3 37 60 0.0 \n",
"4 22 34 0.0 "
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first five rows again to check the result of the conversion.\n",
"preview_rows = churn_data.take(5)\n",
"pd.DataFrame(preview_rows, columns=churn_data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Split roughly 80/20 into training and testing sets. The seed is fixed so\n",
"# that the same rows land in each set on every run; without it the split,\n",
"# and every metric computed from it, changes from run to run.\n",
"training_data, testing_data = churn_data.randomSplit([0.8, 0.2], seed=42)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.ml.feature import StringIndexer\n",
"from pyspark.ml.feature import VectorAssembler"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Predictor columns that are fed to the model.\n",
"reduced_numeric_cols = [\n",
"    'VMail_Plan',\n",
"    'International_Plan',\n",
"    'Total_Day_Minutes',\n",
"    'Total_Day_Calls',\n",
"    'Total_Evening_Minutes',\n",
"    'Total_Evening_Calls',\n",
"    'Total_Night_Minutes',\n",
"    'Total_Night_Calls',\n",
"    'Total_International_Minutes',\n",
"    'Total_International_Calls',\n",
"    'Customer_Service_Calls',\n",
"    'Number_VMail_Messages',\n",
"]\n",
"\n",
"# Combine the predictor columns into the single 'features' vector column.\n",
"assembler = VectorAssembler(inputCols=reduced_numeric_cols, outputCol='features')\n",
"\n",
"# Index the 'Churn' column into the 'label' column.\n",
"label_indexer = StringIndexer(inputCol='Churn', outputCol='label')"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.classification import RandomForestClassifier\n",
"from pyspark.mllib.tree import DecisionTree\n",
" \n",
"# Random forest training is randomized; pin the seed so that refitting on\n",
"# the same data reproduces the same model.\n",
"classifier = RandomForestClassifier(labelCol='label', featuresCol='features', seed=42)\n",
"\n",
"# Stages run in order: index the label, assemble the features, fit the forest.\n",
"pipeline = Pipeline(stages=[label_indexer, assembler, classifier])"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = pipeline.fit(training_data)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.ml.evaluation import BinaryClassificationEvaluator"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Apply the fitted pipeline to the held-out testing split.\n",
"predictions = model.transform(testing_data)\n",
"\n",
"# Evaluate the predictions, requesting the area-under-ROC metric explicitly\n",
"# through a parameter override on the evaluate call.\n",
"evaluator = BinaryClassificationEvaluator()\n",
"metric_params = {evaluator.metricName: \"areaUnderROC\"}\n",
"auroc = evaluator.evaluate(predictions, metric_params)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.5101919305127485"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"auroc"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment