Created
June 22, 2022 20:57
-
-
Save rajeshpv/0c4b8b6ae0e4f8b1ae8300eb87bfd69c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Python 3.9.13 (tags/v3.9.13:6de2ca5, May 17 2022, 16:36:42) [MSC v.1929 64 bit (AMD64)] | |
Type "copyright", "credits" or "license" for more information. | |
IPython -- An enhanced Interactive Python. | |
In [1]: import numpy as np # import numpy into Python and use it under its alias 'np' | |
...: import pandas as pd # import pandas into Python and use it under its alias 'pd' | |
...: import statsmodels.api as sm # import statsmodels.api into Python and use it under its alias 'sm' | |
...: import sklearn # import sklearn into Python | |
...: import matplotlib.pyplot as plt # import matplotlib.pyplot into Python and use it under its alias 'plt' | |
...: import seaborn as sns # import the seaborn library and use it under its alias 'sns' | |
...: from scipy import stats,integrate | |
...: import pylab # import pylab into Python | |
...: import scipy.stats as stats # import scipy.stats into Python and use it under its alias 'stats' | |
...: | |
...: # Now we can read/import the data into Python. The data is included in a typical .csv data file. | |
In [2]: Telecom_Churn_Data = pd.read_csv("C:/Users/rao8r/Downloads/mod-7/to-sandhya/TelecomCustomerChurnData.csv", sep =",") | |
In [3]: Telecom_Churn_Data.head(20) # show the first 20 rows in the data | |
Out[3]: | |
Tenure PhoneService ... TotalCharges Churn | |
0 1 No ... 29.85 0 | |
1 34 Yes ... 1889.50 0 | |
2 2 Yes ... 108.15 1 | |
3 45 No ... 1840.75 0 | |
4 2 Yes ... 151.65 1 | |
5 8 Yes ... 820.50 1 | |
6 22 Yes ... 1949.40 0 | |
7 10 No ... 301.90 0 | |
8 28 Yes ... 3046.05 1 | |
9 62 Yes ... 3487.95 0 | |
10 13 Yes ... 587.45 0 | |
11 16 Yes ... 326.80 0 | |
12 58 Yes ... 5681.10 0 | |
13 49 Yes ... 5036.30 1 | |
14 25 Yes ... 2686.05 0 | |
15 69 Yes ... 7895.15 0 | |
16 52 Yes ... 1022.95 0 | |
17 71 Yes ... 7382.25 0 | |
18 10 Yes ... 528.35 1 | |
19 21 Yes ... 1862.90 0 | |
[20 rows x 7 columns] | |
In [4]: Telecom_Churn_Data.tail(20) # show the last 20 rows in the data | |
Out[4]: | |
Tenure PhoneService ... TotalCharges Churn | |
7011 72 Yes ... 7544.30 0 | |
7012 63 Yes ... 6479.40 0 | |
7013 44 Yes ... 3626.35 0 | |
7014 18 Yes ... 1679.40 0 | |
7015 9 Yes ... 403.35 1 | |
7016 13 Yes ... 931.55 0 | |
7017 68 Yes ... 4326.25 0 | |
7018 6 No ... 263.05 0 | |
7019 2 Yes ... 39.25 0 | |
7020 55 Yes ... 3316.10 0 | |
7021 1 Yes ... 75.75 1 | |
7022 38 Yes ... 2625.25 0 | |
7023 67 Yes ... 6886.25 1 | |
7024 19 Yes ... 1495.10 0 | |
7025 12 No ... 743.30 0 | |
7026 72 Yes ... 1419.40 0 | |
7027 24 Yes ... 1990.50 0 | |
7028 72 Yes ... 7362.90 0 | |
7029 11 No ... 346.45 0 | |
7030 4 Yes ... 306.60 1 | |
[20 rows x 7 columns] | |
In [5]: Telecom_Churn_Data.columns.tolist() # show the names of columns/variables in the data | |
Out[5]: | |
['Tenure', | |
'PhoneService', | |
'Contract', | |
'PaperlessBilling', | |
'PaymentMethod', | |
'TotalCharges', | |
'Churn'] | |
In [6]: Telecom_Churn_Data.shape # output the dimension of the Telecom_Churn_Data object. This is similar to dim() in R | |
Out[6]: (7031, 7) | |
In [7]: Telecom_Churn_Data.dtypes # show the data types of the variables in the data | |
Out[7]: | |
Tenure int64 | |
PhoneService object | |
Contract object | |
PaperlessBilling object | |
PaymentMethod object | |
TotalCharges float64 | |
Churn int64 | |
dtype: object | |
In [8]: Stat_summary_table = Telecom_Churn_Data.describe().T # output the statistics for all the variables. | |
In [9]: check_missing_value = Telecom_Churn_Data.isnull().sum(axis=1) | |
In [10]: check_missing_value[check_missing_value!=0].count() | |
Out[10]: 0 | |
In [11]: Telecom_Churn_Data.info() # you can check whether there are nulls in each variable. | |
<class 'pandas.core.frame.DataFrame'> | |
RangeIndex: 7031 entries, 0 to 7030 | |
Data columns (total 7 columns): | |
# Column Non-Null Count Dtype | |
--- ------ -------------- ----- | |
0 Tenure 7031 non-null int64 | |
1 PhoneService 7031 non-null object | |
2 Contract 7031 non-null object | |
3 PaperlessBilling 7031 non-null object | |
4 PaymentMethod 7031 non-null object | |
5 TotalCharges 7031 non-null float64 | |
6 Churn 7031 non-null int64 | |
dtypes: float64(1), int64(2), object(4) | |
memory usage: 384.6+ KB | |
In [12]: List_Cate_Var = ["Tenure","PhoneService", "Contract", "PaperlessBilling", "PaymentMethod", "TotalCharges", "Churn"] | |
In [13]: for var_name in List_Cate_Var: | |
...: print("The Frequency Table of the", var_name, "Variable") | |
...: print(Telecom_Churn_Data[var_name].value_counts()) # generate the frequency table for each categorical variable in the loop | |
The Frequency Table of the Tenure Variable | |
1 613 | |
72 362 | |
2 238 | |
3 200 | |
4 176 | |
... | |
38 59 | |
28 57 | |
39 56 | |
44 51 | |
36 50 | |
Name: Tenure, Length: 72, dtype: int64 | |
The Frequency Table of the PhoneService Variable | |
Yes 6351 | |
No 680 | |
Name: PhoneService, dtype: int64 | |
The Frequency Table of the Contract Variable | |
Month-to-month 3875 | |
Two year 1684 | |
One year 1472 | |
Name: Contract, dtype: int64 | |
The Frequency Table of the PaperlessBilling Variable | |
Yes 4167 | |
No 2864 | |
Name: PaperlessBilling, dtype: int64 | |
The Frequency Table of the PaymentMethod Variable | |
Electronic check 2365 | |
Mailed check 1604 | |
Bank transfer (automatic) 1541 | |
Credit card (automatic) 1521 | |
Name: PaymentMethod, dtype: int64 | |
The Frequency Table of the TotalCharges Variable | |
20.20 11 | |
19.75 9 | |
19.90 8 | |
20.05 8 | |
19.65 8 | |
.. | |
6849.40 1 | |
692.35 1 | |
130.15 1 | |
3211.90 1 | |
306.60 1 | |
Name: TotalCharges, Length: 6529, dtype: int64 | |
The Frequency Table of the Churn Variable | |
0 5162 | |
1 1869 | |
Name: Churn, dtype: int64 | |
In [14]: plt.figure() # open a new figure window | |
Out[14]: <Figure size 432x288 with 0 Axes><Figure size 432x288 with 0 Axes> | |
In [15]: plt.hist(Telecom_Churn_Data['Tenure'], bins=25) # generate a histogram with the number of bins = 25 | |
Out[15]: | |
(array([1051., 419., 373., 332., 284., 264., 144., 238., 252., | |
201., 206., 217., 115., 179., 200., 186., 198., 216., | |
138., 209., 203., 218., 244., 293., 651.]), | |
array([ 1. , 3.84, 6.68, 9.52, 12.36, 15.2 , 18.04, 20.88, 23.72, | |
26.56, 29.4 , 32.24, 35.08, 37.92, 40.76, 43.6 , 46.44, 49.28, | |
52.12, 54.96, 57.8 , 60.64, 63.48, 66.32, 69.16, 72. ]), | |
<BarContainer object of 25 artists>) | |
Warning | |
Figures now render in the Plots pane by default. To make them also appear inline in the Console, uncheck "Mute Inline Plotting" under the Plots pane options menu. | |
In [16]: plt.hist(Telecom_Churn_Data['Tenure'], bins=50) # generate a histogram with the number of bins = 50 | |
Out[16]: | |
(array([851., 200., 309., 110., 254., 119., 116., 216., 109., 175., 80., | |
184., 73., 71., 153., 85., 173., 79., 72., 129., 72., 134., | |
64., 153., 50., 65., 115., 64., 135., 65., 112., 74., 68., | |
130., 68., 148., 70., 68., 144., 65., 127., 76., 146., 72., | |
80., 164., 98., 195., 119., 532.]), | |
array([ 1. , 2.42, 3.84, 5.26, 6.68, 8.1 , 9.52, 10.94, 12.36, | |
13.78, 15.2 , 16.62, 18.04, 19.46, 20.88, 22.3 , 23.72, 25.14, | |
26.56, 27.98, 29.4 , 30.82, 32.24, 33.66, 35.08, 36.5 , 37.92, | |
39.34, 40.76, 42.18, 43.6 , 45.02, 46.44, 47.86, 49.28, 50.7 , | |
52.12, 53.54, 54.96, 56.38, 57.8 , 59.22, 60.64, 62.06, 63.48, | |
64.9 , 66.32, 67.74, 69.16, 70.58, 72. ]), | |
<BarContainer object of 50 artists>) | |
In [17]: plt.hist(Telecom_Churn_Data['Tenure'], bins=10) # generate a histogram with the number of bins = 10 | |
Out[17]: | |
(array([1724., 735., 561., 538., 473., 444., 452., 495., 501., | |
1108.]), | |
array([ 1. , 8.1, 15.2, 22.3, 29.4, 36.5, 43.6, 50.7, 57.8, 64.9, 72. ]), | |
<BarContainer object of 10 artists>) | |
In [18]: plt.xlabel("Tenure") # add a x-label | |
Out[18]: Text(0.5, 0, 'Tenure') | |
In [19]: plt.ylabel("Frequency") # add a y-label | |
Out[19]: Text(0, 0.5, 'Frequency') | |
In [20]: plt.title("Histogram of Tenure") # add a title for the chart | |
Out[20]: Text(0.5, 1.0, 'Histogram of Tenure') | |
In [21]: plt.figure() # open a new figure window | |
...: plt.xlabel("Tenure") # add a x-label | |
...: plt.ylabel("Frequency") # add a y-label | |
...: plt.title("Histogram of Tenure") # add a title for the chart | |
...: plt.hist(Telecom_Churn_Data['Tenure'], bins=25) # generate a histogram with the number of bins = 25 | |
...: # here bins = 25 means that 25 classes/groups are specified | |
Out[21]: | |
(array([1051., 419., 373., 332., 284., 264., 144., 238., 252., | |
201., 206., 217., 115., 179., 200., 186., 198., 216., | |
138., 209., 203., 218., 244., 293., 651.]), | |
array([ 1. , 3.84, 6.68, 9.52, 12.36, 15.2 , 18.04, 20.88, 23.72, | |
26.56, 29.4 , 32.24, 35.08, 37.92, 40.76, 43.6 , 46.44, 49.28, | |
52.12, 54.96, 57.8 , 60.64, 63.48, 66.32, 69.16, 72. ]), | |
<BarContainer object of 25 artists>) | |
In [22]: plt.figure() # open a new figure window | |
...: sns.kdeplot(Telecom_Churn_Data['Tenure']) # 'kde' here means a kernel density | |
Out[22]: <AxesSubplot:xlabel='Tenure', ylabel='Density'> | |
In [23]: plt.figure() # open a new figure window | |
...: sns.distplot(Telecom_Churn_Data['Tenure']) # as you can see, sns.distplot automatically chooses the number of bins | |
C:\Users\rao8r\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). | |
warnings.warn(msg, FutureWarning) | |
Out[23]: <AxesSubplot:xlabel='Tenure', ylabel='Density'> | |
In [24]: plt.figure() # open a new figure window | |
...: sns.boxplot(Telecom_Churn_Data['Tenure']) | |
C:\Users\rao8r\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. | |
warnings.warn( | |
Out[24]: <AxesSubplot:xlabel='Tenure'> | |
In [25]: | |
...: plt.figure() # open a new figure window | |
...: plt.xlabel("TotalCharges") # add a x-label | |
...: plt.ylabel("Frequency") # add a y-label | |
...: plt.title("Histogram of TotalCharges") # add a title for the chart | |
...: plt.hist(Telecom_Churn_Data['TotalCharges'], bins=25) # generate a histogram with the number of bins = 25 | |
...: # here bins = 25 means that 25 classes/groups are specified | |
Out[25]: | |
(array([1678., 728., 587., 536., 408., 318., 243., 215., 208., | |
195., 173., 196., 165., 174., 158., 146., 164., 148., | |
135., 111., 101., 87., 77., 55., 25.]), | |
array([ 18.8 , 365.44, 712.08, 1058.72, 1405.36, 1752. , 2098.64, | |
2445.28, 2791.92, 3138.56, 3485.2 , 3831.84, 4178.48, 4525.12, | |
4871.76, 5218.4 , 5565.04, 5911.68, 6258.32, 6604.96, 6951.6 , | |
7298.24, 7644.88, 7991.52, 8338.16, 8684.8 ]), | |
<BarContainer object of 25 artists>) | |
In [26]: | |
...: plt.figure() # open a new figure window | |
...: plt.xlabel("TotalCharges") # add a x-label | |
...: plt.ylabel("Frequency") # add a y-label | |
...: plt.title("Histogram of TotalCharges") # add a title for the chart | |
...: plt.hist(Telecom_Churn_Data['TotalCharges'], bins=15) # generate a histogram with the number of bins = 15 | |
...: # here bins = 15 means that 15 classes/groups are specified | |
Out[26]: | |
(array([2197., 970., 770., 477., 364., 338., 309., 281., 276., | |
267., 236., 201., 157., 130., 58.]), | |
array([ 18.8 , 596.53333333, 1174.26666667, 1752. , | |
2329.73333333, 2907.46666667, 3485.2 , 4062.93333333, | |
4640.66666667, 5218.4 , 5796.13333333, 6373.86666667, | |
6951.6 , 7529.33333333, 8107.06666667, 8684.8 ]), | |
<BarContainer object of 15 artists>) | |
In [27]: plt.figure() # open a new figure window | |
...: sns.kdeplot(Telecom_Churn_Data['TotalCharges']) # 'kde' here means a kernel density | |
Out[27]: <AxesSubplot:xlabel='TotalCharges', ylabel='Density'> | |
In [28]: | |
...: plt.figure() # open a new figure window | |
...: sns.distplot(Telecom_Churn_Data['TotalCharges']) # as you can see, sns.distplot automatically chooses the number of bins | |
C:\Users\rao8r\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). | |
warnings.warn(msg, FutureWarning) | |
Out[28]: <AxesSubplot:xlabel='TotalCharges', ylabel='Density'> | |
In [29]: | |
...: | |
...: plt.figure() # open a new figure window | |
...: sns.boxplot(Telecom_Churn_Data['TotalCharges']) | |
C:\Users\rao8r\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. | |
warnings.warn( | |
Out[29]: <AxesSubplot:xlabel='TotalCharges'> | |
In [30]: from scipy.stats import chi2_contingency # import the chi2_contingency function from scipy.stats | |
In [31]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['Tenure'])) | |
Out[31]: | |
(1059.9449494025498, | |
4.084542332671357e-176, | |
71, | |
array([[450.05063291, 174.73417722, 146.83544304, 129.21518987, | |
97.64556962, 80.75949367, 96.17721519, 90.30379747, | |
87.36708861, 85.16455696, 72.6835443 , 85.89873418, | |
80.02531646, 55.79746835, 72.6835443 , 58.73417722, | |
63.87341772, 71.21518987, 53.59493671, 52.12658228, | |
46.25316456, 66.07594937, 62.40506329, 69.01265823, | |
58. , 58. , 52.86075949, 41.84810127, | |
52.86075949, 52.86075949, 47.72151899, 50.65822785, | |
46.98734177, 47.72151899, 64.60759494, 36.70886076, | |
47.72151899, 43.3164557 , 41.11392405, 46.98734177, | |
51.39240506, 47.72151899, 47.72151899, 37.44303797, | |
44.78481013, 54.32911392, 49.92405063, 46.98734177, | |
48.4556962 , 49.92405063, 49.92405063, 58.73417722, | |
51.39240506, 49.92405063, 46.98734177, 58.73417722, | |
47.72151899, 49.18987342, 44.05063291, 55.79746835, | |
55.79746835, 51.39240506, 52.86075949, 58.73417722, | |
55.79746835, 64.60759494, 71.94936709, 73.41772152, | |
69.74683544, 87.36708861, 124.81012658, 265.7721519 ], | |
[162.94936709, 63.26582278, 53.16455696, 46.78481013, | |
35.35443038, 29.24050633, 34.82278481, 32.69620253, | |
31.63291139, 30.83544304, 26.3164557 , 31.10126582, | |
28.97468354, 20.20253165, 26.3164557 , 21.26582278, | |
23.12658228, 25.78481013, 19.40506329, 18.87341772, | |
16.74683544, 23.92405063, 22.59493671, 24.98734177, | |
21. , 21. , 19.13924051, 15.15189873, | |
19.13924051, 19.13924051, 17.27848101, 18.34177215, | |
17.01265823, 17.27848101, 23.39240506, 13.29113924, | |
17.27848101, 15.6835443 , 14.88607595, 17.01265823, | |
18.60759494, 17.27848101, 17.27848101, 13.55696203, | |
16.21518987, 19.67088608, 18.07594937, 17.01265823, | |
17.5443038 , 18.07594937, 18.07594937, 21.26582278, | |
18.60759494, 18.07594937, 17.01265823, 21.26582278, | |
17.27848101, 17.81012658, 15.94936709, 20.20253165, | |
20.20253165, 18.60759494, 19.13924051, 21.26582278, | |
20.20253165, 23.39240506, 26.05063291, 26.58227848, | |
25.25316456, 31.63291139, 45.18987342, 96.2278481 ]])) | |
In [32]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['PhoneService'])) | |
Out[32]: | |
(0.8780612721866795, | |
0.3487332316441293, | |
1, | |
array([[ 499.24050633, 4662.75949367], | |
[ 180.75949367, 1688.24050633]])) | |
In [33]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['Contract'])) | |
Out[33]: | |
(1179.080956754422, | |
9.243221963845809e-257, | |
2, | |
array([[2844.93670886, 1080.70886076, 1236.35443038], | |
[1030.06329114, 391.29113924, 447.64556962]])) | |
In [34]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['PaperlessBilling'])) | |
Out[34]: | |
(257.06741761837947, | |
7.477609371069561e-58, | |
1, | |
array([[2102.6835443, 3059.3164557], | |
[ 761.3164557, 1107.6835443]])) | |
In [35]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['PaymentMethod'])) | |
Out[35]: | |
(645.1528611114002, | |
1.6378691391382898e-139, | |
3, | |
array([[1131.36708861, 1116.6835443 , 1736.32911392, 1177.62025316], | |
[ 409.63291139, 404.3164557 , 628.67088608, 426.37974684]])) | |
In [36]: chi2_contingency(pd.crosstab(Telecom_Churn_Data['Churn'], Telecom_Churn_Data['TotalCharges'])) | |
Out[36]: | |
(6502.6232970687415, | |
0.5857175785198268, | |
6528, | |
array([[0.73417722, 1.46835443, 0.73417722, ..., 0.73417722, 0.73417722, | |
0.73417722], | |
[0.26582278, 0.53164557, 0.26582278, ..., 0.26582278, 0.26582278, | |
0.26582278]])) | |
Author
rajeshpv
commented
Jun 22, 2022
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment