birolkuyumcu/occupy.md

## occupy.md

      
    Raw
  

              occupy.md
            
          
    import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# Read the training and test files in one unpacking step, then display the
# column index of the training frame.
data_train, data_test = (
    pd.read_csv(path)
    for path in ("data/datatraining.txt", "data/datatest.txt")
)
data_train.columns
Index([u'date', u'Temperature', u'Humidity', u'Light', u'CO2',
       u'HumidityRatio', u'Occupancy'],
      dtype='object')

# Summary statistics (count, mean, std, quartiles, min/max) for the
# columns of the training frame.
data_train.describe()


      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
      count
      8143.000000
      8143.000000
      8143.000000
      8143.000000
      8143.000000
      8143.000000
    
    
      mean
      20.619084
      25.731507
      119.519375
      606.546243
      0.003863
      0.212330
    
    
      std
      1.016916
      5.531211
      194.755805
      314.320877
      0.000852
      0.408982
    
    
      min
      19.000000
      16.745000
      0.000000
      412.750000
      0.002674
      0.000000
    
    
      25%
      19.700000
      20.200000
      0.000000
      439.000000
      0.003078
      0.000000
    
    
      50%
      20.390000
      26.222500
      0.000000
      453.500000
      0.003801
      0.000000
    
    
      75%
      21.390000
      30.533333
      256.375000
      638.833333
      0.004352
      0.000000
    
    
      max
      23.180000
      39.117500
      1546.333333
      2028.500000
      0.006476
      1.000000
    
  
# Preview the first five rows of the training frame.
data_train.head(5)


      date
      Temperature
      Humidity
      Light
      CO2
      HumidityRatio
      Occupancy
    
  
      1
      2015-02-04 17:51:00
      23.18
      27.2720
      426.0
      721.25
      0.004793
      1
    
    
      2
      2015-02-04 17:51:59
      23.15
      27.2675
      429.5
      714.00
      0.004783
      1
    
    
      3
      2015-02-04 17:53:00
      23.15
      27.2450
      426.0
      713.50
      0.004779
      1
    
    
      4
      2015-02-04 17:54:00
      23.15
      27.2000
      426.0
      708.25
      0.004772
      1
    
    
      5
      2015-02-04 17:55:00
      23.10
      27.2000
      426.0
      704.50
      0.004757
      1
    
  
# Split the training frame into a predictor matrix and a target vector.
# The 'date' column is dropped, so the timestamp is not used as a feature.
data_train.drop(labels=['date'], axis=1, inplace=True)
# pop() removes 'Occupancy' from the frame and returns it in one step.
out_train = data_train.pop('Occupancy').values
data_train = data_train.values
# print() calls instead of Python 2 print statements so the cell parses on
# Python 3 (on Python 2 add `from __future__ import print_function`).
# Items are still separated by a single space.
print('Giriş : ', data_train.shape)
print('Çıkış :', out_train.shape)
Giriş :  (8143L, 5L)
Çıkış : (8143L,)

# Same preparation for the test split: discard the timestamp, pull the
# target out of the frame, keep the predictor names for plot labels, and
# convert what is left to a NumPy array.
data_test = data_test.drop(labels=['date'], axis=1)
out_test = data_test.pop('Occupancy').values
inLabels = list(data_test.columns)
data_test = data_test.values
inLabels
['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']

from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
# Random forest with 25 trees, each limited to a depth of 3.
# NOTE(review): no random_state is passed, so the exact numbers presumably
# vary between runs; confirm whether reproducibility matters here.
classifier = RandomForestClassifier(max_depth=3, n_estimators=25)
classifier.fit(data_train, out_train)
predictions = classifier.predict(data_test)
# Compare the true test labels with the forest's predictions.
sklearn.metrics.confusion_matrix(y_true=out_test, y_pred=predictions)
array([[1640,   53],
       [   5,  967]])

# Accuracy of the random forest predictions against the true test labels.
sklearn.metrics.accuracy_score(out_test,predictions)
0.97823639774859283

# Feature importances of the fitted forest, one value per input column
# (presumably in the same order as inLabels; verify the column order of
# the training and test files matches).
print(classifier.feature_importances_)
[ 0.06791678  0.01400845  0.61079644  0.28089006  0.02638828]

# Bar chart of the forest's feature importances, one bar per predictor.
x_pos = list(range(len(inLabels)))
max_y = classifier.feature_importances_.max()
plt.bar(x_pos, classifier.feature_importances_, align='center')
plt.xticks(x_pos, inLabels)
plt.ylabel('Importance')
plt.title('Importance of features')
# Leave 10% headroom above the tallest bar.
plt.ylim([0, max_y * 1.1])
plt.grid()
plt.show()

from sklearn.linear_model import LassoLarsCV
from sklearn.preprocessing import StandardScaler
# Standardise the predictors: the scaler is fitted on the training split
# only and the same transform is then applied to the test split.
scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)
# Fit a cross-validated (10-fold) LassoLars regressor on the target and
# predict continuous scores for the test split.
regModel = LassoLarsCV(cv=10).fit(data_train, out_train)
rPredictions = regModel.predict(data_test)
# Turn the continuous scores into hard 0/1 labels at the cut-off.
thresh = 0.5
rPredictions[rPredictions < thresh] = 0
rPredictions[rPredictions >= thresh] = 1
sklearn.metrics.confusion_matrix(y_true=out_test, y_pred=rPredictions)
array([[1637,   56],
       [   2,  970]])

# Accuracy of the thresholded Lasso predictions against the true test labels.
sklearn.metrics.accuracy_score(out_test,rPredictions)
0.97823639774859283

# Bar chart of the fitted Lasso coefficients, one bar per predictor.
x_pos = list(range(len(inLabels)))
plt.bar(x_pos, regModel.coef_, align='center')
plt.grid()
# Regression coefficients can be negative, so the lower axis limit follows
# the smallest coefficient instead of being pinned at 0 (which would hide
# any negative bar). With all-non-negative coefficients the limits are the
# same as before: [0, 1.1 * max].
max_y = max(regModel.coef_)
min_y = min(regModel.coef_)
plt.ylim([min(0, min_y * 1.1), max(0, max_y * 1.1)])
plt.ylabel('Importance')
plt.xticks(x_pos, inLabels)
plt.title('Importance of features')
plt.show()
print(regModel.coef_)

[ 0.          0.          0.30397557  0.06647666  0.        ]

# Sweep the decision threshold from 0.999 down to 0.001 and record the
# accuracy on both splits for every value.
rPredictions = regModel.predict(data_test)
ptrain = regModel.predict(data_train)
xpos = []
accTest = []
accTrain = []
maxAc = 0.0
maxTh = 0
i_max = 0
for i in range(999, 0, -1):
    thresh = i / 1000.0
    # Comparison + astype() builds fresh 0.0/1.0 arrays, so the raw scores
    # in rPredictions and ptrain are left untouched for the next threshold.
    tp1 = (rPredictions >= thresh).astype(float)
    tp2 = (ptrain >= thresh).astype(float)
    a1 = sklearn.metrics.accuracy_score(out_test, tp1)
    a2 = sklearn.metrics.accuracy_score(out_train, tp2)
    # NOTE(review): the selection criterion averages train AND test
    # accuracy, so the threshold is tuned on the test split and the test
    # accuracy reported for it is optimistically biased.
    a = (a1 + a2) / 2
    if a > maxAc:
        maxAc = a
        maxTh = thresh
        # Record the position directly rather than searching xpos for a
        # float afterwards (which raises ValueError if maxTh stays at 0).
        i_max = len(xpos)
    xpos.append(thresh)
    accTest.append(a1)
    accTrain.append(a2)
plt.plot(xpos, accTest, 'r', xpos, accTrain, 'b')
plt.legend(('Test', 'Train'))
plt.grid()
plt.show()
# print() calls instead of Python 2 print statements so the cell parses on
# Python 3 (on Python 2 add `from __future__ import print_function`).
print("Maksimum Accuracy Mean : ", maxAc, " with threshold value :", maxTh)
print("Maksimum Accuracy for Training Data : ", accTrain[i_max], "... for Testing :", accTest[i_max])

Maksimum Accuracy Mean :  0.983284967878  with threshold value : 0.602
Maksimum Accuracy for Training Data :  0.988333538008 ... for Testing : 0.978236397749
	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
count	8143.000000	8143.000000	8143.000000	8143.000000	8143.000000	8143.000000
mean	20.619084	25.731507	119.519375	606.546243	0.003863	0.212330
std	1.016916	5.531211	194.755805	314.320877	0.000852	0.408982
min	19.000000	16.745000	0.000000	412.750000	0.002674	0.000000
25%	19.700000	20.200000	0.000000	439.000000	0.003078	0.000000
50%	20.390000	26.222500	0.000000	453.500000	0.003801	0.000000
75%	21.390000	30.533333	256.375000	638.833333	0.004352	0.000000
max	23.180000	39.117500	1546.333333	2028.500000	0.006476	1.000000
	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
1	2015-02-04 17:51:00	23.18	27.2720	426.0	721.25	0.004793	1
2	2015-02-04 17:51:59	23.15	27.2675	429.5	714.00	0.004783	1
3	2015-02-04 17:53:00	23.15	27.2450	426.0	713.50	0.004779	1
4	2015-02-04 17:54:00	23.15	27.2000	426.0	708.25	0.004772	1
5	2015-02-04 17:55:00	23.10	27.2000	426.0	704.50	0.004757	1