Skip to content

Instantly share code, notes, and snippets.

@birolkuyumcu
Last active July 8, 2016 13:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save birolkuyumcu/e6e9132116127a7641abbda7a6f9ee34 to your computer and use it in GitHub Desktop.
Save birolkuyumcu/e6e9132116127a7641abbda7a6f9ee34 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# Read the training and test splits from their text files, in that order
# (paths are relative to the current working directory).
data_train, data_test = map(
    pd.read_csv,
    ("data/datatraining.txt", "data/datatest.txt"),
)
# Display the column labels of the training frame.
data_train.columns
Index([u'date', u'Temperature', u'Humidity', u'Light', u'CO2',
       u'HumidityRatio', u'Occupancy'],
      dtype='object')
# Display summary statistics for the training frame.
data_train.describe()
Temperature Humidity Light CO2 HumidityRatio Occupancy
count 8143.000000 8143.000000 8143.000000 8143.000000 8143.000000 8143.000000
mean 20.619084 25.731507 119.519375 606.546243 0.003863 0.212330
std 1.016916 5.531211 194.755805 314.320877 0.000852 0.408982
min 19.000000 16.745000 0.000000 412.750000 0.002674 0.000000
25% 19.700000 20.200000 0.000000 439.000000 0.003078 0.000000
50% 20.390000 26.222500 0.000000 453.500000 0.003801 0.000000
75% 21.390000 30.533333 256.375000 638.833333 0.004352 0.000000
max 23.180000 39.117500 1546.333333 2028.500000 0.006476 1.000000
# Preview the first five rows of the training frame.
data_train.head(5)
date Temperature Humidity Light CO2 HumidityRatio Occupancy
1 2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 1
2 2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 1
3 2015-02-04 17:53:00 23.15 27.2450 426.0 713.50 0.004779 1
4 2015-02-04 17:54:00 23.15 27.2000 426.0 708.25 0.004772 1
5 2015-02-04 17:55:00 23.10 27.2000 426.0 704.50 0.004757 1
# Split the training frame into a target vector and a feature matrix.
# The 'Occupancy' column is the target; it is read before being dropped.
out_train = data_train['Occupancy'].values
# 'date' is discarded and 'Occupancy' removed, leaving only the model inputs
# as a plain NumPy array.
data_train = data_train.drop(labels=['date', 'Occupancy'], axis=1).values
# Single-argument print(...) calls run on both Python 2 and Python 3.
# Joining with ' ' reproduces the separator the old print statement inserted,
# so the emitted text is unchanged. The Turkish labels mean "input"/"output".
print(' '.join(['Giriş : ', str(data_train.shape)]))
print(' '.join(['Çıkış :', str(out_train.shape)]))
Giriş :  (8143L, 5L)
Çıkış : (8143L,)
# Split the test frame into a target vector and a feature matrix.
out_test = data_test['Occupancy'].values
data_test = data_test.drop(labels=['date', 'Occupancy'], axis=1)
# Keep the remaining column names (in column order) before the frame is
# replaced by its underlying array; they are used later as plot tick labels.
inLabels = data_test.columns.tolist()
data_test = data_test.values
inLabels
['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
# Fit a small random forest (25 trees, depth capped at 3) on the training split.
# NOTE(review): no random_state is passed, so the scores presumably vary
# between runs - confirm before comparing against the recorded outputs.
classifier = RandomForestClassifier(n_estimators=25, max_depth=3).fit(
    data_train, out_train
)
# Predict labels for the test split and tabulate them against the true labels.
predictions = classifier.predict(data_test)
sklearn.metrics.confusion_matrix(out_test, predictions)
array([[1640,   53],
       [   5,  967]])
# Accuracy of the random forest's predictions on the test split.
sklearn.metrics.accuracy_score(out_test,predictions)
0.97823639774859283
# Print the fitted forest's per-feature importance scores.
print(classifier.feature_importances_)
[ 0.06791678  0.01400845  0.61079644  0.28089006  0.02638828]
# Bar chart of the random forest's feature importances, one bar per input
# column, with the column names as tick labels.
x_pos = list(range(len(inLabels)))
max_y = classifier.feature_importances_.max()
plt.bar(x_pos, classifier.feature_importances_, align='center')
plt.xticks(x_pos, inLabels)
# Leave 10% headroom above the tallest bar.
plt.ylim([0, 1.1 * max_y])
plt.title('Importance of features')
plt.ylabel('Importance')
plt.grid()
plt.show()

png

from sklearn.linear_model import LassoLarsCV
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only
# and the same transform is then applied to both splits.
scaler = StandardScaler().fit(data_train)
data_train, data_test = scaler.transform(data_train), scaler.transform(data_test)
# Fit a Lasso-LARS regressor with 10-fold cross-validation on the labels.
regModel = LassoLarsCV(cv=10).fit(data_train, out_train)
# Convert the continuous regression output into hard labels:
# 1 at or above the cut-off, 0 below it.
thresh = 0.5
rPredictions = regModel.predict(data_test)
rPredictions = (rPredictions >= thresh).astype(rPredictions.dtype)
sklearn.metrics.confusion_matrix(out_test, rPredictions)
array([[1637,   56],
       [   2,  970]])
# Accuracy of the thresholded regression predictions on the test split.
sklearn.metrics.accuracy_score(out_test,rPredictions)
0.97823639774859283
# Bar chart of the fitted regression coefficients, one bar per input column,
# followed by a printout of the raw coefficient values.
x_pos = list(range(len(inLabels)))
plt.bar(x_pos, regModel.coef_, align='center')
plt.grid()
# Linear-model coefficients can be negative, so the axis range has to cover
# both signs rather than being pinned at zero. For all-non-negative
# coefficients this yields the same [0, max * 1.1] range as before.
min_y = min(regModel.coef_)
max_y = max(regModel.coef_)
y_low = min(0, min_y * 1.1)
y_high = max(0, max_y * 1.1)
# Skip the explicit limits when every coefficient is zero; an empty range
# would otherwise be passed to matplotlib.
if y_low < y_high:
    plt.ylim([y_low, y_high])
plt.ylabel('Importance')
plt.xticks(x_pos, inLabels)
plt.title('Importance of features')
plt.show()
print(regModel.coef_)

png

[ 0.          0.          0.30397557  0.06647666  0.        ]
# Sweep the decision threshold applied to the regressor's continuous output,
# record the accuracy on each split at every threshold, plot both curves and
# report the threshold with the highest mean of the two accuracies.
rPredictions = regModel.predict(data_test)
ptrain = regModel.predict(data_train)
xpos = []      # thresholds tried: 0.999, 0.998, ..., 0.001
accTest = []   # test-split accuracy at each threshold
accTrain = []  # training-split accuracy at each threshold
for i in range(999, 0, -1):
    thresh = i / 1000.0
    # 1 where the raw output reaches the threshold, 0 elsewhere. New arrays
    # are built so the raw predictions stay intact for the next iteration.
    tp1 = (rPredictions >= thresh).astype(rPredictions.dtype)
    tp2 = (ptrain >= thresh).astype(ptrain.dtype)
    xpos.append(thresh)
    accTest.append(sklearn.metrics.accuracy_score(out_test, tp1))
    accTrain.append(sklearn.metrics.accuracy_score(out_train, tp2))

# NOTE(review): the test-split accuracy takes part in choosing the threshold,
# so the test accuracy reported below is not an independent estimate.
meanAcc = [(a1 + a2) / 2 for a1, a2 in zip(accTest, accTrain)]
# First occurrence of the best mean accuracy, i.e. the largest threshold that
# attains it. Taking the index directly avoids a failed lookup when no
# accuracy exceeds zero.
i_max = meanAcc.index(max(meanAcc))
maxAc = meanAcc[i_max]
maxTh = xpos[i_max]

plt.plot(xpos, accTest, 'r', xpos, accTrain, 'b')
plt.legend(('Test', 'Train'))
plt.grid()
plt.show()
# Single-argument print(...) calls run on both Python 2 and Python 3; joining
# with ' ' reproduces the separator the old print statement inserted.
print(' '.join([
    "Maksimum Accuracy Mean : ", str(maxAc),
    " with threshold value :", str(maxTh),
]))
print(' '.join([
    "Maksimum Accuracy for Training Data : ", str(accTrain[i_max]),
    "... for Testing :", str(accTest[i_max]),
]))

png

Maksimum Accuracy Mean :  0.983284967878  with threshold value : 0.602
Maksimum Accuracy for Training Data :  0.988333538008 ... for Testing : 0.978236397749
@birolkuyumcu
Copy link
Author

output_25_0
output_15_0
output_23_0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment