Created
March 25, 2016 14:39
-
-
Save chribsen/4575032abc5f29a29f5f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cross_validation import cross_val_score | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.naive_bayes import MultinomialNB | |
import psycopg2 | |
import psycopg2.extras | |
from collections import Counter | |
from sklearn.feature_selection import VarianceThreshold | |
from sklearn.feature_selection import RFECV | |
from sklearn import metrics | |
from sklearn.cross_validation import KFold, StratifiedKFold | |
import json | |
import numpy as np | |
# Load the training rows from PostgreSQL.
#
# Every query returns the prediction target in the first column and the
# features in the remaining columns; rows are split accordingly into y and X.
#
# NOTE(review): the indentation of this script was lost in the copy this was
# recovered from. The nesting below (one fetch loop shared by all three query
# branches) is the reading under which every branch populates X and y --
# confirm against the author's original.
conn_dtu = psycopg2.connect('connstring')
cur_dtu = conn_dtu.cursor()  # alternative: conn_dtu.cursor(cursor_factory=psycopg2.extras.DictCursor)

# Toggles selecting which query builds the data set:
#   use_met_next_day -> target is the met_next_day flag (else: next-day occurrence count)
#   use_many_days    -> aggregate over a date range instead of a single day
use_met_next_day = True
use_many_days = False

X = []
y = []

try:
    if use_met_next_day:
        if use_many_days:
            cur_dtu.execute("""
                select (select met_next_day
                FROM derived_friend_list_days as a
                WHERE a.user_a = DFL.user_a
                AND a.user_b = DFL.user_b
                AND a.date_day > '2015-06-01' limit 1)::int as met_later,
                sum(DFL.nr_of_occurences),
                DFF.same_camp_score::int,
                DFF.same_genre_score::int,
                DFF.country_only_in_dk::int,
                DFF.country_both_visited_fc::int,
                DFF.country_one_visited_fc::int,
                DFF.country_both_visited_different_fc::int
                from derived_friend_list_days AS DFL
                INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
                WHERE DFL.date_day between '2015-06-27' and '2015-07-01'
                group by
                DFL.date_day,
                met_later,
                DFF.same_camp_score::int,
                DFF.same_genre_score::int,
                DFF.country_only_in_dk::int,
                DFF.country_both_visited_fc::int,
                DFF.country_one_visited_fc::int,
                DFF.country_both_visited_different_fc::int
                ;""")
        else:
            cur_dtu.execute("""select DFL.met_next_day::int, DFL.nr_of_occurences::int, DFF.same_camp_score::int
                from derived_friend_list_days AS DFL
                INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
                WHERE DFL.date_day = '2015-06-28' and dfl.nr_of_occurences > 3;""")
    else:
        # The target is a scalar subquery and is NULL (None in Python) when
        # the pair has no row for the following day.
        cur_dtu.execute("""select (select DFL2.nr_of_occurences from derived_friend_list_days as DFL2 where DFL2.user_a = DFF.user_a AND DFL2.user_b = DFF.user_b AND date_day= (DFL.date_day + '1 day'::interval)), DFL.nr_of_occurences, DFF.same_camp_score::int, DFF.same_genre_score::int, DFF.country_only_in_dk::int, DFF.country_both_visited_fc::int, DFF.country_one_visited_fc::int, DFF.country_both_visited_different_fc::int
            from derived_friend_list_days AS DFL
            INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
            WHERE DFL.date_day = '2015-06-29';""")

    # First column is the label, the rest of the row is the feature vector.
    for each in cur_dtu.fetchall():
        X.append(each[1:])
        y.append(each[0])
finally:
    # The database is not needed after the rows are in memory; release the
    # cursor and connection even if the query or fetch fails.
    cur_dtu.close()
    conn_dtu.close()
# Bin the targets into ordinal classes using the bin edges 1, 9, 17, ..., 97.
#
# Missing targets (None, i.e. SQL NULL) are mapped to 0 *before* binning.
# np.digitize returns integer bin indices and never None, so substituting
# afterwards (as the code previously did) could not take effect, and None
# values reaching np.digitize either raise or get binned unpredictably
# depending on the Python/numpy version. A value of 0 lies below the first
# edge and therefore lands in class 0.
#
# NOTE(review): the original indentation was lost; this step is kept at top
# level, where it is an identity on 0/1 labels -- confirm intended placement.
y = [each if each is not None else 0 for each in y]
y = list(np.digitize(y, bins=range(1, 100, 8)))
# Sanity output: peek at the first feature rows and summarise the labels.
print(X[:10])
#print('Max value of y: ' + str(max(y)))
#print('Min value of y: ' + str(min(y)))
y_as_array = np.array(y)
print('Std dev of y: ' + str(y_as_array.std()))
print('Mean of y: ' + str(y_as_array.mean()))
print('Digitizing values ')

# Show which feature columns survive a variance cut-off of .8 * (1 - .8).
# The transformed matrix is only printed; X itself is left untouched.
separator = '-' * 20
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
print(separator)
print('VarianceThreshold:')
print(sel.fit_transform(X))
print(separator)
# Disabled experiment: recursive feature elimination with cross-validation
# (RFECV) around a random forest. Kept for reference only.
#cls = RandomForestClassifier(n_estimators=200)
#selector = RFECV(cls, step=1, cv=5)
#selector = selector.fit(X, y)
#print('N features selected: ')
#print(str(selector.n_features_))
#print()
#print('Support: ')
#print(str(selector.support_ ))
#print()
#print('Ranking: ')
#print(str(selector.ranking_))
#print()
#print('Grid scores: ')
#print(str(selector.grid_scores_))

# Report the number of labelled rows and how often each label occurs.
# (The printed text says "feature count", but the number is len(y).)
n_rows = len(y)
label_counts = dict(Counter(y))
print('feature count: ' + str(n_rows))
print('frequency count' + str(label_counts))
print(X[:10])

# Convert to arrays so the index-array selection used later (X[train]) works.
X, y = np.array(X), np.array(y)
# Score each candidate classifier with 5-fold cross-validation and print the
# mean score followed by the per-fold scores.
for heading, cls in (
    ('Random forest: ', RandomForestClassifier(n_estimators=200)),
    ('Multinomial bayes: ', MultinomialNB()),
):
    print(heading)
    scores = cross_val_score(cls, X, y, cv=5)
    print(scores.mean())
    print(scores)
# Fit a random forest on each of 3 folds and print the confusion matrix of
# its predictions on the held-out part.
#
# NOTE(review): the original indentation was lost; the trailing print() is
# assumed to be inside the loop (a blank line after every fold).
cls = RandomForestClassifier(n_estimators=200)
kf = KFold(len(X), n_folds=3)
for train, test in kf:
    # train / test are index arrays into X and y.
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    cls.fit(X_train, y_train)
    y_pred = cls.predict(X_test)

    fold_header = 'Confusion matrix for train fold: {0} and test fold: {1}'.format(str(train), str(test))
    print(fold_header)
    conf_mat = metrics.confusion_matrix(y_test, y_pred)
    print(conf_mat)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment