Skip to content

Instantly share code, notes, and snippets.

@chribsen
Created March 25, 2016 14:39
Show Gist options
  • Save chribsen/4575032abc5f29a29f5f to your computer and use it in GitHub Desktop.
Save chribsen/4575032abc5f29a29f5f to your computer and use it in GitHub Desktop.
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import psycopg2
import psycopg2.extras
from collections import Counter
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn import metrics
from sklearn.cross_validation import KFold, StratifiedKFold
import json
import numpy as np
# --- Database handle -------------------------------------------------------
# 'connstring' is a placeholder; supply a real libpq connection string.
conn_dtu = psycopg2.connect('connstring')
# Plain tuple cursor; rows are indexed positionally further down.
# (Alternative: conn_dtu.cursor(cursor_factory=psycopg2.extras.DictCursor))
cur_dtu = conn_dtu.cursor()

# --- Experiment switches ---------------------------------------------------
# use_met_next_day: pick the queries whose first column is a met/not-met int
#                   instead of the next-day occurrence count.
# use_many_days:    with the above, pick the query covering a date range
#                   rather than a single day.
use_met_next_day, use_many_days = True, False

# Feature rows and their targets, filled in by the query section below.
X, y = [], []
# Build the data set from the friend-pair tables.
#
# Every query below returns one row per result with the target in column 0
# and the feature values in the remaining columns.  Which query runs is
# decided by the two switches `use_met_next_day` / `use_many_days`.
if use_met_next_day:
    # Target is a met/not-met flag cast to int.
    if use_many_days:
        # Date-range variant: occurrences are summed per group and the full
        # set of derived pair features is selected.
        cur_dtu.execute("""
select (select met_next_day
FROM derived_friend_list_days as a
WHERE a.user_a = DFL.user_a
AND a.user_b = DFL.user_b
AND a.date_day > '2015-06-01' limit 1)::int as met_later,
sum(DFL.nr_of_occurences),
DFF.same_camp_score::int,
DFF.same_genre_score::int,
DFF.country_only_in_dk::int,
DFF.country_both_visited_fc::int,
DFF.country_one_visited_fc::int,
DFF.country_both_visited_different_fc::int
from derived_friend_list_days AS DFL
INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
WHERE DFL.date_day between '2015-06-27' and '2015-07-01'
group by
DFL.date_day,
met_later,
DFF.same_camp_score::int,
DFF.same_genre_score::int,
DFF.country_only_in_dk::int,
DFF.country_both_visited_fc::int,
DFF.country_one_visited_fc::int,
DFF.country_both_visited_different_fc::int
;""")
    else:
        # Single-day variant with only two features.
        cur_dtu.execute("""select DFL.met_next_day::int, DFL.nr_of_occurences::int, DFF.same_camp_score::int
from derived_friend_list_days AS DFL
INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
WHERE DFL.date_day = '2015-06-28' and dfl.nr_of_occurences > 3;""")
else:
    # Target is the pair's occurrence count on the following day; the scalar
    # sub-select yields NULL (None in Python) when no such row exists.
    cur_dtu.execute("""select (select DFL2.nr_of_occurences from derived_friend_list_days as DFL2 where DFL2.user_a = DFF.user_a AND DFL2.user_b = DFF.user_b AND date_day= (DFL.date_day + '1 day'::interval)), DFL.nr_of_occurences, DFF.same_camp_score::int, DFF.same_genre_score::int, DFF.country_only_in_dk::int, DFF.country_both_visited_fc::int, DFF.country_one_visited_fc::int, DFF.country_both_visited_different_fc::int
from derived_friend_list_days AS DFL
INNER JOIN derived_friend_features AS DFF ON DFF.user_a = DFL.user_a AND DFF.user_b = DFL.user_b
WHERE DFL.date_day = '2015-06-29';""")

# Pull the whole result set into memory, then release the database
# resources: nothing after this point talks to the database.
rows = cur_dtu.fetchall()
cur_dtu.close()
conn_dtu.close()

# Column 0 is the target, everything after it is the feature vector.
X.extend(row[1:] for row in rows)
y.extend(row[0] for row in rows)

if not use_met_next_day:
    # Missing next-day counts must become 0 *before* binning: np.digitize
    # cannot compare None with the bin edges, and its output never contains
    # None, so substituting afterwards (as was done previously) had no effect.
    y = [count if count is not None else 0 for count in y]
    # Turn raw counts into class labels using bin edges 1, 9, 17, ..., 97;
    # kept as a list so later code sees the same container type as before.
    y = list(np.digitize(y, bins=range(1, 100, 8)))
# ---- Quick look at the assembled data set ---------------------------------
# First few feature rows.
print(X[:10])
#print('Max value of y: ' + str(max(y)))
#print('Min value of y: ' + str(min(y)))

# Spread and centre of the target values.
y_as_array = np.array(y)
print('Std dev of y: {0}'.format(str(y_as_array.std())))
print('Mean of y: {0}'.format(str(y_as_array.mean())))
print('Digitizing values ')

# Drop features whose variance is at or below 0.8 * (1 - 0.8) and show what
# is left.  The selector is only used for this printout; X itself is not
# replaced by the reduced matrix.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
rule = '-' * 20
print(rule)
print('VarianceThreshold:')
reduced = sel.fit_transform(X)
print(reduced)
print(rule)

# Disabled experiment: recursive feature elimination with cross-validation.
#cls = RandomForestClassifier(n_estimators=200)
#selector = RFECV(cls, step=1, cv=5)
#selector = selector.fit(X, y)
#print('N features selected: ')
#print(str(selector.n_features_))
#print()
#print('Support: ')
#print(str(selector.support_ ))
#print()
#print('Ranking: ')
#print(str(selector.ranking_))
#print()
#print('Grid scores: ')
#print(str(selector.grid_scores_))

# Number of targets collected and how often each target value occurs.
print('feature count: {0}'.format(len(y)))
print('frequency count{0}'.format(dict(Counter(y))))
print(str(X[:10]))
# ---- 5-fold cross-validated accuracy for two classifiers -------------------
# Convert to arrays once; the fold loop further down indexes X and y with
# integer index arrays, which plain lists do not support.
X = np.array(X)
y = np.array(y)

# Each entry: (heading printed before the scores, estimator to evaluate).
# `cls` and `scores` stay module-level names, as before.
for heading, cls in (
    ('Random forest: ', RandomForestClassifier(n_estimators=200)),
    ('Multinomial bayes: ', MultinomialNB()),
):
    print(heading)
    scores = cross_val_score(cls, X, y, cv=5)
    # Mean score first, then the individual per-fold scores.
    print(scores.mean())
    print(scores)
# ---- Per-fold confusion matrices for the random forest ---------------------
# A single estimator object is re-fitted on each fold's training split.
cls = RandomForestClassifier(n_estimators=200)

# Legacy sklearn.cross_validation API: KFold takes the sample count and is
# itself iterable over (train_indices, test_indices) pairs.
kf = KFold(len(X), n_folds=3)
for train_idx, test_idx in kf:
    cls.fit(X[train_idx], y[train_idx])
    predicted = cls.predict(X[test_idx])

    # The header lists the exact sample indices that went into each split.
    header = 'Confusion matrix for train fold: {0} and test fold: {1}'
    print(header.format(str(train_idx), str(test_idx)))

    # True labels first, predictions second.
    print(metrics.confusion_matrix(y[test_idx], predicted))
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment