Last active
August 29, 2015 14:20
-
-
Save alfard/1f691dbb5916004995bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import csv
import random

####TRAIN######################################################################
# Load the 20k-row training sample of the Kaggle "Loan Default" data,
# replace the string 'NA' by 'NaN' so the float cast yields np.nan,
# then shuffle reproducibly and split 10k train / 10k holdout.
a = []
#f = open('/home/ubuntu/train_v2.csv',"rb")
#f = open('/home/ubuntu/train20000.csv',"rb")
# 'rb' mode: Python 2 csv wants a binary handle.  `with` guarantees the
# file is closed even if a row fails to parse (the original leaked the
# handle on error).
with open('/home/alfard/Documents/Kaggle/Loan/train20000.csv', "rb") as f:
    fileopen = csv.reader(f, delimiter=',', quotechar='"')
    for row in fileopen:
        a.append(row)
#del a[0]
a = np.array(a)
a[a == 'NA'] = 'NaN'          # 'NaN' strings become np.nan after the cast below
a = a.astype(np.float32)
np.random.seed(42)            # fixed seed => reproducible shuffle/split
np.random.shuffle(a)
tr, te = a[:10000, :], a[10000:, :]
# Column 770 holds the target (loss amount); column 0 is the row id.
# Earlier class-balancing / negative-downsampling experiments were
# removed (8000 rows, ~10.35% positives, per the original notes).

# --- training labels -------------------------------------------------
Ytr = tr[:, 770]           # NOTE: a view into tr
YtrR = np.copy(Ytr)        # keep the raw loss values before binarising
Ytr[Ytr > 0] = 1           # binary target: any positive loss -> 1
Xtr = tr[:, 1:770]         # features: drop id (col 0) and target (col 770)
del a                      # full array no longer needed, free the memory
####TEST##############################################################################
# --- holdout labels, identical treatment -----------------------------
Yte = te[:, 770]
YteR = np.copy(Yte)
Yte[Yte > 0] = 1
Xte = te[:, 1:770]
#######################################################################################
##Cleanup: remove columns that are all-NA or constant.
#print "Chargement Xt"
#Xt=np.load("/home/alfard/Documents/Kaggle/Loan/Xt12.npy")
#Xt=np.load("/home/ubuntu/Xt12.npy")
# The indices below are strictly descending, so the original one-at-a-time
# unrolled np.delete calls never shifted a later index; deleting them all
# in a single simultaneous call is exactly equivalent and far shorter.
_dead_cols = [754, 726, 692, 691, 690, 668, 32, 31, 30]
Xtr = np.delete(Xtr, _dead_cols, 1)
Xte = np.delete(Xte, _dead_cols, 1)
#Xt = np.delete(Xt, _dead_cols, 1)
################################################################################## | |
###Comparaison de colonne nombre de donnees identiques | |
###Ok | |
#Imputation | |
print "****imputation" | |
from sklearn.preprocessing import Imputer | |
imp = Imputer(missing_values='NaN', strategy='mean', axis=0) | |
imp.fit(Xtr) | |
Xtr=imp.transform(Xtr) | |
############################################################################################################# | |
imp.fit(Xte) | |
Xte=imp.transform(Xte) | |
#imp.fit(Xt) | |
#Xt=imp.transform(Xt) | |
################################################################################### | |
#Dummy les 3 derniers colonnes | |
a=np.zeros(769,int) | |
b=np.zeros(769,int) | |
for i in range(755, 758): | |
C1=Xtr[:,i] | |
C2=Xte[:,i] | |
C3=Xtr[:,i] | |
temp1=np.intersect1d(C1,C2) | |
temp2=np.intersect1d(temp1,C3) | |
dummy = (C1[:, None] == temp2).astype(float) | |
Xtr=np.hstack((Xtr, dummy)) | |
dummy = (C2[:, None] == temp2).astype(float) | |
Xte=np.hstack((Xte, dummy)) | |
# dummy = (C3[:, None] == temp2).astype(float) | |
# Xt=np.hstack((Xt, dummy)) | |
print i | |
a = np.insert(a,[i],i) | |
b = np.insert(b,[i],1) | |
c = np.vstack((a, b)).T | |
c = np.unique(c[c[:,1]==1]) | |
#On efface les colonnes | |
for i in range(0, len(c)): | |
Xtr = np.delete(Xtr, c[i], 1) | |
Xte = np.delete(Xte, c[i], 1) | |
# Xt = np.delete(Xt , c[i], 1) | |
####################################################################################################### | |
##################################################################### | |
#print "preprocessing" | |
#from sklearn import preprocessing as pre | |
#Xtr=pre.StandardScaler().fit_transform(Xtr) | |
#Xte=pre.StandardScaler().fit_transform(Xte) | |
#Xtr = pre.scale(Xtr) | |
#Xte = pre.scale(Xte) | |
##################################################################### | |
#on degage correlation | |
#Correlation | |
print "****correlation" | |
npco=np.corrcoef(Xtr,rowvar=0) | |
npco=abs(npco) | |
wh=np.where(npco<0.7) | |
wh=np.array(wh) | |
#valeur unique | |
len(np.unique(wh)) | |
filter = np.array(np.unique(wh)).tolist() | |
Xtr=Xtr[:,filter] | |
Xte=Xte[:,filter] | |
#Xt=Xt[:,filter] | |
####################################################################### | |
print "****variables selections" | |
#from sklearn.svm import SVC | |
#from sklearn.feature_selection import RFE | |
#svc = SVC(kernel="linear", C=1) | |
#rfe = RFE(estimator=svc, n_features_to_select=200, step=1) | |
#rfe.fit(Xtr, Ytr) | |
#Xtr=rfe.transform(Xtr) | |
#Xte=rfe.transform(Xte) | |
#Xt=rfe.transform(Xt) | |
#class_weight='auto' | |
#from sklearn.svm import LinearSVC | |
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose = 2) | |
#lsvc.fit(Xtr, Ytr) | |
#Xtr = lsvc.transform(Xtr) | |
#Xte = lsvc.transform(Xte) | |
###################################################################### | |
print "****Model" | |
#from sklearn.ensemble import GradientBoostingClassifier | |
#clf = GradientBoostingClassifier(n_estimators=300,verbose=1,max_depth=1) | |
import sklearn.linear_model as lm | |
clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, | |
C=1.0, fit_intercept=True, intercept_scaling=1.0, | |
class_weight=None, random_state=None) | |
#from sklearn.ensemble import GradientBoostingRegressor | |
#clf = GradientBoostingRegressor(n_estimators=100,loss='quantile',verbose=1,alpha=0.7) | |
#from sklearn.svm import LinearSVC | |
#clf = LinearSVC(loss = 'l2') | |
#from sklearn.linear_model import LogisticRegression | |
#clf = LogisticRegression(C=10,penalty='l2') | |
#clf = LogisticRegression() | |
#from sklearn import svm | |
#clf = svm.SVC() | |
clf.fit(Xtr, Ytr) | |
YtrP=clf.predict(Xtr) | |
print "Variable binaire Train" | |
print (1/float(len(Ytr)))*np.sum(abs(Ytr-YtrP)) | |
#print clf.score(Xtr, Ytr) | |
print "Variable reel Train" | |
print (1/float(len(Ytr)))*np.sum(abs(YtrR-YtrP)) | |
#print clf.score(Xtr, YtrR) | |
#print "Matrice ce confusion Train" | |
#from sklearn.metrics import confusion_matrix | |
#cmTr = confusion_matrix(Ytr, YtrP) | |
#print cmTr | |
############################################################################ | |
YteP=clf.predict(Xte) | |
print "Variable binaire Test" | |
print (1/float(len(Yte)))*np.sum(abs(Yte-YteP)) | |
#print clf.score(Xte,Yte) | |
print "Variable reel Test" | |
print (1/float(len(Yte)))*np.sum(abs(YteR-YteP)) | |
#print clf.score(Xte,YteR) | |
#print "Matrice ce confusion Test" | |
#from sklearn.metrics import confusion_matrix | |
#cmTe = confusion_matrix(Yte, YteP) | |
#print cmTe | |
############################################################################### | |
#class_weight="auto" | |
#randomly dropping negative example | |
#{"loss": "lad", "n_estimators": 3000, "learning_rate": 0.035, "max_features": 80, "max_depth": 7, "subsample": 0.5} | |
################################################################################ | |
del Xtr,Xte,Ytr,Yte,YtrP,YteP,YtrR,YteR | |
################################################################################# | |
#print "Chargement" | |
#Xt=np.load("/home/alfard/Documents/Kaggle/Loan/Xt12.npy") | |
#Xt=np.load("/home/ubuntu/Xt12.npy") | |
#b=[] | |
#print "Calcul du resultat" | |
#f = open('/home/ubuntu/test_v2.csv',"rb") | |
#fileopen = csv.reader(f,delimiter=',', quotechar='"') | |
#for row in fileopen: | |
# b.append(row) | |
#f.close() | |
#del b[0] | |
#b= np.array(b) | |
#b[b=='NA']='NaN' | |
#b = b.astype(np.float32) | |
#b.shape | |
#Id=b[:,0] | |
#Xt=b[:,1:770] | |
#np.save("/home/ubuntu/Xt12.npy", Xt) | |
####################################################################### | |
YtP=clf.predict(Xt) | |
####################################################################### | |
#Id=np.load("/home/alfard/Documents/Kaggle/Loan/IdXt.npy") | |
Id=np.load("/home/ubuntu/IdXt.npy") | |
RES=np.transpose(np.vstack((Id, YtP))) | |
np.savetxt("/home/ubuntu/gbmR12.csv", RES, delimiter=",",fmt='%g') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment