Skip to content

Instantly share code, notes, and snippets.

@alfard
Last active August 29, 2015 14:20
Show Gist options
  • Save alfard/1f691dbb5916004995bd to your computer and use it in GitHub Desktop.
import numpy as np
import csv
import random
####TRAIN######################################################################
# Parse the raw training CSV into a float32 matrix, then make a reproducible
# 10000-row train / remainder hold-out split.
# NOTE(review): path is machine-specific -- adjust before running.
a = []
with open('/home/alfard/Documents/Kaggle/Loan/train20000.csv', "rb") as f:
    # alternative inputs used during experimentation:
    # '/home/ubuntu/train_v2.csv', '/home/ubuntu/train20000.csv'
    fileopen = csv.reader(f, delimiter=',', quotechar='"')
    for row in fileopen:
        a.append(row)
# del a[0]  # header row -- disabled; presumably already stripped from this file
a = np.array(a)
a[a == 'NA'] = 'NaN'        # the string 'NaN' parses to float NaN below
a = a.astype(np.float32)
np.random.seed(42)          # fixed seed -> reproducible shuffle/split
np.random.shuffle(a)
tr, te = a[:10000, :], a[10000:, :]
# (Earlier experiments -- class balancing and randomly dropping negative
# examples -- removed; at 8000 rows the positive rate was ~10.35%.)
# Column 0 is the row id, columns 1..769 are features, column 770 the loss.
Xtr = tr[:, 1:770]          # feature matrix: drop id and target columns
YtrR = tr[:, 770].copy()    # raw loss values, preserved before binarising
Ytr = tr[:, 770]            # view into tr; binarised in place below
Ytr[Ytr > 0] = 1            # binary target: any positive loss -> 1
del a                       # free the full parsed array
####TEST##############################################################################
# Same treatment for the hold-out split: keep the raw losses, binarise the target.
Xte = te[:, 1:770]
YteR = te[:, 770].copy()
Yte = te[:, 770]
Yte[Yte > 0] = 1
#######################################################################################
## Column cleanup: drop columns that are all-NA or hold a single constant
## value (indices identified manually beforehand).
# The original deleted these one at a time in strictly DESCENDING order,
# which is exactly equivalent to a single np.delete with the original
# indices -- done in one shot here.
dead_cols = [754, 726, 692, 691, 690, 668, 32, 31, 30]
Xtr = np.delete(Xtr, dead_cols, 1)
Xte = np.delete(Xte, dead_cols, 1)
# Xt = np.delete(Xt, dead_cols, 1)   # full test set (np.load'd Xt12.npy), when enabled
##################################################################################
###Comparaison de colonne nombre de donnees identiques
###Ok
#Imputation
print "****imputation"
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(Xtr)
Xtr=imp.transform(Xtr)
#############################################################################################################
imp.fit(Xte)
Xte=imp.transform(Xte)
#imp.fit(Xt)
#Xt=imp.transform(Xt)
###################################################################################
#Dummy les 3 derniers colonnes
a=np.zeros(769,int)
b=np.zeros(769,int)
for i in range(755, 758):
C1=Xtr[:,i]
C2=Xte[:,i]
C3=Xtr[:,i]
temp1=np.intersect1d(C1,C2)
temp2=np.intersect1d(temp1,C3)
dummy = (C1[:, None] == temp2).astype(float)
Xtr=np.hstack((Xtr, dummy))
dummy = (C2[:, None] == temp2).astype(float)
Xte=np.hstack((Xte, dummy))
# dummy = (C3[:, None] == temp2).astype(float)
# Xt=np.hstack((Xt, dummy))
print i
a = np.insert(a,[i],i)
b = np.insert(b,[i],1)
c = np.vstack((a, b)).T
c = np.unique(c[c[:,1]==1])
#On efface les colonnes
for i in range(0, len(c)):
Xtr = np.delete(Xtr, c[i], 1)
Xte = np.delete(Xte, c[i], 1)
# Xt = np.delete(Xt , c[i], 1)
#######################################################################################################
#####################################################################
#print "preprocessing"
#from sklearn import preprocessing as pre
#Xtr=pre.StandardScaler().fit_transform(Xtr)
#Xte=pre.StandardScaler().fit_transform(Xte)
#Xtr = pre.scale(Xtr)
#Xte = pre.scale(Xte)
#####################################################################
#on degage correlation
#Correlation
print "****correlation"
npco=np.corrcoef(Xtr,rowvar=0)
npco=abs(npco)
wh=np.where(npco<0.7)
wh=np.array(wh)
#valeur unique
len(np.unique(wh))
filter = np.array(np.unique(wh)).tolist()
Xtr=Xtr[:,filter]
Xte=Xte[:,filter]
#Xt=Xt[:,filter]
#######################################################################
print "****variables selections"
#from sklearn.svm import SVC
#from sklearn.feature_selection import RFE
#svc = SVC(kernel="linear", C=1)
#rfe = RFE(estimator=svc, n_features_to_select=200, step=1)
#rfe.fit(Xtr, Ytr)
#Xtr=rfe.transform(Xtr)
#Xte=rfe.transform(Xte)
#Xt=rfe.transform(Xt)
#class_weight='auto'
#from sklearn.svm import LinearSVC
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose = 2)
#lsvc.fit(Xtr, Ytr)
#Xtr = lsvc.transform(Xtr)
#Xte = lsvc.transform(Xte)
######################################################################
print "****Model"
#from sklearn.ensemble import GradientBoostingClassifier
#clf = GradientBoostingClassifier(n_estimators=300,verbose=1,max_depth=1)
import sklearn.linear_model as lm
clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
C=1.0, fit_intercept=True, intercept_scaling=1.0,
class_weight=None, random_state=None)
#from sklearn.ensemble import GradientBoostingRegressor
#clf = GradientBoostingRegressor(n_estimators=100,loss='quantile',verbose=1,alpha=0.7)
#from sklearn.svm import LinearSVC
#clf = LinearSVC(loss = 'l2')
#from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(C=10,penalty='l2')
#clf = LogisticRegression()
#from sklearn import svm
#clf = svm.SVC()
clf.fit(Xtr, Ytr)
YtrP=clf.predict(Xtr)
print "Variable binaire Train"
print (1/float(len(Ytr)))*np.sum(abs(Ytr-YtrP))
#print clf.score(Xtr, Ytr)
print "Variable reel Train"
print (1/float(len(Ytr)))*np.sum(abs(YtrR-YtrP))
#print clf.score(Xtr, YtrR)
#print "Matrice ce confusion Train"
#from sklearn.metrics import confusion_matrix
#cmTr = confusion_matrix(Ytr, YtrP)
#print cmTr
############################################################################
YteP=clf.predict(Xte)
print "Variable binaire Test"
print (1/float(len(Yte)))*np.sum(abs(Yte-YteP))
#print clf.score(Xte,Yte)
print "Variable reel Test"
print (1/float(len(Yte)))*np.sum(abs(YteR-YteP))
#print clf.score(Xte,YteR)
#print "Matrice ce confusion Test"
#from sklearn.metrics import confusion_matrix
#cmTe = confusion_matrix(Yte, YteP)
#print cmTe
###############################################################################
#class_weight="auto"
#randomly dropping negative example
#{"loss": "lad", "n_estimators": 3000, "learning_rate": 0.035, "max_features": 80, "max_depth": 7, "subsample": 0.5}
################################################################################
# Free the training/evaluation arrays before loading the full test set.
del Xtr,Xte,Ytr,Yte,YtrP,YteP,YtrR,YteR
#################################################################################
#print "Chargement"
#Xt=np.load("/home/alfard/Documents/Kaggle/Loan/Xt12.npy")
#Xt=np.load("/home/ubuntu/Xt12.npy")
#b=[]
#print "Calcul du resultat"
#f = open('/home/ubuntu/test_v2.csv',"rb")
#fileopen = csv.reader(f,delimiter=',', quotechar='"')
#for row in fileopen:
# b.append(row)
#f.close()
#del b[0]
#b= np.array(b)
#b[b=='NA']='NaN'
#b = b.astype(np.float32)
#b.shape
#Id=b[:,0]
#Xt=b[:,1:770]
#np.save("/home/ubuntu/Xt12.npy", Xt)
#######################################################################
# NOTE(review): `Xt` is never defined on this code path -- every load above
# (np.load of Xt12.npy, or the raw test_v2.csv parse) is commented out, so
# this line raises NameError as-is.  Re-enable one of the Xt loads, and make
# sure it went through the SAME column deletions / imputation / dummy
# encoding / correlation filter as Xtr before predicting.
YtP=clf.predict(Xt)
#######################################################################
#Id=np.load("/home/alfard/Documents/Kaggle/Loan/IdXt.npy")
Id=np.load("/home/ubuntu/IdXt.npy")
# Write the submission: one (id, prediction) pair per row.
RES=np.transpose(np.vstack((Id, YtP)))
np.savetxt("/home/ubuntu/gbmR12.csv", RES, delimiter=",",fmt='%g')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment