Skip to content

Instantly share code, notes, and snippets.

@dhwajraj
Last active October 24, 2018 07:02
Show Gist options
  • Save dhwajraj/cf4e796ab67925ad0d89d6a015fc05fc to your computer and use it in GitHub Desktop.
Save dhwajraj/cf4e796ab67925ad0d89d6a015fc05fc to your computer and use it in GitHub Desktop.
Iteratively extract the noisy samples from the training data that hamper the classifier's learning
# Identify likely-noisy (mislabeled) training rows by repeated ensemble voting:
# over 100 different random train/test splits, train 7 diverse classifiers and
# count how often each held-out row's label is contradicted by a strong
# majority of the ensemble. Rows accumulating high counts in
# classifier_votes.txt are candidates for removal from the training data.
#
# NOTE(review): indentation was lost in the original paste; this reconstruction
# assumes the final write-out happens once, after all 100 rounds — confirm.
# Assumes X, y, indices, df and the sklearn imports are defined by the
# surrounding script/notebook.
counter = {}
with open('classifier_votes.txt', 'w') as fout:
    for ll in range(100):
        print(ll)
        # Fresh random split each round (random_state=ll) so every row lands
        # in many different test sets across the 100 iterations.
        X_train, X_test, y_train, y_test, ix_train, ix_test = train_test_split(
            X, y, indices, test_size=0.2, random_state=ll)

        # A deliberately diverse ensemble; hyperparameters kept as tuned.
        classifiers = [
            LogisticRegression(class_weight='balanced'),
            RandomForestClassifier(n_estimators=10, max_depth=4,
                                   random_state=0, max_features=None,
                                   criterion="entropy",
                                   class_weight='balanced'),
            svm.SVC(C=100, gamma=0.001, kernel='rbf',
                    class_weight={0.0: 4.0}),
            DecisionTreeClassifier(class_weight='balanced',
                                   criterion="entropy", max_depth=3,
                                   min_samples_leaf=19, min_samples_split=11),
            BaggingClassifier(),
            linear_model.SGDClassifier(max_iter=100,
                                       class_weight='balanced'),
            svm.LinearSVC(class_weight='balanced'),
        ]
        n_clf = len(classifiers)  # was hard-coded as 7 in the vote loop

        predictions = []
        for clf in classifiers:
            clf.fit(X_train, y_train)
            predictions.append(clf.predict(X_test))

        for i, (_, row) in enumerate(df.loc[ix_test].iterrows()):
            key = "\t".join(str(p) for p in row.values)
            # Labels are 0/1, so the sum over classifiers is the number of
            # "1" votes for this row. (Renamed from `sum`, which shadowed
            # the builtin.)
            votes = sum(predictions[j][i] for j in range(n_clf))
            # A row is suspicious when a strong majority disagrees with its
            # label: labeled 1 but at most 2 of 7 vote 1, or labeled 0 but
            # at least 4 of 7 vote 1.
            if row['result'] == 1.0 and votes <= 2.0:
                counter[key] = counter.get(key, 0) + 1
            if row['result'] == 0.0 and votes >= 4.0:
                counter[key] = counter.get(key, 0) + 1

    # Dump the accumulated per-row noise counts once all rounds finish.
    for key, count in counter.items():
        fout.write(str(count) + "\t" + key + "\n")
    fout.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment