Last active
October 24, 2018 07:02
-
-
Save dhwajraj/cf4e796ab67925ad0d89d6a015fc05fc to your computer and use it in GitHub Desktop.
Iteratively extract the noisy samples from the training data which hamper classifier learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Identify likely-mislabeled ("noisy") training samples by repeated ensemble voting.
#
# Over 100 random train/test splits, train 7 diverse classifiers and let them
# vote on each held-out sample. A sample whose true label strongly disagrees
# with the majority vote (positive but <=2 votes, or negative but >=4 votes)
# gets one strike; total strikes per sample are written to classifier_votes.txt
# as "<count>\t<tab-joined row values>".
#
# NOTE(review): X, y, indices, df, train_test_split and the sklearn estimator
# classes/modules (svm, linear_model, ...) are assumed to be imported earlier
# in the file — confirm against the full script.
counter = {}
with open('classifier_votes.txt', 'w') as fout:
    for split_seed in range(100):
        print(split_seed)
        X_train, X_test, y_train, y_test, ix_train, ix_test = train_test_split(
            X, y, indices, test_size=0.2, random_state=split_seed)
        # Fresh ensemble per split so estimators never carry state across splits.
        classifiers = [
            LogisticRegression(class_weight='balanced'),
            RandomForestClassifier(n_estimators=10, max_depth=4, random_state=0,
                                   max_features=None, criterion="entropy",
                                   class_weight='balanced'),
            svm.SVC(C=100, gamma=0.001, kernel='rbf', class_weight={0.0: 4.0}),
            DecisionTreeClassifier(class_weight='balanced', criterion="entropy",
                                   max_depth=3, min_samples_leaf=19,
                                   min_samples_split=11),
            BaggingClassifier(),
            linear_model.SGDClassifier(max_iter=100, class_weight='balanced'),
            svm.LinearSVC(class_weight='balanced'),
        ]
        predictions = []
        for clf in classifiers:
            clf.fit(X_train, y_train)
            predictions.append(clf.predict(X_test))
        # Walk the held-out rows in test order; position i lines up with the
        # i-th column of every prediction array.
        for i, (_, row) in enumerate(df.loc[ix_test].iterrows()):
            key = "\t".join(str(p) for p in row.values)
            # Total positive votes out of len(classifiers) (was a hard-coded 7;
            # also avoid shadowing the builtin `sum`).
            votes = sum(pred[i] for pred in predictions)
            if row['result'] == 1.0 and votes <= 2.0:  # positive, but 0-2 votes
                counter[key] = counter.get(key, 0) + 1
            if row['result'] == 0.0 and votes >= 4.0:  # negative, but 4+ votes
                counter[key] = counter.get(key, 0) + 1
    # One line per flagged sample: strike count, then the row values.
    for key in counter:
        fout.write(str(counter[key]) + "\t" + key + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment