"""Binary-classification pipeline.

Reads a comma-separated training file (``.xls`` extension, CSV content),
rebalances the classes with SVM-based SMOTE oversampling, trains a
hard-voting ensemble (decision tree, SVM, random forest), predicts on a
separate test file, and writes ``Id,class`` predictions to CSV.
"""

import sys

import pandas as pd
from imblearn.over_sampling import SVMSMOTE  # SMOTE(kind='svm') was removed in imblearn >= 0.6
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation removed in 0.20

print(sys.version)

# ---- Load training data (CSV content despite the .xls extension) ----
input_file = "TrainDataBinaryClassification.xls"
df = pd.read_csv(input_file, header=0, sep=",")
print(df.head())

# Drop the non-predictive Id column, list the remaining headers, and fill
# missing values with a sentinel the tree-based models can split on.
df.drop(columns=["Id"], inplace=True)
print(list(df))
df = df.fillna(-999)

# All columns except the last are features; the target column is 'class'.
features = list(df.columns[:-1])
print(features)
y1 = df["class"]
x1 = df[features]

# Rebalance classes with SVM-based SMOTE oversampling
# (fit_sample was renamed fit_resample in imblearn 0.4).
sm = SVMSMOTE()
x, y = sm.fit_resample(x1, y1)

# Hold-out split. NOTE(review): the classifiers below are fit on ALL of the
# resampled data, so this split is currently informational only — confirm
# whether a real held-out evaluation was intended.
pred_train, pred_test, tar_train, tar_test = train_test_split(x, y, test_size=0.3)
print("Shape of test data")

# ---- Train the three base classifiers on the full resampled set ----
classifier = tree.DecisionTreeClassifier(criterion="entropy").fit(x, y)
classifier2 = svm.SVC().fit(x, y)
# Original code built a 350-estimator forest but discarded it, then passed
# 250 to n_jobs (parallelism degree) by mistake; n_estimators is intended.
classifier3 = RandomForestClassifier(n_estimators=350).fit(x, y)

# Hard-voting ensemble; weights slightly favour the random forest.
clf = EnsembleVoteClassifier(
    clfs=[classifier, classifier2, classifier3],
    voting="hard",
    weights=(4, 4, 5),
)
clf.fit(x, y)

# ---- Predict on the unseen test file ----
input_file = "TestDataTwoClass.xls"
df = pd.read_csv(input_file, header=0, sep=",")
df2 = df.copy()  # untouched copy (keeps Id) for the output file
df.drop(columns=["Id"], inplace=True)
df = df.fillna(-999)

predictions = clf.predict(df[features])
print("predictions")
for prediction in predictions:
    print(prediction)

df["class"] = predictions
df2["class"] = predictions
print("count", df["class"])

# ---- Persist Id + predicted class ----
header = ["Id", "class"]
df2.to_csv(
    "Results_Binary_Class_Adulteration_Sep18_2.csv",
    sep=",",
    columns=header,
    index=False,
)