# train the final Random Forest classifier on the complete training data
fin_clf = RandomForestClassifier(n_estimators=100)
fin_clf.fit(X_count_vec, Y)
# transform the test reviews with the count vectorizer fitted on the training data
X_test_vec = count_vectorizer.transform(df_test['review'])
# get predictions on the test data
preds = fin_clf.predict(X_test_vec)
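# The snippet stops at the raw predictions. A minimal sketch of how they could be scored,
# assuming df_test["Label"] holds the ground-truth conditions (the evaluation call itself
# is not part of the original gist):
from sklearn.metrics import accuracy_score, classification_report
print("Test accuracy:", accuracy_score(df_test["Label"], preds))
print(classification_report(df_test["Label"], preds))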
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def cross_val_multiple_classifiers(X, Y):
    """Compare several classifiers with 4-fold cross-validated accuracy."""
    classifiers = [MultinomialNB(), SGDClassifier(loss="modified_huber"),
                   RandomForestClassifier(n_estimators=100),
                   KNeighborsClassifier(n_neighbors=5)]
    labels = ['Multinomial Naive Bayes', 'SGD Classifier', 'Random Forest', 'KNN']
    clf_cv_mean, clf_cv_std = [], []
    for clf, label in zip(classifiers, labels):
        scores = cross_val_score(clf, X, Y, cv=4, scoring='accuracy')
        # collect the fold statistics; the original snippet declares these lists but never fills them
        clf_cv_mean.append(scores.mean())
        clf_cv_std.append(scores.std())
        print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
    return clf_cv_mean, clf_cv_std
# vectorize the reviews with unigram and bigram counts
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_count_vec = count_vectorizer.fit_transform(X)
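# A sketch of how the helper defined above could be run on the vectorized reviews;
# the call itself is assumed and not shown in the original gist:
cross_val_multiple_classifiers(X_count_vec, Y)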
# shuffle the training dataframe and save the review and label columns as X and Y
df_train = df_train.sample(frac=1)
X = df_train['review']
Y = df_train['Label']
df_train["Label"] = df_train["condition"].str.lower()
df_test["Label"] = df_test["condition"].str.lower()
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def filter_data(reviews):
    """
    Filter the corpus of the training and testing dataframes.
    This function removes punctuation and stop words and stems the remaining words.
    :param reviews: iterable of raw review strings
    :return: list of cleaned review strings
    """
    # body sketched from the docstring; assumes NLTK stop words (nltk.download("stopwords")) and the Porter stemmer
    stop_words, stemmer = set(stopwords.words("english")), PorterStemmer()
    strip_punct = str.maketrans("", "", string.punctuation)
    return [" ".join(stemmer.stem(word) for word in review.lower().translate(strip_punct).split()
                     if word not in stop_words) for review in reviews]
# keep only test samples whose condition also appears in the training data
df_test = df_test[df_test["condition"].isin(df_train["condition"])]
print("Test ", number_of_classes(df_test))
# drop test samples whose condition label starts with a digit
df_test = df_test[filter_labels(df_test["condition"])]
print("Test ", number_of_classes(df_test))
def filter_labels(labels):
    # build a boolean mask that drops labels starting with a digit (these are treated as noisy entries)
    labels = labels.tolist()
    labels_truth = []
    for label in labels:
        if label[0].isdigit():
            labels_truth.append(False)
        else:
            labels_truth.append(True)
    return labels_truth
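# Tiny illustration of filter_labels on hypothetical labels (not taken from the dataset):
# a plain condition name passes, while a label beginning with a digit is flagged for removal.
import pandas as pd
print(filter_labels(pd.Series(["depression", "3 users found this comment helpful."])))  # [True, False]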
# undersample every class with 200 or more samples down to 200
condition_counts = df_train["condition"].value_counts()
condition_over200 = condition_counts[condition_counts >= 200].index
for condition in condition_over200:
    # randomly shuffle the samples of this condition
    condition_samples = df_train[df_train["condition"] == condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)
    # keep only the first 200 shuffled samples
    condition_samples = condition_samples[:200]
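# The loop above discards condition_samples, so the snippet presumably continues beyond this
# excerpt. A possible continuation, assuming the trimmed majority classes are recombined with
# the untouched minority classes (the recombination itself is not shown in the gist):
import pandas as pd
undersampled = [df_train[df_train["condition"] == c].sample(n=200) for c in condition_over200]
minority = df_train[~df_train["condition"].isin(condition_over200)]
df_train = pd.concat(undersampled + [minority]).reset_index(drop=True)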