# train the final Random Forest classifier on the complete training data
fin_clf = RandomForestClassifier(n_estimators=100)
fin_clf.fit(X_count_vec, Y)
# transform the test reviews with the count vectorizer fitted on the training data
X_test_vec = count_vectorizer.transform(df_test['review'])
# get predictions on the test data
preds = fin_clf.predict(X_test_vec)
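# The snippet stops at the raw predictions. A minimal sketch of how they could be scored,
# assuming df_test["Label"] holds the ground-truth conditions (the evaluation call itself
# is not part of the original gist):
from sklearn.metrics import accuracy_score, classification_report
print("Test accuracy:", accuracy_score(df_test["Label"], preds))
print(classification_report(df_test["Label"], preds))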
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def cross_val_multiple_classifiers(X, Y):
    """Compare several classifiers with 4-fold cross-validated accuracy."""
    classifiers = [MultinomialNB(), SGDClassifier(loss="modified_huber"),
                   RandomForestClassifier(n_estimators=100),
                   KNeighborsClassifier(n_neighbors=5)]
    labels = ['Multinomial Naive Bayes', 'SGD Classifier', 'Random Forest', 'KNN']
    clf_cv_mean, clf_cv_std = [], []
    for clf, label in zip(classifiers, labels):
        scores = cross_val_score(clf, X, Y, cv=4, scoring='accuracy')
        # collect the fold statistics; the original snippet declares these lists but never fills them
        clf_cv_mean.append(scores.mean())
        clf_cv_std.append(scores.std())
        print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
    return clf_cv_mean, clf_cv_std
# vectorize the reviews with unigram and bigram counts
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_count_vec = count_vectorizer.fit_transform(X)
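# A sketch of how the helper defined above could be run on the vectorized reviews;
# the call itself is assumed and not shown in the original gist:
cross_val_multiple_classifiers(X_count_vec, Y)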
# shuffle the training dataframe and save the review and label columns as X and Y
df_train = df_train.sample(frac=1)
X = df_train['review']
Y = df_train['Label']
df_train["Label"] = df_train["condition"].str.lower()
df_test["Label"] = df_test["condition"].str.lower()
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def filter_data(reviews):
    """
    Filter the corpus of the training and testing dataframes.
    This function removes punctuation and stop words and stems the remaining words.
    :param reviews: iterable of raw review strings
    :return: list of cleaned review strings
    """
    # body sketched from the docstring; assumes NLTK stop words (nltk.download("stopwords")) and the Porter stemmer
    stop_words, stemmer = set(stopwords.words("english")), PorterStemmer()
    strip_punct = str.maketrans("", "", string.punctuation)
    return [" ".join(stemmer.stem(word) for word in review.lower().translate(strip_punct).split()
                     if word not in stop_words) for review in reviews]
# keep only test samples whose condition also appears in the training data
df_test = df_test[df_test["condition"].isin(df_train["condition"])]
print("Test ", number_of_classes(df_test))
# drop test samples whose condition label starts with a digit
df_test = df_test[filter_labels(df_test["condition"])]
print("Test ", number_of_classes(df_test))
def filter_labels(labels):
    # build a boolean mask that drops labels starting with a digit (these are treated as noisy entries)
    labels = labels.tolist()
    labels_truth = []
    for label in labels:
        if label[0].isdigit():
            labels_truth.append(False)
        else:
            labels_truth.append(True)
    return labels_truth
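# Tiny illustration of filter_labels on hypothetical labels (not taken from the dataset):
# a plain condition name passes, while a label beginning with a digit is flagged for removal.
import pandas as pd
print(filter_labels(pd.Series(["depression", "3 users found this comment helpful."])))  # [True, False]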
# undersample every class with 200 or more samples down to 200
condition_counts = df_train["condition"].value_counts()
condition_over200 = condition_counts[condition_counts >= 200].index
for condition in condition_over200:
    # randomly shuffle the samples of this condition
    condition_samples = df_train[df_train["condition"] == condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)
    # keep only the first 200 shuffled samples
    condition_samples = condition_samples[:200]
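# The loop above discards condition_samples, so the snippet presumably continues beyond this
# excerpt. A possible continuation, assuming the trimmed majority classes are recombined with
# the untouched minority classes (the recombination itself is not shown in the gist):
import pandas as pd
undersampled = [df_train[df_train["condition"] == c].sample(n=200) for c in condition_over200]
minority = df_train[~df_train["condition"].isin(condition_over200)]
df_train = pd.concat(undersampled + [minority]).reset_index(drop=True)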