Skip to content

Instantly share code, notes, and snippets.

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
import plotly.express as px
import numpy as np
import pandas as pd
from mlxtend.plotting import plot_learning_curves
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Load the raw train and test splits of the drug-review data set.
# Fix: the training frame used to be read from the *test* CSV, which made
# df_train and df_test identical and leaked every test row into training.
# NOTE(review): "drugsComTrain_raw.csv" is the companion file name of the
# test split in this data set; confirm it exists in the same Drive folder.
df_train = pd.read_csv("/content/drive/My Drive/data/drugsComTrain_raw.csv")
df_test = pd.read_csv("/content/drive/My Drive/data/drugsComTest_raw.csv")
# Drop every row that has a missing value in any column.
df_train = df_train.dropna()
df_test = df_test.dropna()
def plot_bar_chart(df):
# Build a vertical bar chart of how many rows each condition label has in df.
# analyze the condition labels
counts_series = df.condition.value_counts()
# Wrap the counts in a DataFrame and move the labels out of the index into
# an ordinary column so the plot can refer to them by column name.
counts_df = pd.DataFrame(counts_series)
counts_df.reset_index(level=0, inplace=True)
# number_of_classes is defined outside this excerpt.
number_of_classes(df)
# NOTE(review): x="index" / y="condition" are the column names produced by
# pandas < 2.0; newer pandas names them "condition" / "count" -- verify
# against the installed version.
fig = px.bar(counts_df, x="index", y="condition", orientation='v',
height=400,
# NOTE(review): the px.bar(...) call is cut off here in this excerpt; the
# remaining arguments and the closing parenthesis are not visible.
# Drop rare conditions: retain only the rows whose condition label occurs
# at least 20 times in the training data.
condition_counts = df_train["condition"].value_counts()
index_counts = condition_counts[condition_counts >= 20].index
is_frequent = df_train["condition"].isin(index_counts)
df_train = df_train[is_frequent]
number_of_classes(df_train)
# Undersample: every condition with 200 or more reviews is cut down to 200
# randomly chosen reviews; conditions with fewer reviews are kept in full.
condition_counts = df_train["condition"].value_counts()
condition_over200 = condition_counts[condition_counts >= 200].index
# Start from the rows of the conditions that need no undersampling.
balanced_parts = [df_train[~df_train["condition"].isin(condition_over200)]]
for condition in condition_over200:
    # randomly shuffle the samples of this condition
    condition_samples = df_train[df_train["condition"] == condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)
    # extract only 200
    condition_samples = condition_samples[:200]
    balanced_parts.append(condition_samples)
# Fix: the sampled subsets used to be discarded, so df_train was never
# actually undersampled. Write the balanced rows back; ignore_index avoids
# duplicate index labels coming from the per-condition reset_index above.
df_train = pd.concat(balanced_parts, ignore_index=True)
def filter_labels(labels):
    """Build a boolean keep-mask for condition labels.

    A label is marked ``True`` (keep) unless its first character is a digit,
    in which case it is marked ``False`` (drop).

    :param labels: iterable of label strings, e.g. a pandas Series, a NumPy
        array or a plain list
    :return: list of bool with one entry per label, in input order
    """
    # Slicing with [:1] instead of indexing with [0] keeps an empty label
    # from raising IndexError; "".isdigit() is False, so empty labels are
    # treated as "does not start with a digit" and kept.
    return [not label[:1].isdigit() for label in labels]
# Remove the test rows whose condition label starts with a digit.
keep_mask = filter_labels(df_test["condition"])
df_test = df_test[keep_mask]
print("Test ", number_of_classes(df_test))
# Keep only the test rows whose condition also occurs in the training data.
seen_in_train = df_test["condition"].isin(df_train["condition"])
df_test = df_test[seen_in_train]
number_of_classes(df_test)
import string
def filter_data(reviews):
"""
Filter the corpus of the training and testing data frames.

Per the original author's note, stop words are removed from the corpus
and words are stemmed.
NOTE(review): the function body is not visible in this excerpt, so the
behaviour described above and the return value could not be confirmed.

:param reviews: the review texts to filter (presumably an iterable of
    strings -- verify against the caller)
:return: not visible in this excerpt
"""
# Add a "Label" column holding the lower-cased condition name to the
# training frame first and then to the test frame.
for frame in (df_train, df_test):
    frame["Label"] = frame["condition"].str.lower()