Skip to content

Instantly share code, notes, and snippets.

# http://zetcode.com/python/prettytable/
from prettytable import PrettyTable
# Tabulate the Train_AUC / Test_AUC figures for each vectoriser and model.
x = PrettyTable()
x.field_names = [
    "Vectoriser",
    "Model",
    "Hyper Parameter K or Aplha",
    "Train_AUC",
    "Test_AUC",
]

# Each entry follows the column order declared in ``field_names`` above.
_auc_rows = [
    ["BOW", "Random", "No", 0.90, 0.90],
    ["BOW", "KNN", "81", 0.35, 0.27],
    ["BOW", "Naive Bayes", "0.16", 0.35, 0.27],
    ["BOW", "Logistic Regression", "0.001", 0.17, 0.26],
    ["BOW", "Linear SVM", "0.001", 0.36, 0.25],
    ["TF-IDF", "Random", "No", 0.90, 0.90],
]
for _auc_row in _auc_rows:
    x.add_row(_auc_row)
# Clean every question title: carriage returns, escaped double quotes (\")
# and newlines become spaces, URLs starting with "http" are dropped, and every
# remaining non-word character is replaced by a space.
from tqdm import tqdm
preprocessed_question_title = []
# tqdm wraps the iterable only to print a progress bar
for sentance in tqdm(final_dataset['question_title'].values):
    sentance = sentance.replace('\r', ' ')
    sentance = sentance.replace('\\"', ' ')
    sentance = sentance.replace('\n', ' ')
    # Strip URLs before the generic clean-up so they are removed whole
    # instead of being broken into word fragments.
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = re.sub(r'[^\w]', ' ', sentance)
    # NOTE(review): the visible snippet stops here, so the cleaned title is
    # never stored in preprocessed_question_title -- confirm the remaining
    # steps (presumably an append) against the full notebook.
# Pie chart of how often each host value occurs in the training data.
temp = train_data["host"].value_counts()
_host_columns = {
    'labels': temp.index,   # distinct host values
    'values': temp.values,  # number of rows counted for each host
}
df = pd.DataFrame(_host_columns)
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Distribution of hosts in Training data',
)
# Bar chart of the share (in %) of each category value in the training data.
temp = train_data["category"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
# one bar per distinct category value
x = temp.index,
# turn the raw counts into a percentage of all counted rows
y = (temp / temp.sum())*100,
)
data = [trace]
# NOTE(review): the snippet is cut off below -- neither go.Layout( nor the
# xaxis dict( is closed in the visible source, so the rest of the layout
# (and whatever renders the figure) has to be restored from the full notebook.
layout = go.Layout(
title = "Distribution of categories in training data in % ",
xaxis=dict(
# Histogram of every column listed in ``targets``, one subplot per column on
# a 6x5 grid, all sharing the same bins and axis limits.
fig, axes = plt.subplots(6, 5, figsize=(18, 15))
axes = axes.ravel()  # flatten the 6x5 grid so subplots can be indexed by position
bins = np.linspace(0, 1, 20)  # 20 evenly spaced bin edges covering [0, 1]
for i, col in enumerate(targets):
    ax = axes[i]
    sns.distplot(train_data[col], label=col, kde=False, bins=bins, ax=ax)
    # ax.set_title(col)
    ax.set_xlim([0, 1])
    # NOTE(review): 6079 is presumably the number of training rows -- confirm.
    ax.set_ylim([0, 6079])
# Venn diagrams of the user names that occur in the training and/or test data.
plt.figure(figsize=(23, 13))
# (subplot position, user-name column, panel title)
_venn_panels = [
    (321, 'question_user_name', "Common question_user_name in training and test data"),
    (322, 'answer_user_name', "Common answer_user_name in training and test data"),
]
for _panel_pos, _user_col, _panel_title in _venn_panels:
    plt.subplot(_panel_pos)
    _train_users = set(train_data[_user_col].unique())
    _test_users = set(test_data[_user_col].unique())
    venn2([_train_users, _test_users], set_labels=('Train set', 'Test set'))
    plt.title(_panel_title, fontsize=15)
# Compare the character length of question titles in the training and test data.
train_question_title = train_data['question_title'].str.len()
test_question_title = test_data['question_title'].str.len()

# Side-by-side distributions: training data on the left, test data on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
_length_panels = (
    (train_question_title, ax1, 'blue', 'Distribution for Question Title in Training data'),
    (test_question_title, ax2, 'green', 'Distribution for Question Title in test data'),
)
for _lengths, _axis, _colour, _heading in _length_panels:
    sns.distplot(_lengths, ax=_axis, color=_colour)
    _axis.set_title(_heading)
plt.show()

# Box plot of the training title lengths.
sns.boxplot(y=train_question_title, data=train_data)
plt.show()
# Annotated heatmap of the pairwise correlations between the training columns
# listed in answer_related_target_cols.
plt.figure(figsize=(12, 12))
_answer_target_corr = train_data[answer_related_target_cols].corr()
sns.heatmap(
    data=_answer_target_corr,
    cmap=sns.color_palette("Blues"),
    linewidths=1,
    annot=True,
    square=True,
)
# Number of characters in the text
for _text_col in ("question_title", "question_body", "answer"):
    # str() first so non-string cells are measured by their string form
    # instead of raising inside len().
    train_data[_text_col + "_num_chars"] = train_data[_text_col].apply(
        lambda text: len(str(text))
    )