This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://zetcode.com/python/prettytable/
from prettytable import PrettyTable

# Summary table comparing train/test AUC for each vectoriser / model pairing.
x = PrettyTable()
# NOTE(review): "Aplha" looks like a typo for "Alpha"; the header text is kept
# unchanged in case other code selects this column by its name.
x.field_names = ["Vectoriser", "Model", "Hyper Parameter K or Aplha", "Train_AUC", "Test_AUC"]

# One tuple per table row, in display order:
# (vectoriser, model, hyper-parameter, train AUC, test AUC).
_auc_rows = [
    ("BOW", "Random", "No", 0.90, 0.90),
    ("BOW", "KNN", "81", 0.35, 0.27),
    ("BOW", "Naive Bayes", "0.16", 0.35, 0.27),
    ("BOW", "Logistic Regression", "0.001", 0.17, 0.26),
    ("BOW", "Linear SVM", "0.001", 0.36, 0.25),
    ("TF-IDF", "Random", "No", 0.90, 0.90),
]
for _auc_row in _auc_rows:
    x.add_row(list(_auc_row))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean every question title: carriage returns, newlines and escaped double
# quotes become a space, URLs are dropped, and every remaining non-word
# character is replaced by a space.
from tqdm import tqdm

# Both patterns are loop-invariant, so they are compiled once up front.
_URL_PATTERN = re.compile(r"http\S+")
_NON_WORD_PATTERN = re.compile(r'[^\w]')

preprocessed_question_title = []
# tqdm wraps the iterable so a progress bar is shown while looping
for sentance in tqdm(final_dataset['question_title'].values):
    sentance = sentance.replace('\r', ' ')
    sentance = sentance.replace('\\"', ' ')  # a backslash followed by a double quote
    sentance = sentance.replace('\n', ' ')
    sentance = _URL_PATTERN.sub("", sentance)
    sentance = _NON_WORD_PATTERN.sub(' ', sentance)
    # NOTE(review): the visible part of this loop ends here without storing
    # `sentance` in `preprocessed_question_title` -- confirm the remaining
    # steps of the loop body against the full notebook.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pie chart of how the training rows are distributed over the "host" column.
temp = train_data["host"].value_counts()

# Reshape the counts into two columns: the host value and its row count.
df = pd.DataFrame({
    'labels': temp.index,
    'values': temp.values,
})

# NOTE(review): `iplot` is not a built-in pandas method; presumably it is added
# by a plotting extension such as cufflinks -- confirm it is imported earlier.
df.iplot(kind='pie', labels='labels', values='values', title='Distribution of hosts in Training data')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar trace showing each category's share of the training rows.
temp = train_data["category"].value_counts()

trace = go.Bar(
    x=temp.index,
    # Convert the raw counts into a percentage of all counted rows.
    y=(temp / temp.sum()) * 100,
)
data = [trace]
layout = go.Layout( | |
title = "Distribution of categories in training data in % ", | |
xaxis=dict( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Draw one histogram per target column on a shared 6 x 5 grid of subplots.
fig, axes = plt.subplots(6, 5, figsize=(18, 15))
# Flatten the 2-D grid of axes so it can be indexed with a single counter.
axes = axes.ravel()
# 20 evenly spaced bin edges over [0, 1].
bins = np.linspace(0, 1, 20)
# NOTE(review): the grid has 30 axes, so `targets` must not hold more than 30
# columns or `axes[i]` raises IndexError -- confirm the length of `targets`.
for i, col in enumerate(targets):
    ax = axes[i]
    # NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of
    # `histplot`; it is kept here so the cell still runs on older seaborn.
    sns.distplot(train_data[col], label=col, kde=False, bins=bins, ax=ax)
    # ax.set_title(col)
    # Same axis limits on every subplot so the panels are directly comparable.
    ax.set_xlim([0, 1])
    # NOTE(review): 6079 is presumably the number of training rows -- confirm.
    ax.set_ylim([0, 6079])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Venn diagrams of the user names that appear in the training set, the test
# set, or both -- one diagram for question authors, one for answer authors.
plt.figure(figsize=(23, 13))

# 321 = 3 rows, 2 columns, first cell.
plt.subplot(321)
venn2(
    [set(train_data.question_user_name.unique()), set(test_data.question_user_name.unique())],
    set_labels=('Train set', 'Test set'),
)
plt.title("Common question_user_name in training and test data", fontsize=15)

# 322 = same grid, second cell.
plt.subplot(322)
venn2(
    [set(train_data.answer_user_name.unique()), set(test_data.answer_user_name.unique())],
    set_labels=('Train set', 'Test set'),
)
plt.title("Common answer_user_name in training and test data", fontsize=15)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare the distribution of question-title lengths (in characters) between
# the training data and the test data, side by side.
train_question_title = train_data['question_title'].str.len()
test_question_title = test_data['question_title'].str.len()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: training data.
# NOTE(review): seaborn deprecated `distplot` in v0.11; it is kept here so the
# cell still runs on older seaborn.
sns.distplot(train_question_title, ax=ax1, color='blue')
ax1.set_title('Distribution for Question Title in Training data')

# Right panel: test data.
sns.distplot(test_question_title, ax=ax2, color='green')
ax2.set_title('Distribution for Question Title in test data')

plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Box plot of `train_question_title`, which is not defined in this snippet.
# NOTE(review): presumably it is the per-row title length computed in an
# earlier cell -- confirm that cell has been run first.
sns.boxplot(y=train_question_title, data=train_data)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Annotated heat map of the pairwise correlations between the columns listed
# in `answer_related_target_cols`.
plt.figure(figsize=(12, 12))
# NOTE(review): `sns.color_palette("Blues")` is a short list of colours, not a
# continuous colormap, so the map is presumably drawn in a few discrete bands;
# pass cmap="Blues" instead if a smooth gradient was intended -- confirm.
sns.heatmap(
    data=train_data[answer_related_target_cols].corr(),
    square=True,
    annot=True,
    linewidths=1,
    cmap=sns.color_palette("Blues"),
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of characters in the text
# Adds a "<column>_num_chars" feature for each text column. Every value goes
# through str() first, so non-string cells still yield a length.
for _text_col in ("question_title", "question_body", "answer"):
    train_data[_text_col + "_num_chars"] = train_data[_text_col].apply(lambda x: len(str(x)))
OlderNewer