Skip to content

Instantly share code, notes, and snippets.

View notha99y's full-sized avatar
:octocat:
loading...

Ren Jie notha99y

:octocat:
loading...
View GitHub Profile
# Annotated heatmap of the pairwise column correlations of corr_df
# (DataFrame.corr() uses the Pearson method by default).
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', size=15)

# Diverging palette between hue 10 and hue 220, returned as a matplotlib colormap.
colormap = sns.diverging_palette(h_neg=10, h_pos=220, as_cmap=True)

sns.heatmap(
    data=corr_df.corr(),
    vmax=1.0,
    cmap=colormap,
    annot=True,
    annot_kws={'fontsize': 12},
    linewidths=0.1,
    linecolor='white',
    square=True,
)
plt.show()
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
# Carve a 5% hold-out (test_size=0.05) from train_df; the fixed random_state
# makes the split reproducible.
sample_train, sample_val, gt_train, gt_val = train_test_split(
    train_df,
    train_df['Survived'],
    test_size=0.05,
    random_state=99,
)

# Preprocess the hold-out rows with train=False, then rescale them.
# NOTE(review): `scaler` is defined elsewhere; it is fitted here on the
# hold-out sample itself (fit_transform), not on the training split.
sample_val_processed = simple_preprocessing(sample_val, train=False)
sample_val_processed = scaler.fit_transform(sample_val_processed)

# Hierarchical clustering of the scaled sample using complete linkage.
mergings = linkage(sample_val_processed, method='complete')
def simple_preprocessing(dataframe, train=True):
le = LabelEncoder()
X = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)
X['Age'] = X['Age'].fillna(value=X['Age'].mode()[0])
X['Embarked'] = le.fit_transform(X['Embarked'].fillna(value=X['Embarked'].mode()[0]))
X['Sex'] = np.where(X['Sex'] == 'male', 1, 0)
if train:
X = X.drop(['Survived'], axis=1)
y = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead')
from xgboost import XGBClassifier
# Fit an XGBoost classifier and predict on the validation features.
xgb_clf = XGBClassifier(n_estimators=500, max_depth=12, learning_rate=0.0001)

# argmax over axis 1 turns each row of y_train into a single column index
# (presumably y_train is one-hot encoded -- confirm against where it is built).
xgb_clf.fit(X_train, np.array(y_train).argmax(axis=1))
xgb_y_pred = xgb_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances, largest on top.
# The trailing semicolon is kept from the original (notebook-style output suppression).
(
    pd.Series(data=xgb_clf.feature_importances_, index=X_train.columns)
    .nlargest(12)
    .plot(
        kind='barh',
        title='Feature importance from XGBoost',
        figsize=(10, 10),
    )
    .invert_yaxis()
);
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training data and predict on the validation features.
rf_clf = RandomForestClassifier(max_depth=12, n_estimators=500)
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances, largest on top.
# The trailing semicolon is kept from the original (notebook-style output suppression).
(
    pd.Series(data=rf_clf.feature_importances_, index=X_train.columns)
    .nlargest(12)
    .plot(
        kind='barh',
        title='Feature importance from RandomForest',
        figsize=(10, 10),
    )
    .invert_yaxis()
);
# Multivariate: Age by Embarked, split further by Pclass (no swarm overlay).
quantitative_summarized(
    dataframe=train_df,
    x='Embarked',
    y='Age',
    hue='Pclass',
    palette=c_palette3,
    verbose=False,
    swarm=False,
)

# Bivariate: Age against the target variable Survived.
quantitative_summarized(
    dataframe=train_df,
    x='Survived',
    y='Age',
    palette=c_palette,
    verbose=False,
    swarm=True,
)

# Univariate: Age on its own.
quantitative_summarized(
    dataframe=train_df,
    y='Age',
    palette=c_palette,
    verbose=False,
    swarm=True,
)
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
'''
Helper function that gives a quick summary of quantitative data
Arguments
=========
dataframe: pandas dataframe
x: str. horizontal axis to plot the labels of categorical data (usually the target variable)
y: str. vertical axis to plot the quantitative data
hue: str. if you want to compare it with another categorical variable (usually the target variable if x is another variable)
# Feature variable: Gender -- the 'Sex' column, with hue set to 'Survived'.
categorical_summarized(
    train_df,
    y='Sex',
    hue='Survived',
    palette=c_palette,
)