Ren Jie notha99y

## pearson_cc_heatmap.py
# Diverging palette between hues 10 and 220, returned as a matplotlib colormap.
colormap = sns.diverging_palette(10, 220, as_cmap=True)

# Annotated heatmap of corr_df.corr(); corr_df is defined outside this snippet.
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', size=15)
sns.heatmap(
    corr_df.corr(),
    annot=True,
    annot_kws={'fontsize': 12},
    cmap=colormap,
    linecolor='white',
    linewidths=0.1,
    square=True,
    vmax=1.0,
)
plt.show()

## hierarchical_clustering_titanic.py
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
# Split train_df with a 5% held-out portion (fixed seed 99); only the small
# held-out sample is clustered below.
sample_train, sample_val, gt_train, gt_val = train_test_split(
    train_df,
    train_df['Survived'],
    test_size=0.05,
    random_state=99,
)

# Preprocess the sample, then rescale it with `scaler` (fitted on this sample).
# NOTE(review): the visible body of simple_preprocessing keeps 'Survived' when
# train=False -- confirm the target is meant to be part of the clustered features.
sample_val_processed = scaler.fit_transform(
    simple_preprocessing(sample_val, train=False)
)

# Hierarchical clustering with complete linkage.
mergings = linkage(sample_val_processed, method='complete')

## simple_preprocessing_titanic.py
def simple_preprocessing(dataframe, train=True):
    '''
    Basic cleaning and encoding of a passenger dataframe.

    Arguments
    =========
    dataframe: pandas dataframe. Must contain the columns 'PassengerId',
        'Cabin', 'Name', 'Ticket', 'Age', 'Embarked' and 'Sex'
        (plus 'Survived' when train is True).
    train: bool. When True, 'Survived' is removed from the features and a
        label array of 'Alive'/'Dead' strings is built from it.

    NOTE(review): no return statement is visible in this snippet, although
    callers assign the result of this function -- the tail of the function
    appears to be missing here; confirm against the full source.
    '''
    le = LabelEncoder()
    # Drop identifier / free-text columns; the result is a new frame bound to X.
    X = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)
    # Missing ages are replaced by the most frequent age (first mode).
    X['Age'] = X['Age'].fillna(value=X['Age'].mode()[0])
    # Missing ports are replaced by the most frequent one, then label-encoded.
    X['Embarked'] = le.fit_transform(X['Embarked'].fillna(value=X['Embarked'].mode()[0]))
    # Binary encoding: 1 for 'male', 0 for anything else.
    X['Sex'] = np.where(X['Sex'] == 'male', 1, 0)

    if train:
        # Separate the target from the features.
        X = X.drop(['Survived'], axis=1)
        # Survived == 1 -> 'Alive', every other value -> 'Dead'.
        y = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead')

## xgb_clf.py
from xgboost import XGBClassifier

# XGBoost classifier with 500 estimators, max depth 12 and learning rate 1e-4.
xgb_clf = XGBClassifier(n_estimators=500, max_depth=12, learning_rate=1e-4)

# The training target is the per-row argmax of y_train along axis 1
# (presumably y_train is one-hot encoded -- confirm where it is built).
xgb_clf.fit(X_train, np.array(y_train).argmax(axis=1))
xgb_y_pred = xgb_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances; the y axis is
# inverted so the first (largest) entry is drawn at the top.
(
    pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
    .nlargest(12)
    .plot(
        kind='barh',
        figsize=(10, 10),
        title='Feature importance from XGBoost',
    )
    .invert_yaxis()
)

## rf_clf.py
from sklearn.ensemble import RandomForestClassifier
# Random forest with 500 trees, each limited to depth 12.
rf_clf = RandomForestClassifier(max_depth=12, n_estimators=500)

# Fitted on y_train as-is (no argmax reduction is applied here).
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances; the y axis is
# inverted so the first (largest) entry is drawn at the top.
(
    pd.Series(rf_clf.feature_importances_, index=X_train.columns)
    .nlargest(12)
    .plot(
        kind='barh',
        figsize=(10, 10),
        title='Feature importance from RandomForest',
    )
    .invert_yaxis()
)

## quantitative_analysis_age_embarked_pclass.py
# Multivariate view: 'Age' on the y axis, grouped by 'Embarked' on the x axis
# and split by 'Pclass' through the hue argument.
quantitative_summarized(
    dataframe=train_df,
    x='Embarked',
    y='Age',
    hue='Pclass',
    palette=c_palette3,
    swarm=False,
    verbose=False,
)

## quantitative_analysis_age_survived.py
# Bivariate view: 'Age' on the y axis against the target column 'Survived'
# on the x axis, with the swarm option switched on.
quantitative_summarized(
    dataframe=train_df,
    x='Survived',
    y='Age',
    palette=c_palette,
    swarm=True,
    verbose=False,
)

## quantitative_analysis_age.py
# Univariate view: 'Age' alone on the y axis, with the swarm option switched on.
quantitative_summarized(
    dataframe=train_df,
    y='Age',
    palette=c_palette,
    swarm=True,
    verbose=False,
)

## quantitative_summarized.py
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
    '''
    Helper function that gives a quick summary of quantitative data

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. horizontal axis to plot the labels of categorical data (usually the target variable)
    y: str. vertical axis to plot the quantitative data
    hue: str. if you want to compare it with another categorical variable (usually the target variable if x is another variable)

## categoical_analysis_gender.py
# Feature variable: gender ('Sex' column), split by 'Survived' through hue.
categorical_summarized(
    train_df,
    y='Sex',
    hue='Survived',
    palette=c_palette,
)
	# Diverging palette between hues 10 and 220, returned as a matplotlib colormap.
	colormap = sns.diverging_palette(10, 220, as_cmap=True)

	# Annotated heatmap of corr_df.corr(); corr_df is defined outside this snippet.
	plt.figure(figsize=(14, 12))
	plt.title('Pearson Correlation of Features', size=15)
	sns.heatmap(
	    corr_df.corr(),
	    annot=True,
	    annot_kws={'fontsize': 12},
	    cmap=colormap,
	    linecolor='white',
	    linewidths=0.1,
	    square=True,
	    vmax=1.0,
	)
	plt.show()
	from scipy.cluster.hierarchy import linkage
	from scipy.cluster.hierarchy import dendrogram
	# Split train_df with a 5% held-out portion (fixed seed 99); only the small
	# held-out sample is clustered below.
	sample_train, sample_val, gt_train, gt_val = train_test_split(
	    train_df,
	    train_df['Survived'],
	    test_size=0.05,
	    random_state=99,
	)

	# Preprocess the sample, then rescale it with `scaler` (fitted on this sample).
	sample_val_processed = scaler.fit_transform(
	    simple_preprocessing(sample_val, train=False)
	)

	# Hierarchical clustering with complete linkage.
	mergings = linkage(sample_val_processed, method='complete')
	def simple_preprocessing(dataframe, train=True):
	    '''
	    Basic cleaning and encoding of a passenger dataframe.

	    Arguments
	    =========
	    dataframe: pandas dataframe. Must contain the columns 'PassengerId',
	        'Cabin', 'Name', 'Ticket', 'Age', 'Embarked' and 'Sex'
	        (plus 'Survived' when train is True).
	    train: bool. When True, 'Survived' is removed from the features and a
	        label array of 'Alive'/'Dead' strings is built from it.

	    NOTE(review): no return statement is visible in this snippet -- the
	    tail of the function appears to be missing; confirm against the full
	    source before relying on a return value.
	    '''
	    # The body below was flattened to the same indent as the `def` line,
	    # which is an IndentationError; the nesting is restored here.
	    le = LabelEncoder()
	    # Drop identifier / free-text columns; the result is a new frame bound to X.
	    X = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)
	    # Missing ages are replaced by the most frequent age (first mode).
	    X['Age'] = X['Age'].fillna(value=X['Age'].mode()[0])
	    # Missing ports are replaced by the most frequent one, then label-encoded.
	    X['Embarked'] = le.fit_transform(X['Embarked'].fillna(value=X['Embarked'].mode()[0]))
	    # Binary encoding: 1 for 'male', 0 for anything else.
	    X['Sex'] = np.where(X['Sex'] == 'male', 1, 0)

	    if train:
	        # Separate the target from the features.
	        X = X.drop(['Survived'], axis=1)
	        # Survived == 1 -> 'Alive', every other value -> 'Dead'.
	        y = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead')
	from xgboost import XGBClassifier

	# XGBoost classifier with 500 estimators, max depth 12 and learning rate 1e-4.
	xgb_clf = XGBClassifier(n_estimators=500, max_depth=12, learning_rate=1e-4)

	# The training target is the per-row argmax of y_train along axis 1
	# (presumably y_train is one-hot encoded -- confirm where it is built).
	xgb_clf.fit(X_train, np.array(y_train).argmax(axis=1))
	xgb_y_pred = xgb_clf.predict(X_val)

	# Horizontal bar chart of the 12 largest feature importances; the y axis is
	# inverted so the first (largest) entry is drawn at the top.
	(
	    pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
	    .nlargest(12)
	    .plot(
	        kind='barh',
	        figsize=(10, 10),
	        title='Feature importance from XGBoost',
	    )
	    .invert_yaxis()
	)
	from sklearn.ensemble import RandomForestClassifier
	# Random forest with 500 trees, each limited to depth 12.
	rf_clf = RandomForestClassifier(max_depth=12, n_estimators=500)

	# Fitted on y_train as-is (no argmax reduction is applied here).
	rf_clf.fit(X_train, y_train)
	rf_y_pred = rf_clf.predict(X_val)

	# Horizontal bar chart of the 12 largest feature importances; the y axis is
	# inverted so the first (largest) entry is drawn at the top.
	(
	    pd.Series(rf_clf.feature_importances_, index=X_train.columns)
	    .nlargest(12)
	    .plot(
	        kind='barh',
	        figsize=(10, 10),
	        title='Feature importance from RandomForest',
	    )
	    .invert_yaxis()
	)
	# Multivariate view: 'Age' grouped by 'Embarked' and split by 'Pclass' (hue).
	quantitative_summarized(
	    dataframe=train_df,
	    x='Embarked',
	    y='Age',
	    hue='Pclass',
	    palette=c_palette3,
	    swarm=False,
	    verbose=False,
	)
	# Bivariate view: 'Age' against the target column 'Survived'.
	quantitative_summarized(
	    dataframe=train_df,
	    x='Survived',
	    y='Age',
	    palette=c_palette,
	    swarm=True,
	    verbose=False,
	)
	# Univariate view: 'Age' alone.
	quantitative_summarized(
	    dataframe=train_df,
	    y='Age',
	    palette=c_palette,
	    swarm=True,
	    verbose=False,
	)
	def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
	'''
	Helper function that gives a quick summary of quantitative data

	Arguments
	=========
	dataframe: pandas dataframe
	x: str. horizontal axis to plot the labels of categorical data (usually the target variable)
	y: str. vertical axis to plot the quantitative data
	hue: str. if you want to compare it with another categorical variable (usually the target variable if x is another variable)
	# Feature variable: gender ('Sex' column), split by 'Survived' through hue.
	categorical_summarized(
	    train_df,
	    y='Sex',
	    hue='Survived',
	    palette=c_palette,
	)