RealDyllon/0.0_1115_samples_intro.md

## 0.0_1115_samples_intro.md

      
    Raw
  

              0.0_1115_samples_intro.md
            
          
    CZ1115 Code Snippets and Samples

Made with ♥️ by Dyllon

  
## 1.1_imports.py
# Essential Imports
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix


## 1.2_read_data.py
# Load the raw CSV into a pandas DataFrame named `dataset`
# NOTE(review): the file name is spelled 'dataest.csv' - confirm it is not a typo for 'dataset.csv'
csv_path = './dataest.csv' # a relative path, resolved against the current working directory
dataset = pd.read_csv(csv_path)


## 1.3_description.py
# This file contains some useful commands to get basic info about the dataframe

# head() only returns the rows; print it so the preview is visible even when
# this is not the last expression of a notebook cell
print(dataset.head()) # see first n rows (def: 5)
dataset.info() # prints column names, non-null counts and data types

# Get the type and dims:
print("Data type : ", type(dataset))
print("Data dims : ", dataset.shape)

## 2.1_normal_distro.py
# This file covers how to visually compare the distributions of our variables

# Draw the Boxplots of all variables
f = plt.figure(figsize=(16, 8))
sb.boxplot(data = dataset, orient = "h")

# Draw the histplot of all vars
f = plt.figure(figsize=(16, 8))
sb.histplot(data = dataset)

# Draw the violin plot of all vars
f = plt.figure(figsize=(16, 10))
sb.violinplot(data = dataset, orient = "h")

# Finally draw the distributions of all variables, one row of three plots per column.
# The grid is sized from the dataframe so any number of columns fits
# (a fixed 4-row grid raises IndexError as soon as there is a fifth column).
n_vars = len(dataset.columns)
f, axes = plt.subplots(n_vars, 3, figsize=(18, 6 * n_vars), squeeze=False) # squeeze=False keeps axes 2-D even for one row

for count, var in enumerate(dataset):
    sb.boxplot(data = dataset[var], orient = "h", ax = axes[count,0])
    sb.histplot(data = dataset[var], ax = axes[count,1])
    sb.violinplot(data = dataset[var], orient = "h", ax = axes[count,2])

## 2.2_correlation_matrix_heatmap.py
# Heatmap of the Correlation Matrix
# numeric_only=True skips non-numeric columns explicitly; pandas >= 2.0 no longer
# drops them silently and raises instead
f = plt.figure(figsize=(12, 12))
sb.heatmap(dataset.corr(numeric_only = True), vmin = -1, vmax = 1, annot = True, fmt = ".2f")

## 3.1_uni_var_split.py
# Uni-variate linear regression: hold part of the data back for testing

# REMEMBER TO CHANGE TEST SIZE!! (random_state is pinned so the split is repeatable)
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)

dataset_train.describe()


# Draw the distributions of the training-set variables, one plot type per panel
f, axes = plt.subplots(1, 3, figsize=(24, 8))
box_ax, hist_ax, violin_ax = axes
sb.boxplot(data = dataset_train, orient = "h", ax = box_ax)
sb.histplot(data = dataset_train, ax = hist_ax)
sb.violinplot(data = dataset_train, orient = "h", ax = violin_ax)

## 3.2_uni_var_linreg_create.py
# One LinearRegression (imported from sklearn above) per predictor;
# each model predicts Length from a single variable
linreg_height, linreg_weight, linreg_diameter = (LinearRegression() for _ in range(3))

# Double brackets keep every column as a one-column DataFrame
length_train, height_train, weight_train, diameter_train = (
    dataset_train[[column]] for column in ("Length", "Height", "Weight", "Diameter"))

length_test, height_test, weight_test, diameter_test = (
    dataset_test[[column]] for column in ("Length", "Height", "Weight", "Diameter"))

print("~Linregs - Train")
fits = (
    ("Length vs Height:", linreg_height, height_train),
    ("Length vs Weight:", linreg_weight, weight_train),
    ("Length vs Diameter:", linreg_diameter, diameter_train),
)
for position, (heading, estimator, predictor_train) in enumerate(fits):
    if position:
        print() # blank line between reports, none after the last one
    print(heading)
    estimator.fit(predictor_train, length_train)
    print('Intercept \t: b = ', estimator.intercept_)
    print('Coefficients \t: a = ', estimator.coef_)

## 3.3_uni_var_linreg_plot.py
# Plot each uni-variate regression line over its training data
plots = [ ## change the content here!
    ['height', height_train, linreg_height],
    ['weight', weight_train, linreg_weight],
    ['diameter', diameter_train, linreg_diameter]
]

target_train = length_train # target var for code readability

for var_name, var_train, linreg in plots:
    # Fit before drawing so the line always comes from the data plotted here
    linreg.fit(var_train, target_train)

    # Formula for the Regression line
    regline_x = var_train
    regline_y = linreg.intercept_ + linreg.coef_ * var_train

    # Plot the Linear Regression line
    f = plt.figure(figsize=(16, 8))
    plt.scatter(var_train, target_train)
    plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
    plt.show()

    # Label the report with the variable being plotted, not always "Height"
    print("Length vs " + var_name.capitalize() + ":")
    print('Intercept \t: b = ', linreg.intercept_)
    print('Coefficients \t: a = ', linreg.coef_)

## 3.4_uni_var_predict.py
# Predict Length on the train split with each uni-variate model
length_train_pred_from_height = linreg_height.predict(height_train)
length_train_pred_from_weight = linreg_weight.predict(weight_train)
length_train_pred_from_diameter = linreg_diameter.predict(diameter_train)

# ... and on the test split
length_test_pred_from_height = linreg_height.predict(height_test)
length_test_pred_from_weight = linreg_weight.predict(weight_test)
length_test_pred_from_diameter = linreg_diameter.predict(diameter_test)


# Each entry: var_name, var_train, var_test, linreg, train_pred, test_pred
models = [
    ['height', height_train, height_test, linreg_height, length_train_pred_from_height, length_test_pred_from_height],
    ['weight', weight_train, weight_test, linreg_weight, length_train_pred_from_weight, length_test_pred_from_weight],
    ['diameter', diameter_train, diameter_test, linreg_diameter, length_train_pred_from_diameter, length_test_pred_from_diameter],

]

for var_name, var_train, var_test, linreg, train_pred, test_pred in models:
    # The same report is produced twice: first for the train split, then for the test split
    splits = (
        (" TRAIN ~~", var_train, length_train, train_pred),
        (" TEST ~~", var_test, length_test, test_pred),
    )
    for suffix, predictor, actual, predicted in splits:
        print()
        print("~~ Length vs " + var_name + suffix)

        # Actual points (default colour) against the model's predictions (red)
        f = plt.figure(figsize=(16, 8))
        plt.scatter(predictor, actual)
        plt.scatter(predictor, predicted, color = "r")
        plt.show()

        # Explained Variance (R^2)
        print("Explained Variance (R^2) \t:", linreg.score(predictor, actual))

        # Mean Squared Error (MSE) and its square root
        mse = mean_squared_error(actual, predicted)
        print("Mean Squared Error (MSE) \t:", mse)
        print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mse))

## 4.1_multi_var_split_and_fit.py
# Response (Length) and the three predictors of the single multi-variate model
y = dataset[['Length']]
X = dataset[['Height', 'Weight', 'Diameter']]

# Random Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# Check the sample sizes
for split_label, features, response in (
        ("Train Set :", X_train, y_train),
        ("Test Set  :", X_test, y_test)):
    print(split_label, features.shape, response.shape)


# Create the Linear Regression object and train it in one step (fit returns the estimator)
linreg = LinearRegression().fit(X_train, y_train)

# linreg coeffs
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

## 4.2_multi_var_linreg_predict.py
# Predict the response (Length) for both splits with the multi-variate model
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Plot the Predictions vs the True values, one panel per split
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = (
    (axes[0], y_train, y_train_pred, "blue", "(Train)"),
    (axes[1], y_test, y_test_pred, "green", "(Test)"),
)
for ax, actual, predicted, colour, split_tag in panels:
    ax.scatter(actual, predicted, color = colour)
    ax.plot(actual, actual, 'w-', linewidth = 1) # identity line: a perfect prediction lies on it
    ax.set_xlabel("True values of the Response Variable " + split_tag)
    ax.set_ylabel("Predicted values of the Response Variable " + split_tag)
plt.show()

## 4.3_multi_var_r2_mse.py
# Goodness of fit of the multi-variate model: R^2 and MSE on both splits,
# so train and test performance can be compared like for like
print("Explained Variance (R^2) on Train Set \t:", linreg.score(X_train, y_train))
print("Explained Variance (R^2) on Test Set \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) on Train Set \t:", mean_squared_error(y_train, y_train_pred))
print("Mean Squared Error (MSE) on Test Set \t:", mean_squared_error(y_test, y_test_pred))

## 5.1_visualizations.py
# Check the `catplot` for `Target`, to visually understand the distribution.
sb.catplot(y = 'Target', data = dataset, kind = "count")

# Print the ratio `Y` : `N` for `Target` to check the imbalance in the classes.
# value_counts() is ordered by frequency, not by label, so the counts are looked up
# by label; unpacking by position would swap them whenever N outnumbers Y.
# NOTE(review): assumes the class labels are the strings 'Y' and 'N' - adjust to your data
target_counts = dataset['Target'].value_counts()
countY = target_counts.get('Y', 0)
countN = target_counts.get('N', 0)
print("Ratio of classes is Y : N = ", countY, ":", countN)

# visualize their mutual relationship. (Boxplot)
f = plt.figure(figsize=(16, 8))
sb.boxplot(x = 'Predictor', y = 'Target', data = dataset)

# swarmplot to see where to make the cut-throughs
f = plt.figure(figsize=(16, 8))
sb.swarmplot(x = 'Predictor', y = 'Target', data = dataset)

# instead of swarmplot we can stripplot (same dataframe and columns as the plots above)
f = plt.figure(figsize=(16, 8))
sb.stripplot(x = 'Predictor', y = 'Target', data = dataset)


## 5.2_dectree_classifier.py
# Uni-variate decision tree classifier: predict `Target` from `Predictor`
dectree = DecisionTreeClassifier(max_depth = 2) # change max-depth as you fancy

# Extract Response and Predictors from `dataset`, the dataframe loaded earlier
# (`houseData` is never defined in these snippets)
y = pd.DataFrame(dataset['Target'])
X = pd.DataFrame(dataset['Predictor'])

# Split the Dataset into random Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # or whatever percent / value is required

# Check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

# Fit the tree
dectree.fit(X_train, y_train)


## 5.3_gini_vis_rep_of_dectree.py
# Plot the trained Decision Tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12,12))
# feature_names is passed as a plain list: some sklearn releases reject a pandas Index here.
# NOTE(review): class_names assumes the sorted class labels are N then Y - confirm against dectree.classes_
plot_tree(dectree, filled=True, rounded=True,
          feature_names=list(X_train.columns),
          class_names=["N","Y"])

## 5.4_goodness_of_fix_confusion_matrix.py
# Predict the Response corresponding to Predictors on both splits.
# y_test_pred must come from the decision tree; without this line it would still
# hold the regression predictions left over from the linear-regression section.
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Plot the two-way Confusion Matrix (Train)
sb.heatmap(confusion_matrix(y_train, y_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18})

# Print the Classification Accuracy
print("Test Data")
print("Accuracy  :\t", dectree.score(X_test, y_test))
print()

# Print the Accuracy Measures from the Confusion Matrix
# (rows are the true class, columns the predicted class)
cmTest = confusion_matrix(y_test, y_test_pred)
tpTest = cmTest[1, 1] # True Positives : Y (1) predicted Y (1)
fpTest = cmTest[0, 1] # False Positives : N (0) predicted Y (1)
tnTest = cmTest[0, 0] # True Negatives : N (0) predicted N (0)
fnTest = cmTest[1, 0] # False Negatives : Y (1) predicted N (0)

print("TPR Test :\t", (tpTest/(tpTest + fnTest)))
print("TNR Test :\t", (tnTest/(tnTest + fpTest)))
print()

print("FPR Test :\t", (fpTest/(fpTest + tnTest)))
print("FNR Test :\t", (fnTest/(fnTest + tpTest)))

## 5.5_conculsion_from_lab.md

      
    Raw
  

              5.5_conculsion_from_lab.md
            
          
    HEADS UP!! THIS IS COPIED AND PASTED FROM THE LAB!

Problem 3 : Comparing the Uni-Variate Decision Tree Models
Compare and contrast the four models in terms of Classification Accuracy, TPR and FPR on both Train and Test Data.
CentralAir vs SalePrice has the highest Training Accuracy out of the four models.
CentralAir vs GrLivArea has the highest Test Accuracy out of the four models.
However, the train and test accuracy for all four models are pretty high and quite close.
So, it is not easy to justify which model is better just using their classification accuracy.
However, if we look at the True Positive Rate (TPR) and False Positive Rate (FPR) of the four models, we find that
YearBuilt yields a TPR of 1 (best-case) but an FPR of 1 (worst-case) on both Train and Test data. Really bad for prediction.
GrLivArea yields a TPR of close to 1 (best-case) but an FPR of close to 1 (worst-case) on Train and Test set, not good either.
SalePrice and OverallQual yield the best TPR (high) vs FPR (not-as-high) trade-off in case of both Train and Test data.
Overall, the predictor OverallQual is the best amongst the four in predicting CentralAir, while SalePrice is a close second as per the models above. YearBuilt is definitely the worst predictor out of these four variables, with GrLivArea not doing so well either, given the models above.
Did you notice? : Go back and check again all accuracy figures for the four models. I am pretty sure you did not get the exact same values as I did. This is due to the random selection of Train-Test sets. In fact, if you run the above cells again, you will get a different set of accuracy figures. If that is so, can we really be confident that OverallQual will always be the best variable to predict CentralAir? Think about it. ;-)
	# Essential Imports
	import numpy as np
	import pandas as pd
	import seaborn as sb
	import matplotlib.pyplot as plt # we only need pyplot
	sb.set() # set the default Seaborn style for graphics

	# sklearn imports
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.metrics import confusion_matrix
	# Load the raw CSV into a pandas DataFrame named `dataset`
	# NOTE(review): the file name is spelled 'dataest.csv' - confirm it is not a typo for 'dataset.csv'
	csv_path = './dataest.csv' # a relative path, resolved against the current working directory
	dataset = pd.read_csv(csv_path)
	# This file contains some useful commands to get basic info about the dataframe

	# head() only returns the rows; print it so the preview is visible even when
	# this is not the last expression of a notebook cell
	print(dataset.head()) # see first n rows (def: 5)
	dataset.info() # prints column names, non-null counts and data types

	# Get the type and dims:
	print("Data type : ", type(dataset))
	print("Data dims : ", dataset.shape)
	# This file covers how to visually compare the distributions of our variables

	# Draw the Boxplots of all variables
	f = plt.figure(figsize=(16, 8))
	sb.boxplot(data = dataset, orient = "h")

	# Draw the histplot of all vars
	f = plt.figure(figsize=(16, 8))
	sb.histplot(data = dataset)

	# Draw the violin plot of all vars
	f = plt.figure(figsize=(16, 10))
	sb.violinplot(data = dataset, orient = "h")

	# Finally draw the distributions of all variables, one row of three plots per column.
	# The grid is sized from the dataframe so any number of columns fits
	# (a fixed 4-row grid raises IndexError as soon as there is a fifth column).
	n_vars = len(dataset.columns)
	f, axes = plt.subplots(n_vars, 3, figsize=(18, 6 * n_vars), squeeze=False) # squeeze=False keeps axes 2-D even for one row

	# The loop body is indented again; it had been flattened to the level of the `for`
	for count, var in enumerate(dataset):
	    sb.boxplot(data = dataset[var], orient = "h", ax = axes[count,0])
	    sb.histplot(data = dataset[var], ax = axes[count,1])
	    sb.violinplot(data = dataset[var], orient = "h", ax = axes[count,2])
	# Heatmap of the Correlation Matrix
	# numeric_only=True skips non-numeric columns explicitly; pandas >= 2.0 no longer
	# drops them silently and raises instead
	f = plt.figure(figsize=(12, 12))
	sb.heatmap(dataset.corr(numeric_only = True), vmin = -1, vmax = 1, annot = True, fmt = ".2f")
	# Uni-variate linear regression: hold part of the data back for testing

	# REMEMBER TO CHANGE TEST SIZE!! (random_state is pinned so the split is repeatable)
	dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)

	dataset_train.describe()


	# Draw the distributions of the training-set variables, one plot type per panel
	f, axes = plt.subplots(1, 3, figsize=(24, 8))
	box_ax, hist_ax, violin_ax = axes
	sb.boxplot(data = dataset_train, orient = "h", ax = box_ax)
	sb.histplot(data = dataset_train, ax = hist_ax)
	sb.violinplot(data = dataset_train, orient = "h", ax = violin_ax)
	# One LinearRegression (imported from sklearn above) per predictor;
	# each model predicts Length from a single variable
	linreg_height, linreg_weight, linreg_diameter = (LinearRegression() for _ in range(3))

	# Double brackets keep every column as a one-column DataFrame
	length_train, height_train, weight_train, diameter_train = (
	    dataset_train[[column]] for column in ("Length", "Height", "Weight", "Diameter"))

	length_test, height_test, weight_test, diameter_test = (
	    dataset_test[[column]] for column in ("Length", "Height", "Weight", "Diameter"))

	print("~Linregs - Train")
	fits = (
	    ("Length vs Height:", linreg_height, height_train),
	    ("Length vs Weight:", linreg_weight, weight_train),
	    ("Length vs Diameter:", linreg_diameter, diameter_train),
	)
	for position, (heading, estimator, predictor_train) in enumerate(fits):
	    if position:
	        print() # blank line between reports, none after the last one
	    print(heading)
	    estimator.fit(predictor_train, length_train)
	    print('Intercept \t: b = ', estimator.intercept_)
	    print('Coefficients \t: a = ', estimator.coef_)
	# Plot each uni-variate regression line over its training data
	plots = [ ## change the content here!
	    ['height', height_train, linreg_height],
	    ['weight', weight_train, linreg_weight],
	    ['diameter', diameter_train, linreg_diameter]
	]

	target_train = length_train # target var for code readability

	# The loop body is indented again; it had been flattened to the level of the `for`
	for var_name, var_train, linreg in plots:
	    # Fit before drawing so the line always comes from the data plotted here
	    linreg.fit(var_train, target_train)

	    # Formula for the Regression line
	    regline_x = var_train
	    regline_y = linreg.intercept_ + linreg.coef_ * var_train

	    # Plot the Linear Regression line
	    f = plt.figure(figsize=(16, 8))
	    plt.scatter(var_train, target_train)
	    plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
	    plt.show()

	    # Label the report with the variable being plotted, not always "Height"
	    print("Length vs " + var_name.capitalize() + ":")
	    print('Intercept \t: b = ', linreg.intercept_)
	    print('Coefficients \t: a = ', linreg.coef_)
	# Predict Length on the train split with each uni-variate model
	length_train_pred_from_height = linreg_height.predict(height_train)
	length_train_pred_from_weight = linreg_weight.predict(weight_train)
	length_train_pred_from_diameter = linreg_diameter.predict(diameter_train)

	# ... and on the test split
	length_test_pred_from_height = linreg_height.predict(height_test)
	length_test_pred_from_weight = linreg_weight.predict(weight_test)
	length_test_pred_from_diameter = linreg_diameter.predict(diameter_test)


	# Each entry: var_name, var_train, var_test, linreg, train_pred, test_pred
	models = [
	    ['height', height_train, height_test, linreg_height, length_train_pred_from_height, length_test_pred_from_height],
	    ['weight', weight_train, weight_test, linreg_weight, length_train_pred_from_weight, length_test_pred_from_weight],
	    ['diameter', diameter_train, diameter_test, linreg_diameter, length_train_pred_from_diameter, length_test_pred_from_diameter],

	]

	# The loop body is indented again; it had been flattened to the level of the `for`
	for var_name, var_train, var_test, linreg, train_pred, test_pred in models:
	    # The same report is produced twice: first for the train split, then for the test split
	    splits = (
	        (" TRAIN ~~", var_train, length_train, train_pred),
	        (" TEST ~~", var_test, length_test, test_pred),
	    )
	    for suffix, predictor, actual, predicted in splits:
	        print()
	        print("~~ Length vs " + var_name + suffix)

	        # Actual points (default colour) against the model's predictions (red)
	        f = plt.figure(figsize=(16, 8))
	        plt.scatter(predictor, actual)
	        plt.scatter(predictor, predicted, color = "r")
	        plt.show()

	        # Explained Variance (R^2)
	        print("Explained Variance (R^2) \t:", linreg.score(predictor, actual))

	        # Mean Squared Error (MSE) and its square root
	        mse = mean_squared_error(actual, predicted)
	        print("Mean Squared Error (MSE) \t:", mse)
	        print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mse))
	# Response (Length) and the three predictors of the single multi-variate model
	y = dataset[['Length']]
	X = dataset[['Height', 'Weight', 'Diameter']]

	# Random Train / Test split
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

	# Check the sample sizes
	for split_label, features, response in (
	        ("Train Set :", X_train, y_train),
	        ("Test Set :", X_test, y_test)):
	    print(split_label, features.shape, response.shape)


	# Create the Linear Regression object and train it in one step (fit returns the estimator)
	linreg = LinearRegression().fit(X_train, y_train)

	# linreg coeffs
	print('Intercept \t: b = ', linreg.intercept_)
	print('Coefficients \t: a = ', linreg.coef_)