Rafi Atha rafiag

## 01_import.py
# Import basic libs
import pandas as pd
import numpy as np
import warnings

# Read the hate-speech dataset CSV directly from its raw GitHub URL
df = pd.read_csv(
    'https://raw.githubusercontent.com/rafiag/Hate-Speech-Classification/main/hate_speech_dataset.csv'
)

# Report the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

## check_null.py
# Count missing values (per column, assuming df_bank is a DataFrame)
df_bank.isna().sum()

## Multi-Linear Regression_15.py
# Scatter of the residuals against the row index, used to eyeball whether
# their spread around zero stays roughly constant (homoscedasticity).
plt.subplots(figsize=(8, 4))
plt.scatter(x=df_pie.index, y=df_pie.residual, alpha=0.8)
# Dashed zero reference line, drawn two points longer than the index
plt.plot(np.repeat(0, len(df_pie.index)+2), color='darkorange', linestyle='--')

plt.ylabel('Residual', fontsize=14)
plt.xlabel('Week', fontsize=14)
plt.title('Homoscedasticity Assumption', fontsize=16)
plt.show()

## Multi-Linear Regression_14.py
from statsmodels.stats.stattools import durbin_watson

# Durbin-Watson statistic of the model residuals (autocorrelation check)
durbinWatson = durbin_watson(df_pie['residual'])

print('Durbin-Watson:', durbinWatson)
# Cut-offs used by this check: below 1.5 is read as positive autocorrelation,
# above 2.5 as negative; anything in between passes the assumption.
if durbinWatson < 1.5:
    print('Signs of positive autocorrelation', '\n')
    print('Assumption not satisfied')
elif durbinWatson > 2.5:
    print('Signs of negative autocorrelation', '\n')
    print('Assumption not satisfied')
else:
    print('Little to no autocorrelation', '\n')
    print('Assumption satisfied')

## Multi-Linear Regression_13.py
# Pairwise correlation matrix of pie_sales, price and advertising
corr = df_pie[['pie_sales', 'price', 'advertising']].corr()
print('Pearson correlation coefficient matrix of each variables:\n', corr)

# Generate a mask that is True only on the diagonal cells.
# The built-in bool is used because the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
np.fill_diagonal(mask, val=True)

# Initialize matplotlib figure
fig, ax = plt.subplots(figsize=(4, 3))

## Multi-Linear Regression_12.py
from statsmodels.stats.diagnostic import normal_ad

# Run the Anderson-Darling normality test on the residuals and keep the
# second element of its result, which is reported as the p-value
residuals = df_pie['residual']
ad_result = normal_ad(residuals)
p_value = ad_result[1]
print('p-value from the test Anderson-Darling test below 0.05 generally means non-normal:', p_value)

# Draw the distribution of the residuals on a fresh 8x4 figure
plt.subplots(figsize=(8, 4))
plt.title('Distribution of Residuals', fontsize=18)
sns.distplot(residuals)

## Multi-Linear Regression_11.py
# Plotting the observed vs predicted values (scatter only, no fitted line).
# `height` is the current name of the figure-size keyword; the older `size`
# spelling is rejected by recent seaborn releases.
sns.lmplot(x='pie_sales', y='pie_sales_pred', data=df_pie, fit_reg=False, height=5)

# Plotting the diagonal line: it spans from 10 below the smallest to 10 above
# the largest value found in either column, so every point lies within it
line_coords = np.arange(df_pie[['pie_sales', 'pie_sales_pred']].min().min()-10,
                        df_pie[['pie_sales', 'pie_sales_pred']].max().max()+10)
plt.plot(line_coords, line_coords,  # X and y points
         color='darkorange', linestyle='--')

plt.ylabel('Predicted Pie Sales', fontsize=14)

## Multi-Linear Regression_10.py
# Store the model's predictions for X and its residuals as new columns
predicted_sales = olsmod.predict(X)
model_residuals = olsmod.resid
df_pie['pie_sales_pred'] = predicted_sales
df_pie['residual'] = model_residuals
df_pie.head()

## Multi-Linear Regression_9.py
# Show the p-values reported by the fitted model
model_pvalues = olsmod.pvalues
print(model_pvalues)

## Multi-Linear Regression_8.py
# Report the model's F-statistic together with its p-value
f_stat, f_prob = olsmod.fvalue, olsmod.f_pvalue
print('F-statistic:', f_stat)
print('Probability of observing value at least as high as F-statistic:', f_prob)
	# Import basic libs
	import pandas as pd
	import numpy as np
	import warnings

	# Read the hate-speech dataset CSV directly from its raw GitHub URL
	df = pd.read_csv(
	    'https://raw.githubusercontent.com/rafiag/Hate-Speech-Classification/main/hate_speech_dataset.csv'
	)

	# Report the (rows, columns) shape, then preview the first rows
	print(df.shape)
	df.head()
	# Scatter of the residuals against the row index, used to eyeball whether
	# their spread around zero stays roughly constant (homoscedasticity).
	plt.subplots(figsize=(8, 4))
	plt.scatter(x=df_pie.index, y=df_pie.residual, alpha=0.8)
	# Dashed zero reference line, drawn two points longer than the index
	plt.plot(np.repeat(0, len(df_pie.index)+2), color='darkorange', linestyle='--')

	plt.ylabel('Residual', fontsize=14)
	plt.xlabel('Week', fontsize=14)
	plt.title('Homoscedasticity Assumption', fontsize=16)
	plt.show()
	from statsmodels.stats.stattools import durbin_watson

	# Durbin-Watson statistic of the model residuals (autocorrelation check)
	durbinWatson = durbin_watson(df_pie['residual'])

	print('Durbin-Watson:', durbinWatson)
	# Cut-offs used by this check: below 1.5 is read as positive autocorrelation,
	# above 2.5 as negative; anything in between passes the assumption.
	if durbinWatson < 1.5:
	    print('Signs of positive autocorrelation', '\n')
	    print('Assumption not satisfied')
	elif durbinWatson > 2.5:
	    print('Signs of negative autocorrelation', '\n')
	    print('Assumption not satisfied')
	else:
	    print('Little to no autocorrelation', '\n')
	    print('Assumption satisfied')
	# Pairwise correlation matrix of pie_sales, price and advertising
	corr = df_pie[['pie_sales', 'price', 'advertising']].corr()
	print('Pearson correlation coefficient matrix of each variables:\n', corr)

	# Generate a mask that is True only on the diagonal cells.
	# The built-in bool is used because the np.bool alias was deprecated in
	# NumPy 1.20 and removed in 1.24.
	mask = np.zeros_like(corr, dtype=bool)
	np.fill_diagonal(mask, val=True)

	# Initialize matplotlib figure
	fig, ax = plt.subplots(figsize=(4, 3))
	from statsmodels.stats.diagnostic import normal_ad

	# Run the Anderson-Darling normality test on the residuals and keep the
	# second element of its result, which is reported as the p-value
	residuals = df_pie['residual']
	ad_result = normal_ad(residuals)
	p_value = ad_result[1]
	print('p-value from the test Anderson-Darling test below 0.05 generally means non-normal:', p_value)

	# Draw the distribution of the residuals on a fresh 8x4 figure
	plt.subplots(figsize=(8, 4))
	plt.title('Distribution of Residuals', fontsize=18)
	sns.distplot(residuals)
	# Plotting the observed vs predicted values (scatter only, no fitted line).
	# `height` is the current name of the figure-size keyword; the older `size`
	# spelling is rejected by recent seaborn releases.
	sns.lmplot(x='pie_sales', y='pie_sales_pred', data=df_pie, fit_reg=False, height=5)

	# Plotting the diagonal line: it spans from 10 below the smallest to 10 above
	# the largest value found in either column, so every point lies within it
	line_coords = np.arange(df_pie[['pie_sales', 'pie_sales_pred']].min().min()-10,
	                        df_pie[['pie_sales', 'pie_sales_pred']].max().max()+10)
	plt.plot(line_coords, line_coords,  # X and y points
	         color='darkorange', linestyle='--')

	plt.ylabel('Predicted Pie Sales', fontsize=14)
	# Store the model's predictions for X and its residuals as new columns
	predicted_sales = olsmod.predict(X)
	model_residuals = olsmod.resid
	df_pie['pie_sales_pred'] = predicted_sales
	df_pie['residual'] = model_residuals
	df_pie.head()
	# Report the model's F-statistic together with its p-value
	f_stat, f_prob = olsmod.fvalue, olsmod.f_pvalue
	print('F-statistic:', f_stat)
	print('Probability of observing value at least as high as F-statistic:', f_prob)