Skip to content

Instantly share code, notes, and snippets.

View rafiag's full-sized avatar

Rafi Atha rafiag

View GitHub Profile
@rafiag
rafiag / 01_import.py
Created March 13, 2021 07:52
Basics of Text Pre-Processing
# Import basic libs
import pandas as pd
import numpy as np
import warnings
# Fetch the hate-speech dataset straight from the GitHub repository,
# report its dimensions, then preview the first rows.
dataset_url = (
    'https://raw.githubusercontent.com/rafiag/Hate-Speech-Classification/main/hate_speech_dataset.csv'
)
df = pd.read_csv(dataset_url)
print(df.shape)
df.head()
@rafiag
rafiag / check_null.py
Last active May 18, 2023 03:48
Building Classification Model with Python
# Count the missing values in every column of df_bank
# (isna is the pandas alias of isnull, so the result is the same).
df_bank.isna().sum()
# Plotting the residuals against the row index of df_pie. For the
# homoscedasticity assumption the points should scatter around the zero
# line with a roughly constant spread.
plt.subplots(figsize=(8, 4))
plt.scatter(x=df_pie.index, y=df_pie.residual, alpha=0.8)
# Dashed reference line at residual == 0, two points longer than the data
plt.plot(np.repeat(0, len(df_pie.index)+2), color='darkorange', linestyle='--')
plt.ylabel('Residual', fontsize=14)
plt.xlabel('Week', fontsize=14)
# Title spelling corrected (was 'Homescedasticity')
plt.title('Homoscedasticity Assumption', fontsize=16)
plt.show()
from statsmodels.stats.stattools import durbin_watson

# Compute the Durbin-Watson statistic on the stored residuals and flag
# autocorrelation when it falls outside the 1.5 - 2.5 band used here.
durbinWatson = durbin_watson(df_pie['residual'])
print('Durbin-Watson:', durbinWatson)
if durbinWatson < 1.5:
    print('Signs of positive autocorrelation', '\n')
    print('Assumption not satisfied')
elif durbinWatson > 2.5:
    print('Signs of negative autocorrelation', '\n')
    # Report the verdict here too, matching the positive-autocorrelation branch
    print('Assumption not satisfied')
# Pairwise correlation of the target and the two predictors
# (DataFrame.corr defaults to the Pearson method).
corr = df_pie[['pie_sales', 'price', 'advertising']].corr()
print('Pearson correlation coefficient matrix of each variables:\n', corr)
# Generate a mask for the diagonal cell.
# Builtin bool is used because the np.bool alias was removed in NumPy 1.24
# and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
np.fill_diagonal(mask, val=True)
# Initialize matplotlib figure
fig, ax = plt.subplots(figsize=(4, 3))
from statsmodels.stats.diagnostic import normal_ad

# Performing the Anderson-Darling normality test on the residuals;
# element [1] of the returned pair is the p-value.
p_value = normal_ad(df_pie['residual'])[1]
print('p-value from the test Anderson-Darling test below 0.05 generally means non-normal:', p_value)
# Plotting the residuals distribution
plt.subplots(figsize=(8, 4))
plt.title('Distribution of Residuals', fontsize=18)
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is the documented replacement.
sns.histplot(df_pie['residual'], kde=True, stat='density')
# Plotting the observed vs predicted values as a plain scatter
# (fit_reg=False suppresses the regression line).
# `height` replaces the former `size` keyword, which current seaborn
# releases no longer accept.
sns.lmplot(x='pie_sales', y='pie_sales_pred', data=df_pie, fit_reg=False, height=5)
# Plotting the diagonal line: points on it are perfect predictions. The range
# is padded by 10 beyond the smallest and largest value of either column.
line_coords = np.arange(
    df_pie[['pie_sales', 'pie_sales_pred']].min().min() - 10,
    df_pie[['pie_sales', 'pie_sales_pred']].max().max() + 10,
)
plt.plot(line_coords, line_coords,  # X and y points
         color='darkorange', linestyle='--')
plt.ylabel('Predicted Pie Sales', fontsize=14)
# Attach the model's predictions for X and its residuals to df_pie as new
# columns, then preview the first rows.
# NOTE(review): olsmod is presumably a fitted statsmodels OLS result - confirm.
fitted_values = olsmod.predict(X)
df_pie['pie_sales_pred'] = fitted_values
df_pie['residual'] = olsmod.resid
df_pie.head()
# Show the p-values exposed by the fitted model.
coefficient_pvalues = olsmod.pvalues
print(coefficient_pvalues)
# Report the model's F-statistic together with its p-value.
f_statistic = olsmod.fvalue
f_statistic_pvalue = olsmod.f_pvalue
print('F-statistic:', f_statistic)
print('Probability of observing value at least as high as F-statistic:', f_statistic_pvalue)