Created
January 6, 2024 01:19
-
-
Save thistleknot/5a07de2c8e05fdd33780fb0b7db045b6 to your computer and use it in GitHub Desktop.
ECOD pruned 95% of records using KDE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8

# In[3]:

# Install NumPy 1.24 before the imports below run.
# `get_ipython()` only exists inside IPython/Jupyter; when this notebook
# export is executed as a plain script the name is undefined, so fall back to
# invoking pip through the current interpreter instead of crashing.
#!pip install --upgrade numpy
try:
    get_ipython().system('pip install numpy==1.24')
except NameError:
    import subprocess
    import sys

    # check=False keeps the install best-effort, like the notebook shell call.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "numpy==1.24"], check=False
    )
# In[4]: | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from datasets import load_dataset | |
from scipy import stats | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from datasets import load_dataset | |
from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from datasets import load_dataset | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from datasets import load_dataset | |
from pyod.models.knn import KNN | |
from scipy import stats | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.neighbors import KernelDensity | |
import numpy as np | |
from scipy.stats import gaussian_kde | |
from sklearn.metrics import mean_squared_error | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.preprocessing import FunctionTransformer | |
import pandas as pd | |
# In[ ]: | |
# In[5]: | |
# Fetch the "Abirate/english_quotes" dataset via the `datasets` library.
dataset = load_dataset("Abirate/english_quotes")

# `len` of every entry in the train split's 'quote' column.
train_quotes = dataset['train']['quote']
quotes_lengths = list(map(len, train_quotes))
# In[ ]: | |
# In[ ]: | |
# In[6]: | |
# 95th percentile of the raw quote lengths.
percentile_95 = np.percentile(quotes_lengths, 95)

# Gaussian KDE of the lengths, evaluated on 100 evenly spaced points that
# span the observed minimum and maximum.
length_grid = np.linspace(min(quotes_lengths), max(quotes_lengths), 100)
kde_curve = stats.gaussian_kde(quotes_lengths)(length_grid)

plt.figure(figsize=(12, 6))

# Left panel: histogram only.
left_ax = plt.subplot(1, 2, 1)
left_ax.hist(quotes_lengths, bins=30, density=True, alpha=0.6, color='b')
left_ax.set_title('Histogram of Quote Lengths')
left_ax.set_xlabel('Length of Quotes')
left_ax.set_ylabel('Probability Density')

# Right panel: the same histogram with the KDE drawn on a twin y-axis.
right_ax = plt.subplot(1, 2, 2)
right_ax.hist(quotes_lengths, bins=30, density=True, alpha=0.6, color='b')
right_ax.set_title('Histogram and KDE of Quote Lengths')
right_ax.set_xlabel('Length of Quotes')
right_ax.set_ylabel('Probability Density')

kde_ax = plt.twinx()
kde_ax.plot(length_grid, kde_curve, 'r-')
kde_ax.set_ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()

# Report the percentile, then display the figure.
print(f"95% Percentile Quote Length: {percentile_95}")
plt.show()
# In[7]: | |
# Step 1: Outlier detection with PyOD's KNN detector (the gist title says
# ECOD, but the model instantiated here is KNN). The detector is fitted on
# the lengths as a single-feature column and then asked to label them.
lengths_column = np.array(quotes_lengths).reshape(-1, 1)
detector = KNN()
detector.fit(lengths_column)
outliers = detector.predict(lengths_column)

# Step 2: Keep only the lengths whose predicted label is falsy (not flagged).
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]

# Step 3: 95th percentile of the remaining lengths.
percentile_95 = np.percentile(non_outlier_lengths, 95)

# Gaussian KDE of the kept lengths on 100 evenly spaced points.
kept_grid = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
kept_kde_curve = stats.gaussian_kde(non_outlier_lengths)(kept_grid)

plt.figure(figsize=(12, 6))

# Left panel: histogram only.
left_ax = plt.subplot(1, 2, 1)
left_ax.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
left_ax.set_title('Histogram of Non-Outlying Quote Lengths')
left_ax.set_xlabel('Length of Quotes')
left_ax.set_ylabel('Probability Density')

# Right panel: histogram plus the KDE on a twin y-axis.
right_ax = plt.subplot(1, 2, 2)
right_ax.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
right_ax.set_title('Histogram and KDE of Non-Outlying Quote Lengths')
right_ax.set_xlabel('Length of Quotes')
right_ax.set_ylabel('Probability Density')

kde_ax = plt.twinx()
kde_ax.plot(kept_grid, kept_kde_curve, 'r-')
kde_ax.set_ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()

# Report the percentile, then display the figure.
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")
plt.show()
# In[ ]: | |
# In[8]: | |
# Expects `non_outlier_lengths` (a list or numpy array of non-outlying quote
# lengths) to exist already.

# Candidate transformations to try; extend this list to test more.
transformations = [
    np.log,
    np.sqrt,
    np.cbrt,
]

# Candidate bandwidths: 10 evenly spaced values from 0.1 to 1.0 inclusive.
# Adjust the range and the number of points as needed.
bandwidths = np.linspace(start=0.1, stop=1.0, num=10)
# Define the ECDF function | |
def ecdf2(values):
    """Return the mid-rank empirical CDF of every column of ``values``.

    For each entry ``v`` of a column the score is
    ``(count(col <= v) + count(col < v)) / 2 / len(values)``, so tied entries
    share the same score. Scores are rounded to two decimals.

    The computation is positional: it does not depend on the row labels of
    ``values``, so frames with a filtered or shuffled index are handled.

    Parameters
    ----------
    values : pandas.DataFrame
        Columns of mutually comparable values.

    Returns
    -------
    pandas.DataFrame
        One score column per input column, in input order, with a fresh
        ``0..n-1`` row index. Every output column is labelled ``0``. An empty
        frame is returned when ``values`` has no rows or no columns.
    """
    n_rows = len(values)
    n_cols = len(values.columns)
    if n_rows == 0 or n_cols == 0:
        return pd.DataFrame()

    scored_columns = []
    for position in range(n_cols):
        column = values.iloc[:, position]
        # The average rank of v is (count(< v) + count(<= v) + 1) / 2, so
        # subtracting 0.5 yields (count(<= v) + count(< v)) / 2 without the
        # quadratic element-by-element counting.
        mid_rank = column.rank(method="average") - 0.5
        # rank() leaves missing entries as NaN; comparisons against NaN are
        # always False, so their counts (and hence their score) are 0.
        scores = (mid_rank / n_rows).fillna(0.0).to_numpy()
        scored_columns.append(pd.DataFrame(scores).round(2))
    return pd.concat(scored_columns, axis=1)
# Define a scoring function using mean squared error (MSE) | |
def mse_scoring_function(kde, data):
    """Score a fitted KDE by the MSE between its CDF and the empirical CDF.

    The empirical CDF is a cumulative quantity, so it is compared with the
    KDE's cumulative distribution at each data point rather than with the
    density values returned by calling ``kde(data)``.

    Parameters
    ----------
    kde : scipy.stats.gaussian_kde
        One-dimensional KDE; must provide ``integrate_box_1d``.
    data : array-like
        One-dimensional sample at which both CDFs are evaluated.

    Returns
    -------
    float
        Mean squared error between the empirical CDF (from ``ecdf2``) and the
        KDE's CDF; lower means a closer fit.
    """
    points = np.ravel(np.asarray(data, dtype=float))
    # CDF of the KDE at x is the integral of its density over (-inf, x].
    estimated_cdf = np.array(
        [kde.integrate_box_1d(-np.inf, point) for point in points]
    )
    true_ecdf = ecdf2(pd.DataFrame(points))  # Empirical (mid-rank) CDF
    mse = mean_squared_error(true_ecdf, estimated_cdf)
    return mse
best_score = float('inf')
best_params = {}

# Try every (transformation, bandwidth) pair: fit a KDE on the transformed
# lengths and keep the pair with the lowest score.
for transform in transformations:
    transformed_data = transform(non_outlier_lengths)
    for bandwidth in bandwidths:
        kde = gaussian_kde(transformed_data, bw_method=bandwidth)
        # Evaluate the KDE fit using the MSE scoring function
        score = mse_scoring_function(kde, transformed_data)
        if score < best_score:
            best_score = score
            best_params = {'transformation': transform, 'bandwidth': bandwidth}

# A NaN score never compares below the current best, so the search can end
# without selecting anything; fail with a clear message instead of a KeyError
# on the lookups below.
if not best_params:
    raise ValueError(
        "No (transformation, bandwidth) candidate produced a valid score."
    )

# Refit with the winning parameters and take the 95th percentile.
best_transform = best_params['transformation']
best_bandwidth = best_params['bandwidth']
transformed_data = best_transform(non_outlier_lengths)
kde = gaussian_kde(transformed_data, bw_method=best_bandwidth)
# NOTE(review): the refitted `kde` is not used for the percentile; the value
# below is the plain empirical percentile of the transformed data.
percentile_95 = np.percentile(transformed_data, 95)

# Map each transformation to its inverse so the percentile can be reported
# in the original scale.
inverse_functions = {
    np.log: np.exp,
    np.sqrt: np.square,
    np.cbrt: lambda x: np.power(x, 3),
}

# Get the best transformation function and its corresponding inverse
best_transform_function = best_transform
inverse_function = inverse_functions.get(best_transform_function)

# Always define the original-scale value so later code can test it for None
# instead of hitting an undefined name.
percentile_95_original_scale = None
if inverse_function is None:
    print("Inverse function not available for the selected transformation.")
else:
    percentile_95_original_scale = inverse_function(percentile_95)

print(f"Best Transformation: {best_transform_function.__name__}")
print(f"Best Bandwidth: {best_bandwidth}")
print(f"New 95th Percentile (Transformed Scale): {percentile_95}")
if inverse_function is not None:
    print(f"New 95th Percentile (Original Scale): {percentile_95_original_scale}")
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Relies on `quotes_lengths` and the detector's `outliers` labels computed
# in earlier cells.

# Step 2: Keep only the lengths whose predicted label is falsy (not flagged).
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]

# Step 3: 95th percentile of the remaining lengths.
percentile_95 = np.percentile(non_outlier_lengths, 95)

# Gaussian KDE of the kept lengths and the 100-point grid it is drawn on.
x_range = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
kde = stats.gaussian_kde(non_outlier_lengths)

plt.figure(figsize=(12, 6))

# Left panel: histogram only.
left_ax = plt.subplot(1, 2, 1)
left_ax.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
left_ax.set_title('Histogram of Non-Outlying Quote Lengths')
left_ax.set_xlabel('Length of Quotes')
left_ax.set_ylabel('Probability Density')

# Right panel: histogram plus the KDE on a twin y-axis.
right_ax = plt.subplot(1, 2, 2)
right_ax.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
right_ax.set_title('Histogram and KDE of Non-Outlying Quote Lengths')
right_ax.set_xlabel('Length of Quotes')
right_ax.set_ylabel('Probability Density')

kde_ax = plt.twinx()
kde_ax.plot(x_range, kde(x_range), 'r-')
kde_ax.set_ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()

# Report the percentile, then display the figure.
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")
plt.show()
Author
thistleknot
commented
Jan 6, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment