Skip to content

Instantly share code, notes, and snippets.

@thistleknot
Created January 6, 2024 01:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thistleknot/5a07de2c8e05fdd33780fb0b7db045b6 to your computer and use it in GitHub Desktop.
Save thistleknot/5a07de2c8e05fdd33780fb0b7db045b6 to your computer and use it in GitHub Desktop.
ECOD pruned 95% of records using KDE
#!/usr/bin/env python
# coding: utf-8
# In[3]:
# Notebook-exported cell: pins numpy to version 1.24 by running pip through
# the object returned by get_ipython(), before the imports that follow.
# NOTE(review): get_ipython is neither defined nor imported in this file;
# presumably it is injected by the IPython/Jupyter runtime -- confirm before
# running this file as a plain Python script.
#!pip install --upgrade numpy
get_ipython().system('pip install numpy==1.24')
# In[4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from pyod.models.knn import KNN
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
# In[ ]:
# In[5]:
# Fetch the "Abirate/english_quotes" dataset through the `datasets` library
dataset = load_dataset("Abirate/english_quotes")
# Take len() of every entry in the train split's 'quote' column
quotes_lengths = list(map(len, dataset['train']['quote']))
# In[ ]:
# In[ ]:
# In[6]:
# 95th percentile of the raw quote lengths
percentile_95 = np.percentile(quotes_lengths, 95)

# Styling shared by both histograms
hist_style = dict(bins=30, density=True, alpha=0.6, color='b')
# 100 evenly spaced lengths spanning the observed range, and the Gaussian KDE
# evaluated on that grid (computed once and reused for the overlay below)
length_grid = np.linspace(min(quotes_lengths), max(quotes_lengths), 100)
density_curve = stats.gaussian_kde(quotes_lengths)(length_grid)

plt.figure(figsize=(12, 6))

# Left panel: histogram only
plt.subplot(1, 2, 1)
plt.hist(quotes_lengths, **hist_style)
plt.title('Histogram of Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')

# Right panel: the same histogram with the KDE drawn on a secondary y-axis
plt.subplot(1, 2, 2)
plt.hist(quotes_lengths, **hist_style)
plt.title('Histogram and KDE of Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')
plt.twinx()
plt.plot(length_grid, density_curve, 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()
# Report the percentile, then display the figure
print(f"95% Percentile Quote Length: {percentile_95}")
plt.show()
# In[7]:
# Step 1: flag outliers with pyod's KNN detector.
# NOTE(review): the gist title refers to ECOD, but the model fitted here is
# the KNN detector imported from pyod.models.knn.
lengths_column = np.array(quotes_lengths).reshape(-1, 1)  # one feature per row
detector = KNN()
detector.fit(lengths_column)
outliers = detector.predict(lengths_column)
# Step 2: keep only the lengths whose prediction is falsy (not flagged)
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]
# Step 3: 95th percentile of the retained lengths
percentile_95 = np.percentile(non_outlier_lengths, 95)
# Plot the non-outlying lengths: histogram on the left, histogram plus KDE
# overlay on the right.
hist_style = dict(bins=30, density=True, alpha=0.6, color='b')
# Evaluation grid for the KDE curve: 100 points across the retained range
length_grid = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
density_curve = stats.gaussian_kde(non_outlier_lengths)(length_grid)

plt.figure(figsize=(12, 6))

# Left panel: histogram only
plt.subplot(1, 2, 1)
plt.hist(non_outlier_lengths, **hist_style)
plt.title('Histogram of Non-Outlying Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')

# Right panel: histogram with the KDE drawn on a secondary y-axis
plt.subplot(1, 2, 2)
plt.hist(non_outlier_lengths, **hist_style)
plt.title('Histogram and KDE of Non-Outlying Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')
plt.twinx()
plt.plot(length_grid, density_curve, 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()
# Report the percentile computed earlier, then display the figure
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")
plt.show()
# In[ ]:
# In[8]:
# Search space for the KDE fit on the non-outlying quote lengths.
# Candidate element-wise transforms (extend the list to try more):
transformations = [np.log, np.sqrt, np.cbrt]
# Ten evenly spaced bandwidth candidates from 0.1 to 1.0 inclusive
# (adjust the range or the number of points as needed):
bandwidths = np.linspace(start=0.1, stop=1.0, num=10)
# Define the ECDF function
def ecdf2(values):
    """Return the mid-rank empirical CDF of every column of ``values``.

    For each entry ``v`` of a column the result is
    ``(count(col <= v) + count(col < v)) / 2 / n``, rounded to two decimals,
    where ``n`` is the number of rows. Ties therefore share the average of
    their positions.

    Args:
        values: ``pandas.DataFrame`` whose columns are treated independently.
            Any index is accepted; lookups are positional.

    Returns:
        ``pandas.DataFrame`` with the same shape and column labels as
        ``values`` and a fresh 0..n-1 index, holding the ECDF values.
        Missing values yield NaN.
    """
    co = len(values)
    # The average rank of v equals (count(< v) + count(<= v) + 1) / 2, so
    # subtracting 0.5 gives exactly the mid-rank count. This replaces the
    # quadratic element-by-element comparison and does not rely on index
    # labels being 0..n-1.
    mid_rank_counts = values.rank(method="average") - 0.5
    return (mid_rank_counts / co).round(2).reset_index(drop=True)
# Define a scoring function using mean squared error (MSE)
def mse_scoring_function(kde, data):
    """Score a fitted KDE by the MSE between its CDF and the empirical CDF.

    The empirical CDF holds cumulative probabilities, so it is compared with
    the cumulative distribution implied by the KDE rather than with the KDE's
    density values, which are on a different scale.

    Args:
        kde: Fitted one-dimensional ``scipy.stats.gaussian_kde``; its
            ``integrate_box_1d`` method is used to obtain the CDF.
        data: One-dimensional sequence of sample values at which both
            distributions are evaluated.

    Returns:
        The mean squared error between the mid-rank ECDF of ``data`` (as
        computed by ``ecdf2``) and the KDE's CDF at the same points. Lower
        is better.
    """
    points = np.asarray(data, dtype=float).ravel()
    # CDF of the KDE at x is the probability mass on (-inf, x]
    estimated_cdf = np.array(
        [kde.integrate_box_1d(-np.inf, point) for point in points]
    )
    true_ecdf = ecdf2(pd.DataFrame(points))  # Calculate true ECDF
    return mean_squared_error(true_ecdf, estimated_cdf)
# Grid search: fit a Gaussian KDE for every (transformation, bandwidth) pair
# and remember the pair with the lowest score.
best_score = float('inf')
best_params = {}
for transform in transformations:
    transformed_data = transform(non_outlier_lengths)
    for bandwidth in bandwidths:
        kde = gaussian_kde(transformed_data, bw_method=bandwidth)
        score = mse_scoring_function(kde, transformed_data)
        # Written as "not (a < b)" so a NaN score is skipped, exactly like
        # a plain "if score < best_score" update would skip it.
        if not (score < best_score):
            continue
        best_score = score
        best_params = {'transformation': transform, 'bandwidth': bandwidth}

# Re-apply the winning configuration to the full non-outlier sample
best_transform = best_params['transformation']
best_bandwidth = best_params['bandwidth']
transformed_data = best_transform(non_outlier_lengths)
kde = gaussian_kde(transformed_data, bw_method=best_bandwidth)
# NOTE(review): the percentile is taken from the transformed sample itself;
# the refitted `kde` is not consulted here.
percentile_95 = np.percentile(transformed_data, 95)

# Map each candidate transformation to its inverse so the percentile can be
# expressed in the original scale.
inverse_functions = {
    np.log: np.exp,
    np.sqrt: np.square,
    np.cbrt: lambda x: np.power(x, 3),
}
best_transform_function = best_params['transformation']
inverse_function = inverse_functions.get(best_transform_function)
has_inverse = inverse_function is not None
if has_inverse:
    # Undo the transformation to get the percentile in the original scale
    percentile_95_original_scale = inverse_function(percentile_95)
else:
    print("Inverse function not available for the selected transformation.")

# Summary report
report_lines = [
    f"Best Transformation: {best_transform_function.__name__}",
    f"Best Bandwidth: {best_bandwidth}",
    f"New 95th Percentile (Transformed Scale): {percentile_95}",
]
if has_inverse:
    report_lines.append(
        f"New 95th Percentile (Original Scale): {percentile_95_original_scale}"
    )
for report_line in report_lines:
    print(report_line)
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Rebuild the non-outlier sample from the detector's predictions (this also
# replaces the percentile with the untransformed one) and plot it again.
# Step 2: keep only the lengths whose prediction is falsy (not flagged)
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]
# Step 3: 95th percentile of the retained lengths
percentile_95 = np.percentile(non_outlier_lengths, 95)

# Styling shared by both histograms
hist_style = dict(bins=30, density=True, alpha=0.6, color='b')
# KDE of the retained lengths and the 100-point grid it is drawn on
x_range = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
kde = stats.gaussian_kde(non_outlier_lengths)

plt.figure(figsize=(12, 6))

# Left panel: histogram only
plt.subplot(1, 2, 1)
plt.hist(non_outlier_lengths, **hist_style)
plt.title('Histogram of Non-Outlying Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')

# Right panel: histogram with the KDE drawn on a secondary y-axis
plt.subplot(1, 2, 2)
plt.hist(non_outlier_lengths, **hist_style)
plt.title('Histogram and KDE of Non-Outlying Quote Lengths')
plt.xlabel('Length of Quotes')
plt.ylabel('Probability Density')
plt.twinx()
plt.plot(x_range, kde(x_range), 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')

plt.tight_layout()
# Report the percentile, then display the figure
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")
plt.show()
@thistleknot
Copy link
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment