# Source: thistleknot/ECOD_KDE_95_pct_quotes.py

## ECOD_KDE_95_pct_quotes.py
#!/usr/bin/env python
# coding: utf-8

# In[3]:

#!pip install --upgrade numpy
# Install NumPy 1.24 before the imports that follow.
try:
    _ipython_shell = get_ipython()
except NameError:
    # Plain `python` run: the IPython-injected `get_ipython` name does not exist.
    _ipython_shell = None

if _ipython_shell is not None:
    _ipython_shell.system('pip install numpy==1.24')
else:
    import subprocess
    import sys

    # Invoke pip through the running interpreter so the package lands in the
    # active environment; check=False means a failed install does not raise here.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'numpy==1.24'],
        check=False,
    )

# In[4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from scipy import stats

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from pyod.models.knn import KNN   # Example: You can use K-Nearest Neighbors as an ECOD model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from pyod.models.knn import KNN
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# In[ ]:

# In[5]:

# Load the "Abirate/english_quotes" dataset through `datasets.load_dataset`
dataset = load_dataset("Abirate/english_quotes")
# `len()` of every entry in the 'quote' column of the 'train' split
quotes_lengths = list(map(len, dataset['train']['quote']))

# In[ ]:

# In[ ]:

# In[6]:

# 95th percentile of the raw quote lengths
percentile_95 = np.percentile(quotes_lengths, 95)

# One figure, two side-by-side panels that both show the same
# density-normalised histogram; only the titles differ
plt.figure(figsize=(12, 6))
panel_titles = ('Histogram of Quote Lengths', 'Histogram and KDE of Quote Lengths')
for panel_number, panel_title in enumerate(panel_titles, start=1):
    plt.subplot(1, 2, panel_number)
    plt.hist(quotes_lengths, bins=30, density=True, alpha=0.6, color='b')
    plt.title(panel_title)
    plt.xlabel('Length of Quotes')
    plt.ylabel('Probability Density')

# Right-hand panel: add a twin y-axis and draw a Gaussian KDE evaluated on
# 100 evenly spaced points between the shortest and longest quote
plt.twinx()
length_grid = np.linspace(min(quotes_lengths), max(quotes_lengths), 100)
kde_curve = stats.gaussian_kde(quotes_lengths)(length_grid)
plt.plot(length_grid, kde_curve, 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')
plt.tight_layout()

# Report the percentile, then display the figure
print(f"95% Percentile Quote Length: {percentile_95}")

plt.show()

# In[7]:

# Step 1: flag outliers with pyod's K-Nearest-Neighbors detector.
# NOTE(review): the file name mentions ECOD, but the detector instantiated
# here is KNN - confirm which one is intended.
lengths_column = np.array(quotes_lengths).reshape(-1, 1)  # one feature per row
detector = KNN()
detector.fit(lengths_column)
outliers = detector.predict(lengths_column)

# Step 2: keep only the lengths whose predicted label is falsy
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]

# Step 3: 95th percentile of the retained lengths
percentile_95 = np.percentile(non_outlier_lengths, 95)

# Two side-by-side panels with the same density-normalised histogram
plt.figure(figsize=(12, 6))
panel_titles = (
    'Histogram of Non-Outlying Quote Lengths',
    'Histogram and KDE of Non-Outlying Quote Lengths',
)
for panel_number, panel_title in enumerate(panel_titles, start=1):
    plt.subplot(1, 2, panel_number)
    plt.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
    plt.title(panel_title)
    plt.xlabel('Length of Quotes')
    plt.ylabel('Probability Density')

# Right-hand panel: Gaussian KDE on a twin y-axis, evaluated on 100 evenly
# spaced points across the retained range
plt.twinx()
length_grid = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
kde_curve = stats.gaussian_kde(non_outlier_lengths)(length_grid)
plt.plot(length_grid, kde_curve, 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')
plt.tight_layout()

# Report the percentile, then display the figure
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")

plt.show()

# In[ ]:

# In[8]:

# Assuming non_outlier_lengths is a list or numpy array of non-outlying quote lengths

# Candidate transformations applied to the data before fitting a KDE
# (extend the list to try more)
transformations = [
    np.log,
    np.sqrt,
    np.cbrt,
]

# Candidate `bw_method` values: ten evenly spaced points from 0.1 to 1.0
bandwidths = np.linspace(start=0.1, stop=1.0, num=10)

# Define the ECDF function
def ecdf2(values):
    """Return the mid-rank empirical CDF of every column of *values*.

    Each entry ``x`` of a column is scored as
    ``(count(col <= x) + count(col < x)) / 2 / len(values)`` and rounded to
    two decimals, so tied values all receive the midpoint of their rank range.

    Rows are addressed by position, so the frame may carry any index. The
    counts come from a sort plus binary search (O(n log n) per column).

    Args:
        values: ``pandas.DataFrame``; every column is scored independently.

    Returns:
        ``pandas.DataFrame`` with one score column per input column (each
        labelled ``0``) and a fresh ``0..n-1`` row index. Missing entries
        score ``0.0``. A frame with no rows or no columns yields an empty
        frame.
    """
    co = len(values)
    if co == 0:
        return pd.DataFrame()

    scored_columns = []
    for d in range(values.shape[1]):
        column = values.iloc[:, d].to_numpy()

        # Missing values compare False against everything, so they contribute
        # to no count and receive a score of zero themselves.
        present = ~pd.isna(column)
        observed = column[present]
        ordered = np.sort(observed)

        at_most = np.searchsorted(ordered, observed, side='right')  # count(<= x)
        below = np.searchsorted(ordered, observed, side='left')     # count(<  x)

        scores = np.zeros(co)
        scores[present] = (at_most + below) / 2 / co
        scored_columns.append(pd.DataFrame(scores).round(2))

    if not scored_columns:
        return pd.DataFrame()
    return pd.concat(scored_columns, axis=1)

# Define a scoring function using mean squared error (MSE)
def mse_scoring_function(kde, data):
    """Score a KDE fit as the MSE between its CDF and the empirical CDF.

    The empirical side is the mid-rank ECDF returned by ``ecdf2``, i.e.
    cumulative probabilities. The model side must therefore be cumulative
    too: the raw density ``kde(data)`` is not a probability, so comparing it
    with an ECDF would mix a PDF with a CDF.

    Args:
        kde: Fitted estimator. When it exposes ``integrate_box_1d(low, high)``
            (as ``scipy.stats.gaussian_kde`` does), that method supplies the
            CDF at each point. Any other callable is invoked as ``kde(data)``
            and its output is used as-is.
        data: One-dimensional sample at which both curves are evaluated.

    Returns:
        The value of ``mean_squared_error(ecdf, model_values)``; lower means
        a closer fit.
    """
    true_ecdf = ecdf2(pd.DataFrame(data))  # empirical cumulative probabilities

    integrate = getattr(kde, "integrate_box_1d", None)
    if integrate is None:
        estimated = kde(data)
    else:
        # CDF(x) = integral of the estimated density over (-inf, x]
        estimated = np.array(
            [integrate(-np.inf, point) for point in np.ravel(data)]
        )

    return mean_squared_error(true_ecdf, estimated)

# Grid search: remember the (transformation, bandwidth) pair with the lowest score
best_score = float('inf')
best_params = {}

for transform in transformations:
    transformed_data = transform(non_outlier_lengths)

    for bandwidth in bandwidths:
        kde = gaussian_kde(transformed_data, bw_method=bandwidth)
        score = mse_scoring_function(kde, transformed_data)

        # Only a strict improvement replaces the incumbent, so the earliest
        # pair wins ties; written as `not <` so a NaN score is skipped too.
        if not score < best_score:
            continue
        best_score = score
        best_params = {'transformation': transform, 'bandwidth': bandwidth}

# Unpack the winning configuration
best_transform = best_params['transformation']
best_bandwidth = best_params['bandwidth']
best_transform_function = best_params['transformation']

# Refit with the winning configuration and take the 95th percentile of the
# transformed sample.
# NOTE(review): the refitted `kde` is not used when computing `percentile_95`
# - confirm whether the percentile was meant to be derived from the KDE.
transformed_data = best_transform(non_outlier_lengths)
kde = gaussian_kde(transformed_data, bw_method=best_bandwidth)
percentile_95 = np.percentile(transformed_data, 95)

# Each supported transformation mapped to the function that undoes it
inverse_functions = {
    np.log: np.exp,
    np.sqrt: np.square,
    np.cbrt: lambda x: np.power(x, 3),
}
inverse_function = inverse_functions.get(best_transform_function)
has_inverse = inverse_function is not None

if not has_inverse:
    print("Inverse function not available for the selected transformation.")
else:
    # Map the percentile back to the original (untransformed) scale
    percentile_95_original_scale = inverse_function(percentile_95)

print(f"Best Transformation: {best_transform_function.__name__}")
print(f"Best Bandwidth: {best_bandwidth}")
print(f"New 95th Percentile (Transformed Scale): {percentile_95}")
if has_inverse:
    print(f"New 95th Percentile (Original Scale): {percentile_95_original_scale}")

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Relies on `quotes_lengths` and the detector labels `outliers` computed earlier

# Step 2: keep only the lengths whose outlier label is falsy
non_outlier_lengths = [
    length for length, flagged in zip(quotes_lengths, outliers) if not flagged
]

# Step 3: 95th percentile of the retained lengths
percentile_95 = np.percentile(non_outlier_lengths, 95)

# Two side-by-side panels with the same density-normalised histogram
plt.figure(figsize=(12, 6))
panel_titles = (
    'Histogram of Non-Outlying Quote Lengths',
    'Histogram and KDE of Non-Outlying Quote Lengths',
)
for panel_number, panel_title in enumerate(panel_titles, start=1):
    plt.subplot(1, 2, panel_number)
    plt.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
    plt.title(panel_title)
    plt.xlabel('Length of Quotes')
    plt.ylabel('Probability Density')

# Right-hand panel: Gaussian KDE on a twin y-axis, evaluated on 100 evenly
# spaced points across the retained range
plt.twinx()
x_range = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
kde = stats.gaussian_kde(non_outlier_lengths)
density_curve = kde(x_range)
plt.plot(x_range, density_curve, 'r-')
plt.ylabel('Kernel Density Estimate (KDE)')
plt.tight_layout()

# Report the percentile, then display the figure
print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")

plt.show()
	#!/usr/bin/env python
	# coding: utf-8

	# In[3]:

	#!pip install --upgrade numpy
	# Install NumPy 1.24 before the imports that follow.
	try:
		_ipython_shell = get_ipython()
	except NameError:
		# Plain `python` run: the IPython-injected `get_ipython` name does not exist.
		_ipython_shell = None

	if _ipython_shell is not None:
		_ipython_shell.system('pip install numpy==1.24')
	else:
		import subprocess
		import sys

		# Invoke pip through the running interpreter so the package lands in the
		# active environment; check=False means a failed install does not raise here.
		subprocess.run(
			[sys.executable, '-m', 'pip', 'install', 'numpy==1.24'],
			check=False,
		)

	# In[4]:

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from datasets import load_dataset
	from scipy import stats

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from datasets import load_dataset
	from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from datasets import load_dataset
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from datasets import load_dataset
	from pyod.models.knn import KNN
	from scipy import stats
	from sklearn.model_selection import GridSearchCV
	from sklearn.neighbors import KernelDensity
	import numpy as np
	from scipy.stats import gaussian_kde
	from sklearn.metrics import mean_squared_error
	from sklearn.model_selection import GridSearchCV
	from sklearn.preprocessing import FunctionTransformer
	import pandas as pd

	# In[ ]:

	# In[5]:

	# Load the "Abirate/english_quotes" dataset through `datasets.load_dataset`
	dataset = load_dataset("Abirate/english_quotes")
	# `len()` of every entry in the 'quote' column of the 'train' split
	quotes_lengths = list(map(len, dataset['train']['quote']))

	# In[ ]:

	# In[ ]:

	# In[6]:

	# 95th percentile of the raw quote lengths
	percentile_95 = np.percentile(quotes_lengths, 95)

	# One figure, two side-by-side panels that both show the same
	# density-normalised histogram; only the titles differ
	plt.figure(figsize=(12, 6))
	panel_titles = ('Histogram of Quote Lengths', 'Histogram and KDE of Quote Lengths')
	for panel_number, panel_title in enumerate(panel_titles, start=1):
		plt.subplot(1, 2, panel_number)
		plt.hist(quotes_lengths, bins=30, density=True, alpha=0.6, color='b')
		plt.title(panel_title)
		plt.xlabel('Length of Quotes')
		plt.ylabel('Probability Density')

	# Right-hand panel: add a twin y-axis and draw a Gaussian KDE evaluated on
	# 100 evenly spaced points between the shortest and longest quote
	plt.twinx()
	length_grid = np.linspace(min(quotes_lengths), max(quotes_lengths), 100)
	kde_curve = stats.gaussian_kde(quotes_lengths)(length_grid)
	plt.plot(length_grid, kde_curve, 'r-')
	plt.ylabel('Kernel Density Estimate (KDE)')
	plt.tight_layout()

	# Report the percentile, then display the figure
	print(f"95% Percentile Quote Length: {percentile_95}")

	plt.show()

	# In[7]:

	# Step 1: flag outliers with pyod's K-Nearest-Neighbors detector.
	# NOTE(review): the file name mentions ECOD, but the detector instantiated
	# here is KNN - confirm which one is intended.
	lengths_column = np.array(quotes_lengths).reshape(-1, 1)  # one feature per row
	detector = KNN()
	detector.fit(lengths_column)
	outliers = detector.predict(lengths_column)

	# Step 2: keep only the lengths whose predicted label is falsy
	non_outlier_lengths = [
		length for length, flagged in zip(quotes_lengths, outliers) if not flagged
	]

	# Step 3: 95th percentile of the retained lengths
	percentile_95 = np.percentile(non_outlier_lengths, 95)

	# Two side-by-side panels with the same density-normalised histogram
	plt.figure(figsize=(12, 6))
	panel_titles = (
		'Histogram of Non-Outlying Quote Lengths',
		'Histogram and KDE of Non-Outlying Quote Lengths',
	)
	for panel_number, panel_title in enumerate(panel_titles, start=1):
		plt.subplot(1, 2, panel_number)
		plt.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
		plt.title(panel_title)
		plt.xlabel('Length of Quotes')
		plt.ylabel('Probability Density')

	# Right-hand panel: Gaussian KDE on a twin y-axis, evaluated on 100 evenly
	# spaced points across the retained range
	plt.twinx()
	length_grid = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
	kde_curve = stats.gaussian_kde(non_outlier_lengths)(length_grid)
	plt.plot(length_grid, kde_curve, 'r-')
	plt.ylabel('Kernel Density Estimate (KDE)')
	plt.tight_layout()

	# Report the percentile, then display the figure
	print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")

	plt.show()

	# In[ ]:

	# In[8]:

	# Assuming non_outlier_lengths is a list or numpy array of non-outlying quote lengths

	# Candidate transformations applied to the data before fitting a KDE
	# (extend the list to try more)
	transformations = [
		np.log,
		np.sqrt,
		np.cbrt,
	]

	# Candidate `bw_method` values: ten evenly spaced points from 0.1 to 1.0
	bandwidths = np.linspace(start=0.1, stop=1.0, num=10)

	# Define the ECDF function
	def ecdf2(values):
		"""Return the mid-rank empirical CDF of every column of *values*.

		Each entry ``x`` of a column is scored as
		``(count(col <= x) + count(col < x)) / 2 / len(values)`` and rounded to
		two decimals, so tied values all receive the midpoint of their rank range.

		Rows are addressed by position, so the frame may carry any index. The
		counts come from a sort plus binary search (O(n log n) per column).

		Args:
			values: ``pandas.DataFrame``; every column is scored independently.

		Returns:
			``pandas.DataFrame`` with one score column per input column (each
			labelled ``0``) and a fresh ``0..n-1`` row index. Missing entries
			score ``0.0``. A frame with no rows or no columns yields an empty
			frame.
		"""
		co = len(values)
		if co == 0:
			return pd.DataFrame()

		scored_columns = []
		for d in range(values.shape[1]):
			column = values.iloc[:, d].to_numpy()

			# Missing values compare False against everything, so they contribute
			# to no count and receive a score of zero themselves.
			present = ~pd.isna(column)
			observed = column[present]
			ordered = np.sort(observed)

			at_most = np.searchsorted(ordered, observed, side='right')  # count(<= x)
			below = np.searchsorted(ordered, observed, side='left')     # count(<  x)

			scores = np.zeros(co)
			scores[present] = (at_most + below) / 2 / co
			scored_columns.append(pd.DataFrame(scores).round(2))

		if not scored_columns:
			return pd.DataFrame()
		return pd.concat(scored_columns, axis=1)

	# Define a scoring function using mean squared error (MSE)
	def mse_scoring_function(kde, data):
		"""Score a KDE fit as the MSE between its CDF and the empirical CDF.

		The empirical side is the mid-rank ECDF returned by ``ecdf2``, i.e.
		cumulative probabilities. The model side must therefore be cumulative
		too: the raw density ``kde(data)`` is not a probability, so comparing it
		with an ECDF would mix a PDF with a CDF.

		Args:
			kde: Fitted estimator. When it exposes ``integrate_box_1d(low, high)``
				(as ``scipy.stats.gaussian_kde`` does), that method supplies the
				CDF at each point. Any other callable is invoked as ``kde(data)``
				and its output is used as-is.
			data: One-dimensional sample at which both curves are evaluated.

		Returns:
			The value of ``mean_squared_error(ecdf, model_values)``; lower means
			a closer fit.
		"""
		true_ecdf = ecdf2(pd.DataFrame(data))  # empirical cumulative probabilities

		integrate = getattr(kde, "integrate_box_1d", None)
		if integrate is None:
			estimated = kde(data)
		else:
			# CDF(x) = integral of the estimated density over (-inf, x]
			estimated = np.array(
				[integrate(-np.inf, point) for point in np.ravel(data)]
			)

		return mean_squared_error(true_ecdf, estimated)

	# Grid search: remember the (transformation, bandwidth) pair with the lowest score
	best_score = float('inf')
	best_params = {}

	for transform in transformations:
		transformed_data = transform(non_outlier_lengths)

		for bandwidth in bandwidths:
			kde = gaussian_kde(transformed_data, bw_method=bandwidth)

			# Evaluate the KDE fit using the MSE scoring function
			score = mse_scoring_function(kde, transformed_data)

			# Only a strict improvement replaces the incumbent, so the earliest
			# pair wins ties
			if score < best_score:
				best_score = score
				best_params = {'transformation': transform, 'bandwidth': bandwidth}

	# Unpack the winning configuration
	best_transform = best_params['transformation']
	best_bandwidth = best_params['bandwidth']

	# Refit with the winning configuration and take the 95th percentile of the
	# transformed sample.
	# NOTE(review): the refitted `kde` is not used when computing `percentile_95`
	# - confirm whether the percentile was meant to be derived from the KDE.
	transformed_data = best_transform(non_outlier_lengths)
	kde = gaussian_kde(transformed_data, bw_method=best_bandwidth)
	percentile_95 = np.percentile(transformed_data, 95)

	# Each supported transformation mapped to the function that undoes it
	inverse_functions = {
		np.log: np.exp,
		np.sqrt: np.square,
		np.cbrt: lambda x: np.power(x, 3),
	}

	# Look up the inverse of the winning transformation
	best_transform_function = best_params['transformation']
	inverse_function = inverse_functions.get(best_transform_function)

	if inverse_function is not None:
		# Map the percentile back to the original (untransformed) scale
		percentile_95_original_scale = inverse_function(percentile_95)
	else:
		print("Inverse function not available for the selected transformation.")

	print(f"Best Transformation: {best_transform_function.__name__}")
	print(f"Best Bandwidth: {best_bandwidth}")
	print(f"New 95th Percentile (Transformed Scale): {percentile_95}")
	if inverse_function is not None:
		print(f"New 95th Percentile (Original Scale): {percentile_95_original_scale}")

	import numpy as np
	import matplotlib.pyplot as plt
	from scipy import stats

	# Relies on `quotes_lengths` and the detector labels `outliers` computed earlier

	# Step 2: keep only the lengths whose outlier label is falsy
	non_outlier_lengths = [
		length for length, flagged in zip(quotes_lengths, outliers) if not flagged
	]

	# Step 3: 95th percentile of the retained lengths
	percentile_95 = np.percentile(non_outlier_lengths, 95)

	# Two side-by-side panels with the same density-normalised histogram
	plt.figure(figsize=(12, 6))
	panel_titles = (
		'Histogram of Non-Outlying Quote Lengths',
		'Histogram and KDE of Non-Outlying Quote Lengths',
	)
	for panel_number, panel_title in enumerate(panel_titles, start=1):
		plt.subplot(1, 2, panel_number)
		plt.hist(non_outlier_lengths, bins=30, density=True, alpha=0.6, color='b')
		plt.title(panel_title)
		plt.xlabel('Length of Quotes')
		plt.ylabel('Probability Density')

	# Right-hand panel: Gaussian KDE on a twin y-axis, evaluated on 100 evenly
	# spaced points across the retained range
	plt.twinx()
	x_range = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 100)
	kde = stats.gaussian_kde(non_outlier_lengths)
	density_curve = kde(x_range)
	plt.plot(x_range, density_curve, 'r-')
	plt.ylabel('Kernel Density Estimate (KDE)')
	plt.tight_layout()

	# Report the percentile, then display the figure
	print(f"95% Percentile Quote Length (Excluding Outliers): {percentile_95}")

	plt.show()