# Optimal Cut
#!pip install --upgrade numpy
!pip install numpy==1.24
from datasets import load_dataset
from pyod.models.knn import KNN  # KNN-based outlier detector from PyOD
from scipy import stats
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
import json
import random
from scipy.integrate import simps
from sklearn.neighbors import LocalOutlierFactor
from textblob import TextBlob
import nltk
from nltk.corpus import brown
from sklearn.model_selection import train_test_split
#model.fit(test, metric='euclidean')
# Function to tokenize and filter
def tokenize_and_filter(dataset, min_size, max_size):
    tokenized = tokenizer.batch_encode_plus(dataset)['input_ids']
    filtered = [tokens for tokens in tokenized if min_size <= len(tokens) <= max_size]
    return filtered
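
# Illustrative usage (note: `tokenizer` is only instantiated further below, so this
# assumes it has already been loaded): keep records whose token count falls in a range, e.g.
#   short_records = tokenize_and_filter(["a first quote", "a second quote"], 8, 64)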
# Define the ECDF function
def ecdf2(values):
    co = len(values)
    externalArray = pd.DataFrame()
    for d in range(0, len(values.columns)):
        internalArray = []
        for i in range(0, len(values.iloc[:, d])):
            # Mid-rank ECDF: average the "<=" and "<" counts, scaled into (0, 1)
            a = (
                sum(values.iloc[:, d] <= values.iloc[i, d]) +
                sum(values.iloc[:, d] < values.iloc[i, d])
            ) / 2 / co
            internalArray.append(a)
        externalArray = pd.concat(
            [externalArray, pd.DataFrame(internalArray).round(2)], axis=1
        )
    return externalArray
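
# Quick sanity check of the mid-rank ECDF on a toy frame (illustrative only):
#   ecdf2(pd.DataFrame({'x': [1, 2, 2, 3]}))
# yields 0.12, 0.50, 0.50, 0.88 for the four rows; ties share the average rank,
# and values stay strictly inside (0, 1).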
# Define a scoring function using mean squared error (MSE)
def mse_scoring_function(kde, data):
    estimated_density = kde(data)
    true_ecdf = ecdf2(pd.DataFrame(data))  # Calculate true ECDF
    mse = mean_squared_error(true_ecdf, estimated_density)
    return mse
# KDE optimization: score a fitted KernelDensity by the MSE of its log-density against zero
def kde_mse(x, kde):
    log_pdf = kde.score_samples(x.reshape(-1, 1))  # score_samples returns log-density
    mse = mean_squared_error(np.zeros_like(x), log_pdf)
    return mse
def sentence_ngrams(text, n):
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    return [sentences[i:i + n] for i in range(len(sentences) - n + 1)]
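
# Illustrative usage: with three sentences and n=2 this yields two overlapping pairs,
# roughly (depending on how TextBlob splits the sentences):
#   sentence_ngrams("It rains. It pours. It stops.", 2)
#   -> [['It rains.', 'It pours.'], ['It pours.', 'It stops.']]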
def create_batches(records, block_size, num_batches, eos_token_id):
    random.shuffle(records)
    # Adding eos_token_id to each record and then checking if it fits in the block
    available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]

    def fill_sequence(sequence, available_records, space_avail):
        if not available_records or space_avail <= 0:
            return sequence, available_records, space_avail
        records_to_remove = []
        for idx, record in available_records:
            record_length = len(record)  # record already includes the eos_token_id
            if record_length <= space_avail:
                sequence.append(record)
                space_avail -= record_length
                records_to_remove.append(idx)
                if space_avail <= 0:
                    break
        # If space is still available, try swapping out the first record
        if space_avail > 0 and len(sequence) > 0:
            removed_record = sequence.pop(0)
            available_records.append((None, removed_record))
            space_avail += len(removed_record)
            available_records = [item for item in available_records if item[0] not in records_to_remove]
            return fill_sequence(sequence, available_records, space_avail)
        else:
            available_records = [item for item in available_records if item[0] not in records_to_remove]
            return sequence, available_records, space_avail

    sequences = []
    space_avail = block_size
    while available_records and len(sequences) < num_batches:
        sequence, available_records, space_avail = fill_sequence([], available_records, space_avail)
        sequences.append(sequence)
        space_avail = block_size
    return sequences, len(available_records), [np.sum([len(t) for t in t_]) for t_ in sequences]
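
# Illustrative usage with hypothetical token ids (eos id 0): pack the records into at
# most 2 blocks of 8 tokens each; returns the packed sequences, the count of records
# left over, and the filled token length of each block.
#   seqs, leftover, fill_lengths = create_batches([[1, 2, 3], [4, 5], [6]], 8, 2, 0)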
# Load the dataset
dataset = load_dataset("Abirate/english_quotes")
quotes = dataset['train']['quote']
quotes = pd.DataFrame([q.replace('”', '').replace('“', '') for q in quotes]).dropna()[0].astype(str).tolist()
# Initialize data with quotes
data = list(quotes)
# Read and convert data from 'graciousquotes.csv' into strings
gracious_quotes = pd.read_csv('graciousquotes.csv', index_col=0).dropna()['0'].astype(str).tolist()
data.extend(gracious_quotes)
# Read and convert data from 'az_quotes.csv' into strings
az_quotes = pd.read_csv('az_quotes.csv', index_col=0).dropna()['0'].astype(str).tolist()
data.extend(az_quotes)
# Randomly sample 1000 items from the combined list
#data = random.sample(data, 1000)
# Remove None values and empty strings from the list
data = [item for item in data if item is not None and item != ""]
# Convert all elements to strings (if they are not already)
data = [str(item) for item in data]
data_ = random.sample(data,1000)
# Load the WordNet dataset and pull its examples and definitions
wordnet_train = load_dataset("marksverdhei/wordnet-definitions-en-2021")['train']
exs = wordnet_train['Example']
defs = wordnet_train['Definition']
# List to hold the titles
idioms = []
json_file_path = 'extracted_idioms.json'
# Read and parse the JSON file
with open(json_file_path, 'r') as file:
    for line in file:
        json_obj = json.loads(line)
        idiom = json_obj['title']
        idioms.append(idiom)
if True:
    nltk.download("brown")
    brown_ = [" ".join(brown.words(fileid)) for fileid in brown.fileids()]
wiki = load_dataset("EleutherAI/wikitext_document_level",'wikitext-103-v1')['train']['page']
#wiki = random.sample(dataset['train']['page'],1000)
essays = load_dataset("qwedsacf/ivypanda-essays")['train']['TEXT']
pre_data = wiki
og_len = len(pre_data)
pre_data = random.sample(pre_data,1000)
#data = []
#for b in pre_data:
#print(b)
#blob = TextBlob(b)
#blob.tags # [('The', 'DT'), ('titular', 'JJ'),
# ('threat', 'NN'), ('of', 'IN'), ...]
#blob.noun_phrases # WordList(['titular threat', 'blob',
# 'ultimate movie monster',
# 'amoeba-like mass', ...])
#for sentence in blob.sentences:
#data.append(str(sentence))
data = []
# Generate sentence bigrams (n=2) from each sampled document
for text in pre_data:
    sentence_bigrams = sentence_ngrams(text, 2)
    # Join each sentence bigram back into a single training example
    for bigram in sentence_bigrams:
        data.append(" ".join(bigram))
og_len = len(data)
data = random.sample(data, 1000)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
# Get the vocabulary size
vocab_size = len(tokenizer)
print("Vocabulary Size:", vocab_size)
tokenized = tokenizer.batch_encode_plus(data)['input_ids']
data_lengths = [len(t) for t in tokenized]
print("records:", len(data), "tokenized:", len(tokenized))
train_tokenized, val_tokenized = train_test_split(tokenized, train_size=.9)
test, _, s = create_batches(val_tokenized, 92, 24, tokenizer.eos_token_id)
print(og_len)
# Outlier detection
detector = KNN()
detector.fit(np.array(data_lengths).reshape(-1, 1))
outliers = detector.predict(np.array(data_lengths).reshape(-1, 1))
non_outlier_lengths = [length for i, length in enumerate(data_lengths) if not outliers[i]]
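# Note: PyOD's predict() returns 1 for outliers and 0 for inliers, so the list
# comprehension above keeps only the lengths labeled as inliers.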
transformations = [np.log, np.sqrt, lambda x: x]
bandwidths = np.linspace(0.1, 1.0, 10)
best_params = {}
best_mse = float('inf')
best_kde = None
for transform in transformations:
    transformed_data = np.array(transform(non_outlier_lengths))
    grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=5)
    grid.fit(transformed_data.reshape(-1, 1))
    kde = grid.best_estimator_
    mse = kde_mse(transformed_data, kde)
    if mse < best_mse:
        best_mse = mse
        best_params = {'transformation': transform.__name__, 'bandwidth': kde.bandwidth}
        best_kde = kde
transformation_functions = {'log': np.log, 'sqrt': np.sqrt, '<lambda>': lambda x: x}
best_transformation = transformation_functions[best_params['transformation']]
# Finding the peak of the KDE
x_d = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 1000)
best_transformed_data = best_transformation(x_d).reshape(-1, 1)
kde_pdf = np.exp(best_kde.score_samples(best_transformed_data))  # exponentiate log-density back to a density
plt.plot(x_d,kde_pdf)
plt.show()
peak = np.argmax(kde_pdf)
peak_value = kde_pdf[peak]
peak_data_length = x_d[peak]
# Focus on the right half of the KDE curve
right_half_x = x_d[peak:]
right_half_pdf = kde_pdf[peak:]
# Compute the EMA for the right half
n = len(right_half_x)
ema_y = np.zeros(n)
alpha = 2 / (n + 1)
ema_y[0] = right_half_pdf[0]
for i in range(1, n):
    ema_y[i] = alpha * right_half_pdf[i] + (1 - alpha) * ema_y[i - 1]
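# Note: this recursion is a standard exponentially weighted moving average; an
# equivalent pandas form (same alpha = 2 / (n + 1)) would be:
#   ema_y = pd.Series(right_half_pdf).ewm(span=n, adjust=False).mean().values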
plt.plot(right_half_x,right_half_pdf)
plt.plot(right_half_x,ema_y)
plt.show()
# Finding the steepest drop (most negative first difference) in the right half of the KDE
ROC = pd.DataFrame(right_half_pdf).diff(1).dropna()
convex_index = np.argmin(ROC.values) + 1
optimal = int(np.round(peak_data_length + right_half_x[convex_index], 0)) - 1
filtered = np.sort([l for l in data_lengths if l <= optimal])
# Visualizations
plt.hist(filtered)
plt.show()

# Plotting the KDE, its peak, and the cutoff point
plt.figure(figsize=(12, 6))
plt.plot(x_d, kde_pdf, 'r-', label='KDE')
plt.axvline(peak_data_length, color='blue', linestyle='--', label=f'KDE Peak at {peak_data_length:.2f}')
if optimal is not None:
    plt.axvline(optimal, color='g', linestyle='--', label=f'New Cutoff Point at x={optimal:.2f}')
plt.title('KDE, Peak, and Cutoff Point')
plt.xlabel('Length of Quotes')
plt.ylabel('Kernel Density Estimate (KDE)')
plt.legend()
plt.grid(True)
plt.show()
print(np.min(non_outlier_lengths), optimal)
print(f"Proportion of quotes within optimal threshold: {len(filtered)/len(data_lengths)}")
# Step 2: Calculate the expected value (mean length) using the KDE PDF
expected_value = simps(kde_pdf * x_d, x_d) / simps(kde_pdf, x_d)
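# The expected value above is E[X] = (integral of x * f(x) dx) / (integral of f(x) dx),
# with both integrals approximated by the composite Simpson's rule (scipy's simps)
# over the x_d grid; the denominator renormalizes the KDE over the plotted range.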
# Step 3: Multiply the expected value by n to estimate the total length
estimated_total_length = og_len * expected_value
print(estimated_total_length)