Optimal Cut
#!pip install --upgrade numpy
!pip install numpy==1.24
from datasets import load_dataset
from pyod.models.knn import KNN  # k-nearest-neighbors outlier detector from PyOD
from scipy import stats
from scipy.integrate import simps
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KernelDensity, LocalOutlierFactor
from sklearn.preprocessing import FunctionTransformer
from transformers import AutoTokenizer
from textblob import TextBlob
import nltk
from nltk.corpus import brown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import random
#model.fit(test, metric='euclidean')

# Tokenize a list of texts and keep only those whose token count falls in [min_size, max_size]
def tokenize_and_filter(dataset, min_size, max_size):
    tokenized = tokenizer.batch_encode_plus(dataset)['input_ids']
    filtered = [tokens for tokens in tokenized if min_size <= len(tokens) <= max_size]
    return filtered
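# Illustrative usage with hypothetical texts (requires `tokenizer`, which is defined further below):
#kept = tokenize_and_filter(["a short quote", "another example sentence"], min_size=2, max_size=64)
#print([len(t) for t in kept])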
# Define the ECDF function (mid-rank ECDF: average the "<=" and "<" counts, divided by n)
def ecdf2(values):
    co = len(values)
    externalArray = pd.DataFrame()
    for d in range(0, len(values.columns)):
        internalArray = []
        for i in range(0, len(values.iloc[:, d])):
            a = (
                sum(values.iloc[:, d] <= values.iloc[i, d]) +
                sum(values.iloc[:, d] < values.iloc[i, d])
            ) / 2 / co
            internalArray.append(a)
        externalArray = pd.concat(
            [externalArray, pd.DataFrame(internalArray).round(2)], axis=1
        )
    return externalArray
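# Quick self-contained check of ecdf2 on toy values (not part of the pipeline):
# for [1, 2, 2, 4] the mid-rank ECDF comes out as 0.12, 0.50, 0.50, 0.88.
#print(ecdf2(pd.DataFrame({'x': [1, 2, 2, 4]})))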
# Scoring function comparing a KDE's output against the empirical CDF with mean squared error.
# Note: `kde` is assumed to be callable (e.g. scipy's gaussian_kde); this helper is not used below.
def mse_scoring_function(kde, data):
    estimated_density = kde(data)
    true_ecdf = ecdf2(pd.DataFrame(data))  # empirical CDF of the data
    mse = mean_squared_error(true_ecdf, estimated_density)
    return mse
# KDE optimization criterion: MSE between the log-density (score_samples returns log p(x)) and zero
def kde_mse(x, kde):
    pdf = kde.score_samples(x.reshape(-1, 1))
    mse = mean_squared_error(np.zeros_like(x), pdf)
    return mse
# Split `text` into sentences with TextBlob and return overlapping windows of n consecutive sentences
def sentence_ngrams(text, n):
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    return [sentences[i:i+n] for i in range(len(sentences)-n+1)]
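# Illustrative usage on a made-up snippet (pure function, safe to uncomment):
#print(sentence_ngrams("First sentence. Second sentence. Third sentence.", 2))
# -> [['First sentence.', 'Second sentence.'], ['Second sentence.', 'Third sentence.']]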
# Greedily pack tokenized records (each terminated with eos_token_id) into sequences of at most block_size tokens
def create_batches(records, block_size, num_batches, eos_token_id):
    random.shuffle(records)
    # Add eos_token_id to each record and keep only records that fit within a block
    available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]

    def fill_sequence(sequence, available_records, space_avail):
        if not available_records or space_avail <= 0:
            return sequence, available_records, space_avail
        records_to_remove = []
        for idx, record in available_records:
            record_length = len(record)  # record already includes the eos_token_id
            if record_length <= space_avail:
                sequence.append(record)
                space_avail -= record_length
                records_to_remove.append(idx)
                if space_avail <= 0:
                    break
        # If space is still available, try swapping out the first record and refilling
        if space_avail > 0 and len(sequence) > 0:
            removed_record = sequence.pop(0)
            available_records.append((None, removed_record))
            space_avail += len(removed_record)
            available_records = [item for item in available_records if item[0] not in records_to_remove]
            return fill_sequence(sequence, available_records, space_avail)
        else:
            available_records = [item for item in available_records if item[0] not in records_to_remove]
            return sequence, available_records, space_avail

    sequences = []
    space_avail = block_size
    while available_records and len(sequences) < num_batches:
        sequence, available_records, space_avail = fill_sequence([], available_records, space_avail)
        sequences.append(sequence)
        space_avail = block_size
    # Return the packed sequences, how many records were left over, and the token count per sequence
    return sequences, len(available_records), [np.sum([len(t) for t in t_]) for t_ in sequences]
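# Illustrative usage with toy token ids (eos id 0 is assumed only for this example):
#toy_batches, toy_leftover, toy_sizes = create_batches([[1, 2, 3], [4, 5], [6, 7, 8, 9]], block_size=8, num_batches=2, eos_token_id=0)
#print(toy_leftover, toy_sizes)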
# Load the quotes dataset
dataset = load_dataset("Abirate/english_quotes")
quotes = dataset['train']['quote']
quotes = pd.DataFrame([q.replace('”', '').replace('“', '') for q in quotes]).dropna().values
# Initialize data with quotes
data = list(quotes)
# Read and convert data from 'graciousquotes.csv' into strings
gracious_quotes = pd.read_csv('graciousquotes.csv', index_col=0).dropna()['0'].astype(str).tolist()
data.extend(gracious_quotes)
# Read and convert data from 'az_quotes.csv' into strings
az_quotes = pd.read_csv('az_quotes.csv', index_col=0).dropna()['0'].astype(str).tolist()
data.extend(az_quotes)
# Randomly sample 1000 items from the combined list
#data = random.sample(data, 1000)
# Remove None values and empty strings from the list
data = [item for item in data if item is not None and item != ""]
# Convert all elements to strings (if they are not already)
data = [str(item) for item in data]
data_ = random.sample(data, 1000)
# Load WordNet examples and definitions (loaded here but not used further below)
exs = load_dataset("marksverdhei/wordnet-definitions-en-2021")['train']['Example']
defs = load_dataset("marksverdhei/wordnet-definitions-en-2021")['train']['Definition']
# List to hold the idiom titles
idioms = []
json_file_path = 'extracted_idioms.json'
# Read and parse the JSON file (one JSON object per line)
with open(json_file_path, 'r') as file:
    for line in file:
        json_obj = json.loads(line)
        idiom = json_obj['title']
        idioms.append(idiom)
if True:
    nltk.download("brown")
    brown_ = [" ".join(brown.words(fileid)) for fileid in brown.fileids()]
    wiki = load_dataset("EleutherAI/wikitext_document_level", 'wikitext-103-v1')['train']['page']
    #wiki = random.sample(dataset['train']['page'],1000)
    essays = load_dataset("qwedsacf/ivypanda-essays")['train']['TEXT']

pre_data = wiki
og_len = len(pre_data)
pre_data = random.sample(pre_data, 1000)
#data = []
#for b in pre_data:
    #print(b)
    #blob = TextBlob(b)
    #blob.tags # [('The', 'DT'), ('titular', 'JJ'),
    #  ('threat', 'NN'), ('of', 'IN'), ...]
    #blob.noun_phrases # WordList(['titular threat', 'blob',
    #  'ultimate movie monster',
    #  'amoeba-like mass', ...])
    #for sentence in blob.sentences:
        #data.append(str(sentence))
data = []
# Generate sentence bigrams (n=2) from each sampled document
for text in pre_data:
    sentence_bigrams = sentence_ngrams(text, 2)
    # Join each sentence bigram back into a single training example
    for bigram in sentence_bigrams:
        data.append(" ".join(bigram))
og_len = len(data)
data = random.sample(data, 1000)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
# Override the eos token and reuse it as the pad token
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
# Get the vocabulary size
vocab_size = len(tokenizer)
print("Vocabulary Size:", vocab_size)
tokenized = tokenizer.batch_encode_plus(data)['input_ids']
data_lengths = [len(t) for t in tokenized]
print(len(data), len(tokenized))
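# Quick look at the token-length distribution before outlier removal (an added sanity check, summary stats only)
print(pd.Series(data_lengths).describe())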
train_tokenized, val_tokenized = train_test_split(tokenized, train_size=0.9)
test, _, s = create_batches(val_tokenized, 92, 24, tokenizer.eos_token_id)
print(og_len)
# Outlier detection: drop unusually long/short records before fitting the KDE
detector = KNN()
detector.fit(np.array(data_lengths).reshape(-1, 1))
outliers = detector.predict(np.array(data_lengths).reshape(-1, 1))
non_outlier_lengths = [length for i, length in enumerate(data_lengths) if not outliers[i]]
# Try a few monotone transformations and bandwidths; keep the KDE with the lowest MSE criterion
transformations = [np.log, np.sqrt, lambda x: x]
bandwidths = np.linspace(0.1, 1.0, 10)
best_params = {}
best_mse = float('inf')
best_kde = None
for transform in transformations:
    transformed_data = np.array(transform(non_outlier_lengths))
    grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=5)
    grid.fit(transformed_data.reshape(-1, 1))
    kde = grid.best_estimator_
    mse = kde_mse(transformed_data, kde)
    if mse < best_mse:
        best_mse = mse
        best_params = {'transformation': transform.__name__, 'bandwidth': kde.bandwidth}
        best_kde = kde
# Map the recorded __name__ back to the transformation function (a bare lambda reports '<lambda>')
transformation_functions = {'log': np.log, 'sqrt': np.sqrt, '<lambda>': lambda x: x}
best_transformation = transformation_functions[best_params['transformation']]
# Finding the peak of the KDE on the original length scale
x_d = np.linspace(min(non_outlier_lengths), max(non_outlier_lengths), 1000)
best_transformed_data = best_transformation(x_d).reshape(-1, 1)
kde_pdf = np.exp(best_kde.score_samples(best_transformed_data))
plt.plot(x_d, kde_pdf)
plt.show()
peak = np.argmax(kde_pdf)
peak_value = kde_pdf[peak]
peak_data_length = x_d[peak]
# Focus on the right half of the KDE curve (lengths above the peak)
right_half_x = x_d[peak:]
right_half_pdf = kde_pdf[peak:]
# Compute an exponential moving average (EMA) over the right half
n = len(right_half_x)
ema_y = np.zeros(n)
alpha = 2 / (n + 1)
ema_y[0] = right_half_pdf[0]
for i in range(1, n):
    ema_y[i] = alpha * right_half_pdf[i] + (1 - alpha) * ema_y[i - 1]
plt.plot(right_half_x, right_half_pdf)
plt.plot(right_half_x, ema_y)
plt.show()
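# Cross-check of the hand-rolled EMA above using pandas' ewm with the same alpha
# (adjust=False matches the recursive definition); added for illustration only.
ema_check = pd.Series(right_half_pdf).ewm(alpha=alpha, adjust=False).mean().to_numpy()
print("EMA matches pandas ewm:", np.allclose(ema_y, ema_check))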
# Locate the steepest drop in the right half of the KDE (most negative first difference)
ROC = pd.DataFrame(right_half_pdf).diff(1).dropna()
convex_index = np.argmin(ROC.values) + 1
# Cutoff heuristic: peak length plus the length at the steepest drop, less one
optimal = int(np.round(peak_data_length + right_half_x[convex_index], 0)) - 1
filtered = np.sort([l for l in data_lengths if l <= optimal])
# Visualizations
plt.hist(filtered)
plt.show()

# Plotting the KDE, its peak, and the cutoff point
plt.figure(figsize=(12, 6))
plt.plot(x_d, kde_pdf, 'r-', label='KDE')
plt.axvline(peak_data_length, color='blue', linestyle='--', label=f'KDE Peak at {peak_data_length:.2f}')
if optimal is not None:
    plt.axvline(optimal, color='g', linestyle='--', label=f'New Cutoff Point at x={optimal:.2f}')
plt.title('KDE, Peak, and Cutoff Point')
plt.xlabel('Length of Quotes')
plt.ylabel('Kernel Density Estimate (KDE)')
plt.legend()
plt.grid(True)
print(np.min(non_outlier_lengths), optimal)
print(f"Proportion of quotes within optimal threshold: {len(filtered)/len(data_lengths)}")
# Estimate the expected (mean) length as the PDF-weighted average over the grid
expected_value = simps(kde_pdf * x_d, x_d) / simps(kde_pdf, x_d)
# Multiply the expected value by the original number of records to estimate the total token count
estimated_total_length = og_len * expected_value
print(estimated_total_length)
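# Cross-check of the Simpson's-rule expectation with the trapezoidal rule (should agree closely); illustrative only.
expected_value_trapz = np.trapz(kde_pdf * x_d, x_d) / np.trapz(kde_pdf, x_d)
print("Expected length (simps vs trapz):", expected_value, expected_value_trapz)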