Turning out data tricks since 2006! thistleknot

## efficient_batching_v2.py
# This method removes records from the list passed in, splitting them between a sample and a remainder.
# Each sample is always 100% full of data. Once no more full samples can be extracted, an empty sample is returned along with the remainder [the remainder is meant to be folded into a new iteration].

# Function to find the combination of values that adds up to the target sum
def find_combination_to_sum(counts, target):
    #print("Target inside function (find_combination_to_sum):", target)
    values = []
    for val, count in counts.items():
        #print(f"Value (val): {val}, Type: {type(val)}")
        #print(f"Count: {count}, Type: {type(count)}")

## GPT-Neo_Classify.py
from transformers import GPT2Tokenizer, GPTNeoForCausalLM
import torch
import torch.nn.functional as F

# Load the GPT-Neo 1.3B model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

# Your question and prompt
question = "Is a bird a mammal?"

## gpt_classify.py
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn.functional as F

# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Your question and prompt
question = "Is a bird a mammal?"

## optimal_cut.py
#!pip install --upgrade numpy
!pip install numpy==1.24


from datasets import load_dataset
from pyod.models.knn import KNN
from pyod.models.knn import KNN   # Example: You can use K-Nearest Neighbors as an ECOD model
from scipy import stats
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde

## efficient_batching.py
def create_batches(records, block_size, num_batches, eos_token_id):
    random.shuffle(records)

    # Adding eos_token_id to each record and then checking if it fits in the block
    available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]

    def fill_sequence(sequence, available_records, space_avail):
        if not available_records or space_avail <= 0:
            return sequence, available_records, space_avail

## long_tiny_mistral_2-2b_blank.py
import json
import argparse
from transformers import MistralConfig, AutoModelForCausalLM
import torch
import sys
import os

def calculate_model_parameters(config):
    # Load the model configuration from the JSON file
    # Extract the necessary values from the configuration

## ECOD_KDE_95_pct_quotes.py
#!/usr/bin/env python
# coding: utf-8

# In[3]:

#!pip install --upgrade numpy
get_ipython().system('pip install numpy==1.24')

# In[4]:

## mamba_attn_fff.py
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
#SimplerMambaSSM
#https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY#scrollTo=2lECw6S4N7cn

#!pip install mamba-ssm causal-conv1d

#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py

## mamba_trainer.py
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py

#!pip install mamba-ssm causal-conv1d

#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
#https://huggingface.co/clibrain/mamba-2.8b-instruct-openhermes

## tfidf_summ.py
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy.stats as stats

original_text = [
"Don't incur technical debt, fully define what is proposed.",
"Prefer O'Reilly style writing using examples of time-tested failproof boilerplate solutions with docstring comments.",
"Assume user's expertise: Masters in Data Science, Classical Philosophy, and proficiency in AI, Python, SQL.",
"Always deliver production ready code.",
	# This method removes records from the list passed in, splitting them between a sample and a remainder.
	# Each sample is always 100% full of data. Once no more full samples can be extracted, an empty sample is returned along with the remainder [the remainder is meant to be folded into a new iteration].

	# Function to find the combination of values that adds up to the target sum
	def find_combination_to_sum(counts, target):
	#print("Target inside function (find_combination_to_sum):", target)
	values = []
	for val, count in counts.items():
	#print(f"Value (val): {val}, Type: {type(val)}")
	#print(f"Count: {count}, Type: {type(count)}")
	from transformers import GPT2Tokenizer, GPTNeoForCausalLM
	import torch
	import torch.nn.functional as F

	# Load the GPT-Neo 1.3B model and tokenizer
	tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
	model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

	# Your question and prompt
	question = "Is a bird a mammal?"
	#!pip install --upgrade numpy
	!pip install numpy==1.24


	from datasets import load_dataset
	from pyod.models.knn import KNN
	from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model
	from scipy import stats
	from scipy.interpolate import UnivariateSpline
	from scipy.stats import gaussian_kde
	def create_batches(records, block_size, num_batches, eos_token_id):
	random.shuffle(records)

	# Adding eos_token_id to each record and then checking if it fits in the block
	available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]

	def fill_sequence(sequence, available_records, space_avail):
	if not available_records or space_avail <= 0:
	return sequence, available_records, space_avail
	import json
	import argparse
	from transformers import MistralConfig, AutoModelForCausalLM
	import torch
	import sys
	import os

	def calculate_model_parameters(config):
	# Load the model configuration from the JSON file
	# Extract the necessary values from the configuration
	#!/usr/bin/env python
	# coding: utf-8

	# In[3]:

	#!pip install --upgrade numpy
	get_ipython().system('pip install numpy==1.24')

	# In[4]:
	#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
	#SimplerMambaSSM
	#https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY#scrollTo=2lECw6S4N7cn

	#!pip install mamba-ssm causal-conv1d

	#resources
	#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
	#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
	#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
	from textblob import TextBlob
	from sklearn.feature_extraction.text import TfidfVectorizer
	import numpy as np
	import scipy.stats as stats

	original_text = [
	"Don't incur technical debt, fully define what is proposed.",
	"Prefer O'Reilly style writing using examples of time-tested failproof boilerplate solutions with docstring comments.",
	"Assume user's expertise: Masters in Data Science, Classical Philosophy, and proficiency in AI, Python, SQL.",
	"Always deliver production ready code.",