Skip to content

Instantly share code, notes, and snippets.

View thistleknot's full-sized avatar

Turning out data tricks since 2006! thistleknot

View GitHub Profile
@thistleknot
thistleknot / efficient_batching_v2.py
Last active January 19, 2024 02:44
Efficient Batching v2
#This method removes the extracted records from the list passed in (splitting the records between a sample and a remainder).
#Samples are always 100% full of data until no more samples can be extracted; at that point an empty sample is returned along with the remainder [the remainder is to be folded into a new iteration].
# Function to find the combination of values that adds up to the target sum
def find_combination_to_sum(counts, target):
#print("Target inside function (find_combination_to_sum):", target)
values = []
for val, count in counts.items():
#print(f"Value (val): {val}, Type: {type(val)}")
#print(f"Count: {count}, Type: {type(count)}")
@thistleknot
thistleknot / GPT-Neo_Classify.py
Last active January 14, 2024 04:30
GPT-Neo Classify
from transformers import GPT2Tokenizer, GPTNeoForCausalLM
import torch
import torch.nn.functional as F
# Load the GPT-Neo 1.3B model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
# Your question and prompt
question = "Is a bird a mammal?"
@thistleknot
thistleknot / gpt_classify.py
Created January 14, 2024 03:41
GPT Classify
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn.functional as F
# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Your question and prompt
question = "Is a bird a mammal?"
@thistleknot
thistleknot / optimal_cut.py
Last active January 13, 2024 01:48
Optimal Cut
#!pip install --upgrade numpy
!pip install numpy==1.24
from datasets import load_dataset
from pyod.models.knn import KNN
from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model
from scipy import stats
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde
@thistleknot
thistleknot / efficient_batching.py
Last active January 12, 2024 03:59
efficient batching
def create_batches(records, block_size, num_batches, eos_token_id):
random.shuffle(records)
# Adding eos_token_id to each record and then checking if it fits in the block
available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]
def fill_sequence(sequence, available_records, space_avail):
if not available_records or space_avail <= 0:
return sequence, available_records, space_avail
@thistleknot
thistleknot / long_tiny_mistral_2-2b_blank.py
Last active January 8, 2024 02:47
long mistral 2-2b
import json
import argparse
from transformers import MistralConfig, AutoModelForCausalLM
import torch
import sys
import os
def calculate_model_parameters(config):
# Load the model configuration from the JSON file
# Extract the necessary values from the configuration
@thistleknot
thistleknot / ECOD_KDE_95_pct_quotes.py
Created January 6, 2024 01:19
ECOD pruned 95% of records using KDE
#!/usr/bin/env python
# coding: utf-8
# In[3]:
#!pip install --upgrade numpy
get_ipython().system('pip install numpy==1.24')
# In[4]:
@thistleknot
thistleknot / mamba_attn_fff.py
Last active December 29, 2023 00:17
improved mamba with attention and fff
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
#SimplerMambaSSM
#https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY#scrollTo=2lECw6S4N7cn
#!pip install mamba-ssm causal-conv1d
#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
@thistleknot
thistleknot / mamba_trainer.py
Last active December 28, 2023 19:15
mamba trainer
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
#!pip install mamba-ssm causal-conv1d
#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
#https://huggingface.co/clibrain/mamba-2.8b-instruct-openhermes
@thistleknot
thistleknot / tfidf_summ.py
Created December 24, 2023 00:22
tfidf summarizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy.stats as stats
original_text = [
"Don't incur technical debt, fully define what is proposed.",
"Prefer O'Reilly style writing using examples of time-tested failproof boilerplate solutions with docstring comments.",
"Assume user's expertise: Masters in Data Science, Classical Philosophy, and proficiency in AI, Python, SQL.",
"Always deliver production ready code.",