Skip to content

Instantly share code, notes, and snippets.

View thistleknot's full-sized avatar

Turning out data tricks since 2006! thistleknot

View GitHub Profile
@thistleknot
thistleknot / ECOD_KDE_95_pct_quotes.py
Created January 6, 2024 01:19
ECOD pruned 95% of records using KDE
#!/usr/bin/env python
# coding: utf-8
# In[3]:
#!pip install --upgrade numpy
get_ipython().system('pip install numpy==1.24')
# In[4]:
@thistleknot
thistleknot / efficient_batching.py
Last active January 12, 2024 03:59
efficient batching
def create_batches(records, block_size, num_batches, eos_token_id):
random.shuffle(records)
# Adding eos_token_id to each record and then checking if it fits in the block
available_records = [[i, record + [eos_token_id]] for i, record in enumerate(records) if len(record) + 1 <= block_size]
def fill_sequence(sequence, available_records, space_avail):
if not available_records or space_avail <= 0:
return sequence, available_records, space_avail
@thistleknot
thistleknot / mamba_attn_fff.py
Last active December 29, 2023 00:17
improved mamba with attention and fff
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
#SimplerMambaSSM
#https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY#scrollTo=2lECw6S4N7cn
#!pip install mamba-ssm causal-conv1d
#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
@thistleknot
thistleknot / mamba_trainer.py
Last active December 28, 2023 19:15
mamba trainer
#https://gist.githubusercontent.com/thistleknot/raw/mamba_trainer.py
#!pip install mamba-ssm causal-conv1d
#resources
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
#https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
#https://huggingface.co/clibrain/mamba-2.8b-instruct-openhermes
@thistleknot
thistleknot / tfidf_summ.py
Created December 24, 2023 00:22
tfidf summarizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy.stats as stats
original_text = [
"Don't incur technical debt, fully define what is proposed.",
"Prefer O'Reilly style writing using examples of time-tested failproof boilerplate solutions with docstring comments.",
"Assume user's expertise: Masters in Data Science, Classical Philosophy, and proficiency in AI, Python, SQL.",
"Always deliver production ready code.",
@thistleknot
thistleknot / Mamba-gpt-w-sub-word.py
Last active January 25, 2024 08:55
Mamba Gpt w Sub Word tokenizer
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
#pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#!mkdir differentattention
@thistleknot
thistleknot / mamba-gpt.py
Last active January 26, 2024 08:18
Mamba GPT
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
@thistleknot
thistleknot / llm inference.py
Last active December 12, 2023 00:08
basic inference
#export LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib/
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from datasets import load_dataset
import json
import torch
from tqdm import tqdm
if torch.cuda.is_available():
@thistleknot
thistleknot / dataset_distillation_v5_gpu.py
Last active December 9, 2023 23:10
dataset distillation v5
#!/usr/bin/env python
# coding: utf-8
import torch
import torch.nn.functional as F
from transformers import GPTNeoForCausalLM, AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
@thistleknot
thistleknot / dataset_distillation_v4.py
Created December 9, 2023 04:21
Dataset Distill v4
#!/usr/bin/env python
# coding: utf-8
import torch
import torch.nn.functional as F
from transformers import GPTNeoForCausalLM, AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np