Skip to content

Instantly share code, notes, and snippets.

View thistleknot's full-sized avatar

Turning out data tricks since 2006! thistleknot

View GitHub Profile
@thistleknot
thistleknot / loreft.py
Last active April 20, 2024 08:17
loreft continued pretraining using completion
import torch
import transformers
import pyreft
import os
from datasets import load_dataset
import pandas as pd
#pd.DataFrame([len(q) for q in quotes]).describe()
#pd.DataFrame([len(q) for q in quotes]).hist()
import numpy as np
@thistleknot
thistleknot / script.py
Last active March 2, 2024 02:19
text-generation-webui extension - RAG google/duckduckgo search (async) w faiss
#for data txt files see: https://github.com/TheCynosure/smmry_impl
#example use
"""
Search_web("history of Taco Tuesday")
Tell me about this.
"""
#get google api keys
#https://console.cloud.google.com/apis/dashboard
#https://programmablesearchengine.google.com/controlpanel/all
#could be retooled quite easily to use duckduckgo_search rather than google, so you don't have to mess with getting api keys
@thistleknot
thistleknot / yahoo_finance.py
Last active February 11, 2024 21:25
how to pull yahoo finance data
def get_v1_url(symbol, period_type, crumb):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
}
period1 = 493590046
period2 = 1913180947
@thistleknot
thistleknot / minimum nanogpt mamba
Last active January 27, 2024 18:48
minimum nanogpt mamba
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from tqdm import tqdm
from mamba_ssm import Mamba
#hyperparams
epochs = 100
lr = 1e-3
batch_size = 64
@thistleknot
thistleknot / train_mamba.py
Last active January 22, 2024 05:05
Train Mamba
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import wandb
from datasets import load_dataset
import torch
import os
import argparse
import numpy as np
import pandas as pd
from transformers import EvalPrediction
from torch.utils.data import DataLoader
@thistleknot
thistleknot / efficient_batching_v2.py
Last active January 19, 2024 02:44
Efficient Batching v2
#This method deducts from the list sent in (splitting the records between sample and remainder).
#Always 100% full of data until no more samples can be extracted, at which point an empty sample is returned along with the remainder [the remainder is to be folded into a new iteration]
# Function to find the combination of values that adds up to the target sum
def find_combination_to_sum(counts, target):
#print("Target inside function (find_combination_to_sum):", target)
values = []
for val, count in counts.items():
#print(f"Value (val): {val}, Type: {type(val)}")
#print(f"Count: {count}, Type: {type(count)}")
@thistleknot
thistleknot / GPT-Neo_Classify.py
Last active January 14, 2024 04:30
Gpt-neo Classify
from transformers import GPT2Tokenizer, GPTNeoForCausalLM
import torch
import torch.nn.functional as F
# Load the GPT-Neo 1.3B model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
# Your question and prompt
question = "Is a bird a mammal?"
@thistleknot
thistleknot / gpt_classify.py
Created January 14, 2024 03:41
GPT Classify
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn.functional as F
# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Your question and prompt
question = "Is a bird a mammal?"
@thistleknot
thistleknot / optimal_cut.py
Last active January 13, 2024 01:48
Optimal Cut
#!pip install --upgrade numpy
!pip install numpy==1.24
from datasets import load_dataset
from pyod.models.knn import KNN
from pyod.models.knn import KNN # Example: You can use K-Nearest Neighbors as an ECOD model
from scipy import stats
from scipy.interpolate import UnivariateSpline
from scipy.stats import gaussian_kde
@thistleknot
thistleknot / ECOD_KDE_95_pct_quotes.py
Created January 6, 2024 01:19
ECOD pruned 95% of records using KDE
#!/usr/bin/env python
# coding: utf-8
# In[3]:
#!pip install --upgrade numpy
get_ipython().system('pip install numpy==1.24')
# In[4]: