thistleknot
Turning out data tricks since 2006!

@thistleknot
thistleknot / loreft.py
Last active April 20, 2024 08:17
LoReFT continued pretraining using completion-style data
import torch
import transformers
import pyreft
import os
from datasets import load_dataset
import pandas as pd
#pd.DataFrame([len(q) for q in quotes]).describe()
#pd.DataFrame([len(q) for q in quotes]).hist()
import numpy as np
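A minimal sketch of where these imports typically lead, following pyreft's documented pattern; the base model and rank are placeholders, not necessarily this gist's choices:

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder base model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="cuda")
# attach a rank-4 LoReFT intervention to one layer's residual stream output
reft_config = pyreft.ReftConfig(representations={
    "layer": 15, "component": "block_output",
    "intervention": pyreft.LoreftIntervention(
        embed_dim=model.config.hidden_size, low_rank_dimension=4)})
reft_model = pyreft.get_reft_model(model, reft_config)
reft_model.print_trainable_parameters()  # only the intervention weights train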
@thistleknot
thistleknot / datasets.txt
Last active March 25, 2024 07:14
datasets
Target
Phi-1: 7 billion tokens
#https://clarifai.com/microsoft/text-generation/models/phi-1_5
Phi-1.5 was trained on 150 billion tokens, with 20% from phi-1's training data (7B tokens) and 80% from the newly created synthetic, “textbook-like” data (roughly 20B tokens), for the purpose of teaching common sense reasoning and general knowledge of the world (science, daily activities, theory of mind, etc.).
Base Model
X marksverdhei/wordnet-definitions-en-2021
X Wiki-text
X idioms
X sep
@thistleknot
thistleknot / gpt2api
Created August 13, 2023 18:47
GPT2 Batching API
from fastapi import FastAPI, Depends
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from typing import List
import time
from threading import Thread, Lock
import torch
app = FastAPI()
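A sketch of how a batched generation endpoint could follow from this setup; the route, request schema, and use of the lock are assumptions rather than the gist's exact code:

class GenerateRequest(BaseModel):
    prompts: List[str]
    max_new_tokens: int = 50

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # left-pad so generation continues from each prompt's end
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
lock = Lock()  # serialize model access across request threads

@app.post("/generate")
def generate(req: GenerateRequest):
    # tokenize all prompts together so they pass through the model as one batch
    batch = tokenizer(req.prompts, return_tensors="pt", padding=True)
    with lock, torch.no_grad():
        out = model.generate(**batch, max_new_tokens=req.max_new_tokens,
                             pad_token_id=tokenizer.eos_token_id)
    return {"completions": tokenizer.batch_decode(out, skip_special_tokens=True)}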
@thistleknot
thistleknot / script.py
Last active March 2, 2024 02:19
text-generation-webui extension - RAG google/duckduckgo search (async) w faiss
#for data txt files see: https://github.com/TheCynosure/smmry_impl
#example use
"""
Search_web("history of Taco Tuesday")
Tell me about this.
"""
#get Google API keys:
#https://console.cloud.google.com/apis/dashboard
#https://programmablesearchengine.google.com/controlpanel/all
#could be retooled quite easily to use duckduckgo_search instead of Google, which avoids having to set up API keys
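A sketch of that duckduckgo_search variant, assuming that library's DDGS text API:

from duckduckgo_search import DDGS

def search_web(query, max_results=5):
    # returns the snippet text of the top results; no API key needed
    with DDGS() as ddgs:
        return [r["body"] for r in ddgs.text(query, max_results=max_results)]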
@thistleknot
thistleknot / yahoo_finance.py
Last active February 11, 2024 21:25
how to pull yahoo finance data
def get_v1_url(symbol, period_type, crumb):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    period1 = 493590046   # range start (Unix epoch seconds, mid-1985)
    period2 = 1913180947  # range end (Unix epoch seconds, 2030): effectively all available history
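    # a guess at how this helper likely finishes: Yahoo's crumb-authenticated CSV
    # download endpoint (the URL shape and return value are assumptions, not the gist's code)
    url = (f"https://query1.finance.yahoo.com/v7/finance/download/{symbol}"
           f"?period1={period1}&period2={period2}"
           f"&interval={period_type}&events=history&crumb={crumb}")
    return url, headers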
@thistleknot
thistleknot / minimum nanogpt mamba
Last active January 27, 2024 18:48
minimum nanogpt mamba
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from tqdm import tqdm
from mamba_ssm import Mamba
#hyperparams
epochs = 100
lr = 1e-3
batch_size = 64
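A sketch of the model these hyperparameters presumably drive: a nanoGPT-style stack with mamba_ssm's Mamba block standing in for attention (width and depth are placeholders):

class MambaLM(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList(Mamba(d_model=d_model) for _ in range(n_layers))
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, idx):  # idx: (batch, seq_len) token ids
        x = self.embed(idx)
        for layer in self.layers:
            x = x + layer(x)  # residual connection around each Mamba block
        return self.head(self.norm(x))  # logits: (batch, seq_len, vocab_size)

Note that mamba_ssm's kernels require the model and inputs to live on a CUDA device.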
@thistleknot
thistleknot / mamba-gpt.py
Last active January 26, 2024 08:18
Mamba GPT
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
@thistleknot
thistleknot / Mamba-gpt-w-sub-word.py
Last active January 25, 2024 08:55
Mamba Gpt w Sub Word tokenizer
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
#pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#!mkdir differentattention
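The sub-word step presumably replaces nanoGPT's character vocabulary; a sketch using the huggingface tokenizers library to train a small BPE vocabulary on input.txt (the vocabulary size is a placeholder):

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

tok = Tokenizer(models.BPE(unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["[UNK]"])
tok.train(["input.txt"], trainer)  # the tinyshakespeare file downloaded above
ids = tok.encode("To be, or not to be").ids  # sub-word token ids for the model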
@thistleknot
thistleknot / train_mamba.py
Last active January 22, 2024 05:05
Train Mamba
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import wandb
from datasets import load_dataset
import torch
import os
import argparse
import numpy as np
import pandas as pd
from transformers import EvalPrediction
from torch.utils.data import DataLoader
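A sketch of how these imports commonly fit together for causal-LM training with the HF Trainer; the checkpoint, data files, and hyperparameters are placeholders, not the gist's choices:

from transformers import DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint

ds = load_dataset("text", data_files="input.txt")["train"]  # any dataset with a "text" column
tokenized = ds.map(lambda b: tokenizer(b["text"], truncation=True, max_length=512),
                   batched=True, remove_columns=ds.column_names)
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)  # copies input_ids to labels for causal-LM loss

args = TrainingArguments(output_dir="out", per_device_train_batch_size=4,
                         num_train_epochs=1, report_to="wandb")
Trainer(model=model, args=args, train_dataset=tokenized,
        data_collator=collator).train()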
@thistleknot
thistleknot / efficient_batching_v2.py
Last active January 19, 2024 02:44
Efficient Batching v2
#This method deducts from the list passed in, splitting the records between a sample and a remainder.
#Each sample is packed 100% full until no more samples can be extracted, at which point an empty sample is returned along with the remainder (the remainder is then folded into a new iteration).
# Function to find the combination of values that adds up to the target sum
def find_combination_to_sum(counts, target):
    #print("Target inside function (find_combination_to_sum):", target)
    values = []
    for val, count in counts.items():
        #print(f"Value (val): {val}, Type: {type(val)}")
        #print(f"Count: {count}, Type: {type(count)}")