thistleknot: Turning out data tricks since 2006!

thistleknot / dataset_distillation.py
Last active December 6, 2023 05:19
Dataset Distillation v3
import torch
import torch.nn.functional as F
from transformers import GPTNeoForCausalLM, AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
# Parameters
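The preview cuts off at the parameter block. For context, here is a minimal, self-contained sketch of the gradient-matching form of dataset distillation on toy tensors; it illustrates the general technique, not the gist's GPT-Neo pipeline, and all names and sizes below are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
n_classes, n_features, n_synthetic = 3, 20, 30

# Stand-in for real data (the gist instead loads a Hugging Face text dataset).
X_real = torch.randn(600, n_features)
y_real = torch.randint(0, n_classes, (600,))

# The distilled dataset: a small set of learnable synthetic examples.
X_syn = torch.randn(n_synthetic, n_features, requires_grad=True)
y_syn = torch.arange(n_synthetic) % n_classes

model = nn.Linear(n_features, n_classes)  # kept fixed here for brevity
opt_syn = torch.optim.Adam([X_syn], lr=0.01)

for step in range(200):
    # Gradients of the loss on real data w.r.t. the model weights.
    g_real = torch.autograd.grad(F.cross_entropy(model(X_real), y_real),
                                 list(model.parameters()))
    # The same gradients on synthetic data, kept in the graph so X_syn is trainable.
    g_syn = torch.autograd.grad(F.cross_entropy(model(X_syn), y_syn),
                                list(model.parameters()), create_graph=True)
    # Update the synthetic examples so the two gradients match.
    loss = sum(F.mse_loss(a, b) for a, b in zip(g_syn, g_real))
    opt_syn.zero_grad()
    loss.backward()
    opt_syn.step()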
thistleknot / dd_kaggle.py
Created December 2, 2023 16:37
dataset distillation extracted from kaggle
#https://www.kaggle.com/code/samuelcortinhas/mnist-dataset-distillation
# Core
import numpy as np
np.random.seed(0)
import pandas as pd
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
import matplotlib.pyplot as plt
%matplotlib inline
import time
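The preview stops at the imports. A plausible next step in a Kaggle MNIST notebook, reusing the imports above, is loading and inspecting the digits; the file path and column names here are assumptions about the standard digit-recognizer dataset, not lines from the gist.
train = pd.read_csv('../input/digit-recognizer/train.csv')
y = train['label'].values
X = train.drop(columns='label').values.reshape(-1, 28, 28) / 255.0  # normalise pixels to [0, 1]

# Quick sanity check: plot the first few digits.
fig, axes = plt.subplots(1, 5, figsize=(10, 2))
for ax, img, lab in zip(axes, X[:5], y[:5]):
    ax.imshow(img, cmap='gray')
    ax.set_title(int(lab))
    ax.axis('off')
plt.show()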
thistleknot / parse_grep.py
Created December 2, 2023 03:49
Parse Grep
import pandas as pd
# Step 1: Read and parse the input file
parsed_data = []
with open('output.txt', 'r') as file: # Replace 'output.txt' with your file path
    for line in file:
        if line.strip():  # Skip empty lines
            first_colon_index = line.find(':')
            if first_colon_index != -1:
                filepath = line[:first_colon_index].strip()
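The preview ends mid-loop. A compact, self-contained version of the same first-colon split, finishing with the DataFrame that the parsed_data list is presumably headed for (the column names and the final step are assumptions):
import pandas as pd

parsed_data = []
with open('output.txt', 'r') as file:
    for line in file:
        if line.strip() and ':' in line:
            filepath, _, rest = line.partition(':')  # split at the first colon only
            parsed_data.append({'filepath': filepath.strip(), 'match': rest.strip()})

df = pd.DataFrame(parsed_data)
print(df.head())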
thistleknot / tensorboard.py
Last active December 5, 2023 07:18
tensorboard with ema embeddings
#!/usr/bin/env python
# coding: utf-8
import lade
from transformers import AutoTokenizer, AutoModel
import torch
lade.augment_all()
lade.config_lade(LEVEL=5, WINDOW_SIZE=7, GUESS_SET_SIZE=7, DEBUG=0)
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
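The preview shows only the imports and the lookahead-decoding (lade) setup. A minimal sketch of the "ema embeddings" part, reusing the imports above: mean-pool hidden states into sentence vectors, smooth them with an exponential moving average, and log them to TensorBoard's projector. The model name, example texts, and decay value are assumptions, not the gist's.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
writer = SummaryWriter("runs/ema_embeddings")

texts = ["first example", "second example", "third example"]
ema, decay, vectors = None, 0.9, []
for text in texts:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state.mean(dim=1).squeeze(0)  # mean-pooled sentence vector
    ema = hidden if ema is None else decay * ema + (1 - decay) * hidden    # exponential moving average
    vectors.append(ema.clone())

writer.add_embedding(torch.stack(vectors), metadata=texts, tag="ema_embeddings")
writer.close()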
thistleknot / tree_of_thoughts.py
Last active November 28, 2023 06:20
Tree of Thoughts
# Revised Tree of Thought Template
## Problem Identification
- **Problem:** Clearly define the problem or decision that needs to be addressed.
## Idea Generation and Processing
- **Preparation:**
- Define the context and constraints of the problem.
- Generate probable ideas for addressing the problem.
1. example idea
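The template continues past this point in the gist. As a companion sketch only, here is a hypothetical driver loop for a template like this; llm below is a placeholder for whatever completion call you use and is not part of the gist.
def llm(prompt: str) -> str:
    raise NotImplementedError("plug in your completion API here")  # hypothetical stub

def tree_of_thoughts(problem: str, breadth: int = 3, depth: int = 2) -> str:
    # Generate candidate ideas, score them, and keep expanding only the best branches.
    frontier = [problem]
    for _ in range(depth):
        candidates = []
        for node in frontier:
            for _ in range(breadth):
                idea = llm(f"Context:\n{node}\nGenerate one probable idea for addressing the problem.")
                score = float(llm(f"Rate this idea from 0 to 1:\n{idea}"))
                candidates.append((score, node + "\n- " + idea))
        frontier = [path for _, path in sorted(candidates, reverse=True)[:breadth]]
    return frontier[0]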
thistleknot / turing_prompt.txt
Created November 25, 2023 20:40
turing prompt
Problem: 2, 3, 1, 5
EXECUTION
Prep
Length of the list: 4
Number of consecutive pairs: 3
a=[2 3 1 5]
set n_swaps=0
EndPrep
Iteration:
set swap_flag=false. The state is:
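The trace reads like a hand-executed bubble sort (swap_flag, n_swaps, pairwise comparisons). For reference, a Python version of that procedure; this is my reading of the prompt, not code from the gist.
def bubble_sort(a):
    a = list(a)
    n_swaps = 0
    while True:
        swap_flag = False
        for i in range(len(a) - 1):  # number of consecutive pairs = len(a) - 1
            if a[i] > a[i + 1]:
                a[i], a[i + 1] = a[i + 1], a[i]
                swap_flag = True
                n_swaps += 1
        if not swap_flag:
            break
    return a, n_swaps

print(bubble_sort([2, 3, 1, 5]))  # -> ([1, 2, 3, 5], 2)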
thistleknot / tiny_mistral.py
Last active November 22, 2023 03:50
tiny mistral
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import torch
nf4_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16
)
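The preview ends with the quantization config. One plausible continuation, given the imports above, is loading a Mistral-family checkpoint with that config and generating a few tokens; the model id and prompt are assumptions, and the gist's "tiny mistral" may instead build a small model from scratch.
model_id = "mistralai/Mistral-7B-v0.1"  # assumed; substitute whichever checkpoint you use
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,  # the 4-bit NF4 config defined above
    device_map="auto",               # requires accelerate and a CUDA device for 4-bit loading
)

inputs = tokenizer("The quick brown fox", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))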
thistleknot / mean_based_clustering.py
Last active November 6, 2023 02:36
Mean Based Clustering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from graphviz import Source
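Only the imports survive in this preview, so the gist's actual algorithm isn't visible here. As a generic stand-in for the idea of mean-based clustering, the sketch below recursively splits a 1-D array around its mean; it may well differ from what the gist implements.
import numpy as np

def mean_split(values, depth=2):
    """Recursively split a 1-D array at its mean, returning the leaf clusters."""
    values = np.asarray(values)
    if depth == 0 or len(values) < 2:
        return [values]
    m = values.mean()
    left, right = values[values < m], values[values >= m]
    if len(left) == 0 or len(right) == 0:
        return [values]
    return mean_split(left, depth - 1) + mean_split(right, depth - 1)

clusters = mean_split(np.random.default_rng(0).normal(size=100), depth=2)
print([len(c) for c in clusters])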
thistleknot / gptq.py
Last active November 5, 2023 21:47
gptq quantization
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
pretrained_model_dir = "/home/user/text-generation-webui/models/open_llama_3b_v2/"
quantized_model_dir = "/home/user/text-generation-webui/models/open_llama_3b_v2_qptq/"
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=32, # it is recommended to set the value to 128
desc_act=False, # setting this to False can significantly speed up inference, though perplexity may be slightly worse
)
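The preview stops at the config. The usual auto_gptq workflow from here is to load the fp16 model, quantize it against a few calibration examples, and save the result; the calibration text below is a placeholder.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.")]  # placeholder calibration text

model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)                   # runs GPTQ layer by layer
model.save_quantized(quantized_model_dir)  # writes the 4-bit checkpoint
tokenizer.save_pretrained(quantized_model_dir)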
thistleknot / bayes_regression.py
Created November 2, 2023 03:18
bayes regression using kde
# Since we don't have internet access in this environment, I'll replicate a similar workflow.
# Let's assume the 'data' is a pandas DataFrame obtained from the CSV file.
# For the example, let's create a simulated 'Poverty' column with random data
import pandas as pd  # the preview omits this import, but read_csv below needs it
data_ = pd.read_csv("https://raw.githubusercontent.com/thistleknot/Python-Stock/master/data/raw/states.csv?token=GHSAT0AAAAAACIYSECGQETPAPO6K4QYIFV6ZKDCJIQ").set_index('States')
for c in data_.columns:
    print(c)
    data = data_[[c]]
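The preview ends inside the per-column loop, so the Bayesian/KDE step itself isn't shown. As a stand-in, one common construction is to fit a joint Gaussian KDE over (x, y) and read off the conditional mean E[y | x] by weighting samples, which is essentially kernel regression; the data below is simulated and the approach may differ from the gist's actual method.
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = 2.0 * x + rng.normal(scale=0.5, size=200)  # simulated, not the states.csv data

kde = gaussian_kde(np.vstack([x, y]))   # joint density estimate p(x, y)
samples = kde.resample(5000, seed=0)    # draws from the joint KDE

def conditional_mean(x0, bandwidth=0.2):
    """E[y | x near x0], estimated by weighting joint-KDE samples by closeness to x0."""
    w = np.exp(-0.5 * ((samples[0] - x0) / bandwidth) ** 2)
    return np.sum(w * samples[1]) / np.sum(w)

print(conditional_mean(1.0))  # should land near 2 * x0, with some shrinkage from KDE smoothing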