thistleknot
Turning out data tricks since 2006!

@thistleknot
thistleknot / loreft.py
Last active April 20, 2024 08:17
LoReFT continued pretraining using completion-style data
import torch
import transformers
import pyreft
import os
from datasets import load_dataset
import pandas as pd
#pd.DataFrame([len(q) for q in quotes]).describe()
#pd.DataFrame([len(q) for q in quotes]).hist()
import numpy as np
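A minimal sketch of where these imports typically lead, following pyreft's documented pattern; the base model and rank are placeholders, not necessarily this gist's choices:

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder base model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="cuda")
# attach a rank-4 LoReFT intervention to one layer's residual stream output
reft_config = pyreft.ReftConfig(representations={
    "layer": 15, "component": "block_output",
    "intervention": pyreft.LoreftIntervention(
        embed_dim=model.config.hidden_size, low_rank_dimension=4)})
reft_model = pyreft.get_reft_model(model, reft_config)
reft_model.print_trainable_parameters()  # only the intervention weights train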
@thistleknot
thistleknot / datasets.txt
Last active March 25, 2024 07:14
datasets
Target
Phi-1: 7 billion tokens
#https://clarifai.com/microsoft/text-generation/models/phi-1_5
Phi-1.5 was trained on 150 billion tokens, with 20% from phi-1's training data (7B tokens) and 80% from the newly created synthetic, “textbook-like” data (roughly 20B tokens), for the purpose of teaching common sense reasoning and general knowledge of the world (science, daily activities, theory of mind, etc.).
Base Model
X marksverdhei/wordnet-definitions-en-2021
X Wiki-text
X idioms
X sep
@thistleknot
thistleknot / gpt2api
Created August 13, 2023 18:47
GPT2 Batching API
from fastapi import FastAPI, Depends
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from typing import List
import time
from threading import Thread, Lock
import torch
app = FastAPI()
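A sketch of how a batched generation endpoint could follow from this setup; the route, request schema, and use of the lock are assumptions rather than the gist's exact code:

class GenerateRequest(BaseModel):
    prompts: List[str]
    max_new_tokens: int = 50

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # left-pad so generation continues from each prompt's end
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
lock = Lock()  # serialize model access across request threads

@app.post("/generate")
def generate(req: GenerateRequest):
    # tokenize all prompts together so they pass through the model as one batch
    batch = tokenizer(req.prompts, return_tensors="pt", padding=True)
    with lock, torch.no_grad():
        out = model.generate(**batch, max_new_tokens=req.max_new_tokens,
                             pad_token_id=tokenizer.eos_token_id)
    return {"completions": tokenizer.batch_decode(out, skip_special_tokens=True)}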
@thistleknot
thistleknot / script.py
Last active March 2, 2024 02:19
text-generation-webui extension - RAG google/duckduckgo search (async) w faiss
#for data txt files see: https://github.com/TheCynosure/smmry_impl
#example use
"""
Search_web("history of Taco Tuesday")
Tell me about this.
"""
#get Google API keys:
#https://console.cloud.google.com/apis/dashboard
#https://programmablesearchengine.google.com/controlpanel/all
#could be retooled quite easily to use duckduckgo_search instead of Google, which avoids having to set up API keys
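A sketch of that duckduckgo_search variant, assuming that library's DDGS text API:

from duckduckgo_search import DDGS

def search_web(query, max_results=5):
    # returns the snippet text of the top results; no API key needed
    with DDGS() as ddgs:
        return [r["body"] for r in ddgs.text(query, max_results=max_results)]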
@thistleknot
thistleknot / yahoo_finance.py
Last active February 11, 2024 21:25
how to pull yahoo finance data
def get_v1_url(symbol, period_type, crumb):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    period1 = 493590046   # range start (Unix epoch seconds, mid-1985)
    period2 = 1913180947  # range end (Unix epoch seconds, 2030): effectively all available history
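    # a guess at how this helper likely finishes: Yahoo's crumb-authenticated CSV
    # download endpoint (the URL shape and return value are assumptions, not the gist's code)
    url = (f"https://query1.finance.yahoo.com/v7/finance/download/{symbol}"
           f"?period1={period1}&period2={period2}"
           f"&interval={period_type}&events=history&crumb={crumb}")
    return url, headers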
@thistleknot
thistleknot / minimum nanogpt mamba
Last active January 27, 2024 18:48
minimum nanogpt mamba
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from tqdm import tqdm
from mamba_ssm import Mamba
#hyperparams
epochs = 100
lr = 1e-3
batch_size = 64
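A sketch of the model these hyperparameters presumably drive: a nanoGPT-style stack with mamba_ssm's Mamba block standing in for attention (width and depth are placeholders):

class MambaLM(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList(Mamba(d_model=d_model) for _ in range(n_layers))
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, idx):  # idx: (batch, seq_len) token ids
        x = self.embed(idx)
        for layer in self.layers:
            x = x + layer(x)  # residual connection around each Mamba block
        return self.head(self.norm(x))  # logits: (batch, seq_len, vocab_size)

Note that mamba_ssm's kernels require the model and inputs to live on a CUDA device.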
@thistleknot
thistleknot / mamba-gpt.py
Last active January 26, 2024 08:18
Mamba GPT
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
@thistleknot
thistleknot / Mamba-gpt-w-sub-word.py
Last active January 25, 2024 08:55
Mamba Gpt w Sub Word tokenizer
# -*- coding: utf-8 -*-
"""SimplerMambaSSM.ipynb
Automatically generated by Colaboratory.
#pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
Original file is located at
https://colab.research.google.com/drive/1g9qpeVcFa0ca0cnhmqusO4RZtQdh9umY
"""
#!pip install mamba-ssm causal-conv1d
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#!mkdir differentattention
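The sub-word step presumably replaces nanoGPT's character vocabulary; a sketch using the huggingface tokenizers library to train a small BPE vocabulary on input.txt (the vocabulary size is a placeholder):

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

tok = Tokenizer(models.BPE(unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["[UNK]"])
tok.train(["input.txt"], trainer)  # the tinyshakespeare file downloaded above
ids = tok.encode("To be, or not to be").ids  # sub-word token ids for the model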
@thistleknot
thistleknot / train_mamba.py
Last active January 22, 2024 05:05
Train Mamba
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import wandb
from datasets import load_dataset
import torch
import os
import argparse
import numpy as np
import pandas as pd
from transformers import EvalPrediction
from torch.utils.data import DataLoader
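A sketch of how these imports commonly fit together for causal-LM training with the HF Trainer; the checkpoint, data files, and hyperparameters are placeholders, not the gist's choices:

from transformers import DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint

ds = load_dataset("text", data_files="input.txt")["train"]  # any dataset with a "text" column
tokenized = ds.map(lambda b: tokenizer(b["text"], truncation=True, max_length=512),
                   batched=True, remove_columns=ds.column_names)
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)  # copies input_ids to labels for causal-LM loss

args = TrainingArguments(output_dir="out", per_device_train_batch_size=4,
                         num_train_epochs=1, report_to="wandb")
Trainer(model=model, args=args, train_dataset=tokenized,
        data_collator=collator).train()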
@thistleknot
thistleknot / efficient_batching_v2.py
Last active January 19, 2024 02:44
Efficient Batching v2
#This method deducts from the list passed in, splitting the records between a sample and a remainder.
#Each sample is packed 100% full until no more samples can be extracted, at which point an empty sample is returned along with the remainder (the remainder is then folded into a new iteration).
# Function to find the combination of values that adds up to the target sum
def find_combination_to_sum(counts, target):
    #print("Target inside function (find_combination_to_sum):", target)
    values = []
    for val, count in counts.items():
        #print(f"Value (val): {val}, Type: {type(val)}")
        #print(f"Count: {count}, Type: {type(count)}")