Michel Nivard MichelNivard

## gist.r
# t-test vs wilcox vs ordinal

library(tidyverse)
library(multidplyr) # parallelize
library(rms) # ordinal regression


sample_size = 500 # N

# genarate paired sets and calculate p-values with different techniques

## Variance and interactions....
## The problem with (personal) non-ordinality:

n <- 30000 # 10k fictional people

# a pair of exposures, no measurmennt issues:
xa <- rnorm(n)
xb <- rnorm(n)

# Personal threshold 1 - 4 for each person,
# reasonable scale design I think by which I mean the bins fill up sort of "normal" like this matters a lot!!

## Gottesman_h2
require(MASS)

# fixed a2 and e2 for the entire script:
a <- .87 # additive genetic variance
e <- .13 # environmental variance

# make ZM covariance, i.e. the cov is a, the var = 1
sigma_mz <- matrix(c(1,a,a,1),2,2)

# sample size (is big becuase rare traits)

## openllama-3b-bnb-4bit-training.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              2 stars
            
          
                MichelNivard
                / openllama-3b-bnb-4bit-training.ipynb
            
            
              Created
              June 12, 2023 07:24
            
              
                OpenLlama 3b bnb-4bit-training.ipynb
              
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## geenrate.py
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('./results')

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

## train.py
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Set the path to the text file to fine-tune on
path_to_file = "path/to/text/file.txt"

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

## Example _transcript.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                MichelNivard
                / Example _transcript.md
            
            
              Last active
              March 6, 2023 11:52
            
              
                Example long training data
              
          
    Speaker 0:
You wrote a piece a follow-up piece to your oral history titled, there is no replacement for black Twitter. I think back in November, What do you think we lose if we lose black Twitter? Tell
Speaker 1:
me not to meet your Mac, but we lose everything. I'm John Favreau. Welcome to offline.
Speaker 0:

  
## full_text.py
cat author_manuscript_txt.incr.2022-12-19/*/*.txt > merged-file.txt


from datasets import load_dataset
dataset = load_dataset('text', data_files="merged-file.txt")
print(dataset)
dataset2 = dataset.filter(lambda x: len(x["text"]) > 500)
print(dataset2)

## little_expriment.py
from transformers import pipeline
from transformers import GPTJForCausalLM
from transformers import GPTJForCausalLM, AutoTokenizer
import torch

model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

prompt = (
     "Monday Diary Entry: Had a busy weekend and started the day tired and empty, the good weather helps lift the mood. Worked fro home and spend (too much) time on learning about language models. Had 2 or 3 productive calls, tired and prob still a bit sick today, which put me in a somewhat somber mood. Had a long bath which maybe helped?"

## chatGPT_own_PDFs
import tkinter
import customtkinter
from bs4 import BeautifulSoup

# Langchain loads:
from langchain.document_loaders import DirectoryLoader,PagedPDFSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS, Qdrant
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
	# t-test vs wilcox vs ordinal

	library(tidyverse)
	library(multidplyr) # parallelize
	library(rms) # ordinal regression


	sample_size = 500 # N

	# genarate paired sets and calculate p-values with different techniques
	## The problem with (personal) non-ordinality:

	n <- 30000 # 10k fictional people

	# a pair of exposures, no measurmennt issues:
	xa <- rnorm(n)
	xb <- rnorm(n)

	# Personal threshold 1 - 4 for each person,
	# reasonable scale design I think by which I mean the bins fill up sort of "normal" like this matters a lot!!
	require(MASS)

	# fixed a2 and e2 for the entire script:
	a <- .87 # additive genetic variance
	e <- .13 # environmental variance

	# make ZM covariance, i.e. the cov is a, the var = 1
	sigma_mz <- matrix(c(1,a,a,1),2,2)

	# sample size (is big becuase rare traits)
	import torch
	from transformers import GPT2Tokenizer, GPT2LMHeadModel

	# Load the tokenizer and model
	tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
	model = GPT2LMHeadModel.from_pretrained('./results')

	# Set the device to GPU if available, otherwise use CPU
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model.to(device)
	import torch
	from torch.utils.data import Dataset, DataLoader
	from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

	# Set the path to the text file to fine-tune on
	path_to_file = "path/to/text/file.txt"

	# Load the tokenizer and model
	tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
	model = GPT2LMHeadModel.from_pretrained('gpt2')
	cat author_manuscript_txt.incr.2022-12-19//.txt > merged-file.txt



	from datasets import load_dataset
	dataset = load_dataset('text', data_files="merged-file.txt")
	print(dataset)
	dataset2 = dataset.filter(lambda x: len(x["text"]) > 500)
	print(dataset2)
	from transformers import pipeline
	from transformers import GPTJForCausalLM
	from transformers import GPTJForCausalLM, AutoTokenizer
	import torch

	model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
	tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

	prompt = (
	"Monday Diary Entry: Had a busy weekend and started the day tired and empty, the good weather helps lift the mood. Worked fro home and spend (too much) time on learning about language models. Had 2 or 3 productive calls, tired and prob still a bit sick today, which put me in a somewhat somber mood. Had a long bath which maybe helped?"
	import tkinter
	import customtkinter
	from bs4 import BeautifulSoup

	# Langchain loads:
	from langchain.document_loaders import DirectoryLoader,PagedPDFSplitter
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS, Qdrant
	from langchain.chains.qa_with_sources import load_qa_with_sources_chain