@jc4p

jc4p/all.txt Secret

Last active June 1, 2023 19:21
:) :) :) love that
very cool, so aiming at people whove done it already and are looking to scale up
makes sense!
they might be using MAU numbers for astrology apps compared to other apps? but even that sounds absurd
this is like the most modern disco i listen to
...
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer

# Load the chat log as a line-by-line text dataset and hold out 30% for evaluation
dataset = load_dataset("text", data_files="./data/all.txt")
split_dataset = dataset['train'].train_test_split(test_size=0.3)

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse EOS

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

# mlm=False gives standard causal (next-token) language modeling labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model = AutoModelForCausalLM.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="./models/gpt2-kasra",
    overwrite_output_dir=True,
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    eval_steps=400,                  # number of update steps between two evaluations
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
trainer.save_model()
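
Not part of the original script, but a quick sanity check after training: trainer.evaluate() returns the loss on the held-out 30% split, and exponentiating it gives perplexity. A minimal sketch, assuming it runs right after trainer.train() above.

# Sketch (not in the gist): perplexity on the held-out split
import math

eval_results = trainer.evaluate()  # dict containing 'eval_loss' for the eval_dataset
perplexity = math.exp(eval_results['eval_loss'])
print(f"eval loss: {eval_results['eval_loss']:.3f}, perplexity: {perplexity:.1f}")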
from datasets import load_dataset
from transformers import LlamaTokenizer, LlamaForCausalLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer

# Same data pipeline as the GPT-2 run, but tokenized with the LLaMA tokenizer
dataset = load_dataset("text", data_files="./data/all.txt")
split_dataset = dataset['train'].train_test_split(test_size=0.3)

tokenizer = LlamaTokenizer.from_pretrained('./models/llama-7B')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'])

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("Loading model")
model = LlamaForCausalLM.from_pretrained('./models/llama-7B')
print("Model loaded")

training_args = TrainingArguments(
    output_dir="./models/llama-kasra",
    overwrite_output_dir=True,
    num_train_epochs=3,             # number of training epochs
    # auto_find_batch_size=True,
    per_device_train_batch_size=8,  # smaller batches than the GPT-2 run: the 7B model needs far more memory
    per_device_eval_batch_size=8,   # batch size for evaluation
    eval_steps=400,                 # number of update steps between two evaluations
    warmup_steps=500,               # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
trainer.save_model()
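
For sampling from the saved LLaMA checkpoint, the same approach as the GPT-2 generation script below should carry over. A sketch, not from the gist, assuming the checkpoint landed in ./models/llama-kasra (the output_dir above) and that loading in fp16 on a GPU is acceptable to fit the 7B weights.

# Sketch (not in the gist): generate from the fine-tuned LLaMA model
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained('./models/llama-7B')
model = LlamaForCausalLM.from_pretrained('./models/llama-kasra', torch_dtype=torch.float16).to('cuda')

text = "what's your favorite star trek? i prefer "
inputs = tokenizer(text, return_tensors='pt').input_ids.to('cuda')

# multinomial sampling, same settings as the GPT-2 script below
outputs = model.generate(inputs, max_length=128, do_sample=True, temperature=0.85,
                         no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))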
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('./models/gpt2-kasra-2')
text = "what's your favorite star trek? i prefer "
inputs = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
# greedy decoding <-- ehh
# outputs = model.generate(inputs, max_length=128, do_sample=False, num_beams=1, temperature=0.70, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
# multinomial sampling <-- sounds semi-real and more random
#outputs = model.generate(inputs, max_length=128, do_sample=True, num_beams=1, temperature=0.85, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
# beam-search decoding <-- i like this one a lot but it feels stale
# outputs = model.generate(inputs, max_length=128, do_sample=False, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
# beam-search multinomial sampling <-- it sounds real but also feels stale
outputs = model.generate(inputs, max_length=128, do_sample=True, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
generated = text + tokenizer.decode(outputs[0])[prompt_length:]
#generated = tokenizer.decode(outputs[0])[prompt_length+1:]
print(generated)
# > what's your favorite star trek? i prefer urs cause theres so many good ones hahahaha but i also dont want to go to a movie theater with a bunch of people who arent good enough for me cause theyre not funny enough to me to like, make fun of me for liking them or etc etc, but \_()_/ if you had a list of the best sci-fi/fantasy/horror movies of all time i wouldnt be interested in any of them, you know? idk what theyd be like if they werent on the list and i didnt have the time to
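
Since the four decoding strategies above are compared by commenting lines in and out, a small loop that prints all four for the same prompt makes the comparison easier to eyeball. A sketch reusing model, inputs, text, and prompt_length from the script above, with the same generate() settings (temperature dropped from the greedy config, where it has no effect).

# Sketch (not in the gist): run all four decoding setups on one prompt
decoding_configs = {
    "greedy": dict(do_sample=False, num_beams=1, no_repeat_ngram_size=2),
    "multinomial sampling": dict(do_sample=True, num_beams=1, temperature=0.85, no_repeat_ngram_size=2),
    "beam search": dict(do_sample=False, num_beams=5, no_repeat_ngram_size=2, early_stopping=True),
    "beam-search multinomial": dict(do_sample=True, num_beams=5, no_repeat_ngram_size=2, early_stopping=True),
}

for name, kwargs in decoding_configs.items():
    out = model.generate(inputs, max_length=128, pad_token_id=tokenizer.eos_token_id, **kwargs)
    print(f"--- {name} ---")
    print(text + tokenizer.decode(out[0])[prompt_length:])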