:) :) :) love that
very cool, so aiming at people whove done it already and are looking to scale up
makes sense!
they might be using MAU numbers for astrology apps compared to other apps? but even that sounds absurd
this is like the most modern disco i listen to
...
# Fine-tune GPT-2 on the text corpus in ./data/all.txt
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer

dataset = load_dataset("text", data_files="./data/all.txt")
split_dataset = dataset['train'].train_test_split(test_size=0.3)

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS for padding

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # causal LM, not masked LM

model = AutoModelForCausalLM.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="./models/gpt2-kasra",
    overwrite_output_dir=True,
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    eval_steps=400,                  # number of update steps between two evaluations
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
trainer.save_model()
# Fine-tune a local LLaMA-7B checkpoint on the same text corpus
from datasets import load_dataset
from transformers import LlamaTokenizer, LlamaForCausalLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer

dataset = load_dataset("text", data_files="./data/all.txt")
split_dataset = dataset['train'].train_test_split(test_size=0.3)

tokenizer = LlamaTokenizer.from_pretrained('./models/llama-7B')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'])

tokenized_datasets = split_dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("Loading model")
model = LlamaForCausalLM.from_pretrained('./models/llama-7B')
print("Model loaded")

training_args = TrainingArguments(
    output_dir="./models/llama-kasra",
    overwrite_output_dir=True,
    num_train_epochs=3,             # number of training epochs
    # auto_find_batch_size=True,
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,   # batch size for evaluation
    eval_steps=400,                 # number of update steps between two evaluations
    warmup_steps=500,               # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()
trainer.save_model()
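The gist stops at training for the LLaMA run. Below is a minimal, hedged sketch of how generation from that checkpoint could look, assuming the model was saved to ./models/llama-kasra and the tokenizer is reloaded from ./models/llama-7B (trainer.save_model() above does not write the tokenizer, since it was never passed to Trainer). The prompt and sampling settings mirror the multinomial-sampling variant in the GPT-2 generation script that follows; they are illustrative, not part of the original gist.

# A rough sketch (not part of the original gist): sampling from the fine-tuned
# LLaMA checkpoint saved above. Paths and settings here are assumptions.
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

# trainer.save_model() wrote only the model, so reuse the original tokenizer
tokenizer = LlamaTokenizer.from_pretrained('./models/llama-7B')
model = LlamaForCausalLM.from_pretrained('./models/llama-kasra', torch_dtype=torch.float16)
model.eval()

text = "what's your favorite star trek? i prefer "
inputs = tokenizer(text, return_tensors='pt')

# multinomial sampling, mirroring one of the GPT-2 decoding variants below
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=128,
        do_sample=True,
        temperature=0.85,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))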
# Generate from the fine-tuned GPT-2 model, comparing decoding strategies
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('./models/gpt2-kasra-2')

text = "what's your favorite star trek? i prefer "
inputs = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

# greedy decoding <-- ehh
# outputs = model.generate(inputs, max_length=128, do_sample=False, num_beams=1, temperature=0.70, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
# multinomial sampling <-- sounds semi-real and more random
# outputs = model.generate(inputs, max_length=128, do_sample=True, num_beams=1, temperature=0.85, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
# beam-search decoding <-- i like this one a lot but it feels stale
# outputs = model.generate(inputs, max_length=128, do_sample=False, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
# beam-search multinomial sampling <-- it sounds real but also feels stale
outputs = model.generate(inputs, max_length=128, do_sample=True, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, pad_token_id=tokenizer.eos_token_id)

generated = text + tokenizer.decode(outputs[0])[prompt_length:]
# generated = tokenizer.decode(outputs[0])[prompt_length+1:]
print(generated)

# > what's your favorite star trek? i prefer urs cause theres so many good ones hahahaha but i also dont want to go to a movie theater with a bunch of people who arent good enough for me cause theyre not funny enough to me to like, make fun of me for liking them or etc etc, but \_()_/ if you had a list of the best sci-fi/fantasy/horror movies of all time i wouldnt be interested in any of them, you know? idk what theyd be like if they werent on the list and i didnt have the time to