Train GPT-2 from Scratch on your own language (Persian) | GPT-2 Training on non-English text
import logging

from simpletransformers.language_modeling import LanguageModelingModel

# Log training progress at INFO, but silence the verbose transformers internals
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "save_eval_checkpoints": True,
    "block_size": 509,
    "max_seq_length": 509,
    # "save_model_every_epoch": False,
    "learning_rate": 1e-4,
    "train_batch_size": 4,
    "gradient_accumulation_steps": 4,  # effective batch size of 4 * 4 = 16
    "mlm": False,  # GPT-2 is a causal LM, not a masked LM
    "dataset_type": "simple",
    "logging_steps": 100,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 10000,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "sliding_window": True,
    "use_multiprocessing": True,
    "vocab_size": 100000,  # size of the tokenizer vocabulary trained from scratch
    "output_dir": "outputs/from_scratch",
    "best_model_dir": "outputs/from_scratch/best_model",
}
# Training corpus: https://github.com/miladfa7/Persian-Wikipedia-Dataset
train_file = "Persian-WikiText-all.txt"
test_file = "test.txt"
model = LanguageModelingModel(
    "gpt2",
    None,  # None = train from scratch; pass a path to a pretrained model to fine-tune instead
    args=train_args,
    train_files=train_file,  # used to train the new tokenizer when starting from scratch
)
# model.train_tokenizer(train_file)  # optional: the constructor above already trains a tokenizer from train_files

model.train_model(
    train_file,
    eval_file=test_file,
)

model.eval_model(test_file)
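
# After training, a quick sanity check is to generate text with the trained
# model. A minimal sketch, assuming the run above saved its best checkpoint
# under outputs/from_scratch/best_model (per best_model_dir); the Persian
# prompt is just a hypothetical example.
from simpletransformers.language_generation import LanguageGenerationModel

generator = LanguageGenerationModel("gpt2", "outputs/from_scratch/best_model")
# generate() returns a list of continuations of the prompt
print(generator.generate("تهران پایتخت"))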