def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build train/test language-modeling datasets and a matching collator.

    Args:
        train_path: Path to the plain-text training file.
        test_path: Path to the plain-text evaluation file.
        tokenizer: A HuggingFace tokenizer used to encode both files.
        block_size: Length (in tokens) of each contiguous training block;
            defaults to 128 to preserve the original behavior.

    Returns:
        Tuple of (train_dataset, test_dataset, data_collator).

    NOTE(review): `TextDataset` is deprecated in recent `transformers`
    releases in favor of the `datasets` library — confirm the pinned
    version before migrating.
    """
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=block_size,
    )
    # mlm=False -> causal (next-token) language modeling, not masked LM.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
# Materialize the datasets/collator for the Trainer; `train_mod_path`,
# `test_mod_path`, and `tokenizer` are presumably defined earlier in the
# full script — TODO confirm they exist before this line runs.
train_dataset, test_dataset, data_collator = load_dataset(train_mod_path, test_mod_path, tokenizer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.