Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
tokenizer = AutoTokenizer.from_pretrained('gpt2')
MAX_LEN = 100


def _wrap_lines(src_path, dst_path, bos: str, eos: str, max_words: int = MAX_LEN) -> str:
    """Wrap each kept line of *src_path* in *bos*/*eos* and write the result.

    A line is kept when it has at most *max_words* whitespace-separated
    words. Trailing whitespace (including the newline) is stripped from each
    kept line, so the output is one continuous stream with no line breaks
    between records. Blank lines are kept and become ``bos + eos``.

    Args:
        src_path: Path of the UTF-8 text file to read, one example per line.
        dst_path: Path of the UTF-8 text file to write.
        bos: Marker prepended to every kept line.
        eos: Marker appended to every kept line.
        max_words: Lines with more words than this are dropped.

    Returns:
        The full text that was written to *dst_path*.
    """
    with open(src_path, "r", encoding='utf-8') as f:
        # Single join instead of repeated `+=` keeps this linear in file size.
        wrapped = "".join(
            bos + line.rstrip() + eos
            for line in f
            if len(line.split()) <= max_words
        )
    # The source file is closed before the destination is opened, so the
    # whole input has been read even if both paths name the same file.
    with open(dst_path, "w", encoding='utf-8') as f:
        f.write(wrapped)
    return wrapped


# Look the markers up once rather than on every line.
_bos = tokenizer.special_tokens_map['bos_token']
_eos = tokenizer.special_tokens_map['eos_token']

train_m = _wrap_lines(train_path, train_mod_path, _bos, _eos)
test_m = _wrap_lines(test_path, test_mod_path, _bos, _eos)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.