Skip to content

Instantly share code, notes, and snippets.

@pythonlessons
Created September 4, 2023 15:04
Show Gist options
  • Save pythonlessons/b8815bb6548317e526d34fb3071e7a15 to your computer and use it in GitHub Desktop.
Save pythonlessons/b8815bb6548317e526d34fb3071e7a15 to your computer and use it in GitHub Desktop.
transformers_training
def read_files(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    The text is split on the newline character only. The single empty entry
    that splitting produces when the file ends with a newline is removed; a
    final line without a terminating newline is kept instead of being
    discarded.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per line, newline characters excluded. An
        empty file yields an empty list.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")
    # str.split always returns at least one item; the last one is empty only
    # when the text ended with a newline (or was empty). Drop it just in that
    # case so an unterminated last line is not silently lost.
    if not lines[-1]:
        lines.pop()
    return lines
# Load the raw sentence lists (one sentence per line) for both languages.
en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

# Keep a pair only when both of its sentences have length <= 500.
max_lenght = 500

# Each kept entry is a two-item list: [spanish_sentence, english_sentence].
train_dataset = [
    list(pair)
    for pair in zip(es_training_data, en_training_data)
    if max(map(len, pair)) <= max_lenght
]
val_dataset = [
    list(pair)
    for pair in zip(es_validation_data, en_validation_data)
    if max(map(len, pair)) <= max_lenght
]

# Split the filtered pairs back into per-language sequences (tuples).
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment