# ymoslem/M2M-100-example.py

## M2M-100-example.py
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed


import ctranslate2
import sentencepiece as spm


# [Modify] Set file paths of the source and target
source_file_path = "source_test.en"
target_file_path = "target_test.ja.mt"

# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "m2m100_ct2/"
sp_model_path = "m2m100_ct2/sentencepiece.model"

# [Modify] Set language prefixes of the source and target
src_prefix = "__en__"
tgt_prefix = "__ja__"

# [Modify] Set the device and beam size
device = "cpu"  # or "cuda" for GPU
beam_size = 5


# Load the SentencePiece model; the same model is used below both to subword
# the source sentences and to desubword the translations.
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Read the source file, one sentence per line. The encoding is set explicitly
# so that reading does not depend on the platform's default locale and matches
# the UTF-8 encoding used when writing the target file.
with open(source_file_path, "r", encoding="utf-8") as source:
    lines = source.readlines()

source_sents = [line.strip() for line in lines]

# One target-language prefix per source sentence. A separate inner list is
# built for each sentence so that no list object is shared between entries.
target_prefix = [[tgt_prefix] for _ in source_sents]

# Subword the source sentences and prepend the source-language token
source_sents_subworded = sp.encode(source_sents, out_type=str)
source_sents_subworded = [[src_prefix] + sent for sent in source_sents_subworded]
if source_sents_subworded:  # an empty source file has no first sentence to show
    print("First sentence:", source_sents_subworded[0])

# Translate the source sentences.
# batch_type="tokens" makes max_batch_size a token budget per batch rather
# than a number of sentences.
translator = ctranslate2.Translator(ct_model_path, device=device)
translations = translator.translate_batch(
    source_sents_subworded,
    batch_type="tokens",
    max_batch_size=2024,
    beam_size=beam_size,
    target_prefix=target_prefix,
)
# Keep only the tokens of the best hypothesis of each result
translations = [translation[0]["tokens"] for translation in translations]

# When a hypothesis starts with the forced target-language token, drop that
# token before decoding. Checking for it explicitly (instead of slicing a
# fixed number of characters off the decoded string) never cuts into the
# actual translation and leaves no stray leading space behind.
translations = [
    tokens[1:] if tokens[:1] == [tgt_prefix] else tokens
    for tokens in translations
]

# Desubword the target sentences
translations_desubword = sp.decode(translations)
if translations_desubword:  # nothing to preview when there were no sentences
    print("First translation:", translations_desubword[0])

# Save the translations to a file, one sentence per line
with open(target_file_path, "w", encoding="utf-8") as target:
    for line in translations_desubword:
        target.write(line.strip() + "\n")

print("Done! Target file saved at:", target_file_path)