ymoslem/subword_source_only.py

## subword_source_only.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Subwording the source file only
# Command: python3 subword_source_only.py <source_model_file> <source_pred_file>
# Note: If you did not train the model with start and end tokens remove ['<s>'] and ['</s>'] from line #30


import sys
import sentencepiece as spm


# Both positional arguments are required: the model file, then the raw source file.
if len(sys.argv) < 3:
    sys.exit("Usage: python3 subword_source_only.py <source_model_file> <source_pred_file>")
source_model = sys.argv[1]
source_raw = sys.argv[2]
source_subworded = source_raw + ".subword"  # output path: input path plus ".subword"

print("Source Model:", source_model)
print("Source Dataset:", source_raw)

# Load the SentencePiece model that is used to segment the source text.
sp = spm.SentencePieceProcessor()
sp.load(source_model)

# Subword the source line by line; explicit UTF-8 avoids locale-dependent decoding.
with open(source_raw, encoding="utf-8") as source, open(source_subworded, "w", encoding="utf-8") as source_subword:
    for line in source:
        pieces = ['<s>'] + sp.encode_as_pieces(line.rstrip("\n")) + ['</s>']    # encode and add start & end tokens
        source_subword.write(" ".join(pieces) + "\n")

print("Done subwording the source file! Output:", source_subworded)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	# Subwording the source file only
	# Command: python3 subword_source_only.py <source_model_file> <source_pred_file>
	# Note: If you did not train the model with start and end tokens remove ['<s>'] and ['</s>'] from line #30


	import sys
	import sentencepiece as spm


	# Both positional arguments are required: the model file, then the raw source file.
	if len(sys.argv) < 3:
	    sys.exit("Usage: python3 subword_source_only.py <source_model_file> <source_pred_file>")
	source_model = sys.argv[1]
	source_raw = sys.argv[2]
	source_subworded = source_raw + ".subword"  # output path: input path plus ".subword"

	print("Source Model:", source_model)
	print("Source Dataset:", source_raw)

	# Load the SentencePiece model that is used to segment the source text.
	sp = spm.SentencePieceProcessor()
	sp.load(source_model)

	# Subword the source line by line; explicit UTF-8 avoids locale-dependent decoding.
	with open(source_raw, encoding="utf-8") as source, open(source_subworded, "w", encoding="utf-8") as source_subword:
	    for line in source:
	        pieces = ['<s>'] + sp.encode_as_pieces(line.rstrip("\n")) + ['</s>']    # encode and add start & end tokens
	        source_subword.write(" ".join(pieces) + "\n")

	print("Done subwording the source file! Output:", source_subworded)