# aravindpai/bpe_5.py

## bpe_5.py
#merges the most frequent pair in the corpus
#accepts the corpus and best pair
#returns the modified corpus
import re
def merge_vocab(pair, corpus_in):
    """Merge every occurrence of a symbol pair in the corpus.

    Each key of ``corpus_in`` is a word written as space-separated symbols.
    Wherever the two symbols of ``pair`` appear next to each other as whole
    symbols, they are replaced by their concatenation.

    Args:
        pair: The two symbols to merge. They are joined with a space to
            locate occurrences and with no separator to form the replacement.
        corpus_in: Mapping from space-separated words to their values
            (presumably frequency counts -- confirm against the caller).

    Returns:
        A new dict keyed by the merged words; ``corpus_in`` is not modified.
        When several input words collapse to the same merged word, their
        values are added together instead of the last one winning.
    """
    # Escape the pair so symbols containing regex metacharacters match literally.
    bigram = re.escape(' '.join(pair))
    # The lookarounds demand whitespace (or a string edge) on both sides, so
    # only whole symbols match, never a fragment of a longer symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged = ''.join(pair)

    def _replacement(_match):
        # A callable replacement is inserted verbatim; a plain string would
        # have its backslashes interpreted as escapes or group references.
        return merged

    corpus_out = {}
    for word, value in corpus_in.items():
        w_out = p.sub(_replacement, word)
        if w_out in corpus_out:
            # Two source words became identical after the merge: combine them.
            corpus_out[w_out] = corpus_out[w_out] + value
        else:
            corpus_out[w_out] = value

    return corpus_out
	#merges the most frequent pair in the corpus
	#accepts the corpus and best pair
	#returns the modified corpus
	import re
	def merge_vocab(pair, corpus_in):
		"""Merge every occurrence of a symbol pair in the corpus.

		Each key of ``corpus_in`` is a word written as space-separated symbols.
		Wherever the two symbols of ``pair`` appear next to each other as whole
		symbols, they are replaced by their concatenation.

		Args:
			pair: The two symbols to merge. They are joined with a space to
				locate occurrences and with no separator to form the replacement.
			corpus_in: Mapping from space-separated words to their values
				(presumably frequency counts -- confirm against the caller).

		Returns:
			A new dict keyed by the merged words; ``corpus_in`` is not modified.
			When several input words collapse to the same merged word, their
			values are added together instead of the last one winning.
		"""
		# Escape the pair so symbols containing regex metacharacters match literally.
		bigram = re.escape(' '.join(pair))
		# The lookarounds demand whitespace (or a string edge) on both sides, so
		# only whole symbols match, never a fragment of a longer symbol.
		p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
		merged = ''.join(pair)

		def _replacement(_match):
			# A callable replacement is inserted verbatim; a plain string would
			# have its backslashes interpreted as escapes or group references.
			return merged

		corpus_out = {}
		for word, value in corpus_in.items():
			w_out = p.sub(_replacement, word)
			if w_out in corpus_out:
				# Two source words became identical after the merge: combine them.
				corpus_out[w_out] = corpus_out[w_out] + value
			else:
				corpus_out[w_out] = value

		return corpus_out