Skip to content

Instantly share code, notes, and snippets.

@jojonki
Created June 17, 2020 08:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jojonki/ec4520ba706ab1ad7e1c8fb694da024b to your computer and use it in GitHub Desktop.
Save jojonki/ec4520ba706ab1ad7e1c8fb694da024b to your computer and use it in GitHub Desktop.
BPE (byte-pair encoding), from "Neural Machine Translation of Rare Words with Subword Units" (Rico Sennrich, Barry Haddow, and Alexandra Birch).
import collections
import re
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a word written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``) to that word's frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Each value is the sum of the word frequencies, added once
        per occurrence of the pair inside a word.
    """
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # Pair every symbol with its right-hand neighbour. zip() stops at the
        # shorter input, so words with fewer than two symbols add nothing.
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs
def merge_vocab(pair, v_in):
    """Return a copy of the vocabulary with one symbol pair merged.

    Args:
        pair: Sequence of symbols to fuse (a 2-tuple as produced by the keys
            of ``get_stats``); the symbols are joined without a separator.
        v_in: Mapping from a word written as space-separated symbols to its
            frequency. It is not modified.

    Returns:
        A new dict in which every whole-symbol occurrence of the pair
        (the symbols separated by a single space) has been replaced by the
        merged symbol. Frequencies are carried over unchanged; if two input
        words rewrite to the same key, the later one wins.
    """
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or string boundary) on both sides,
    # so the pair only matches complete symbols, never part of a longer one.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged = ''.join(pair)
    # A callable replacement inserts the text literally. Passing ``merged`` as
    # a template string would make re.sub interpret any backslash in a symbol
    # as an escape sequence or group reference.
    return {
        pattern.sub(lambda _match: merged, word): freq
        for word, freq in v_in.items()
    }
# Toy corpus: each key is a word split into space-separated symbols and
# terminated by the end-of-word marker '</w>'; each value is its frequency.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
num_merges = 10

# Repeatedly merge the most frequent adjacent symbol pair.
for _ in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # Every word is already a single symbol; max() on an empty mapping
        # would raise ValueError, so stop merging early.
        break
    # On ties, max() returns the first pair encountered with the top count.
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(best)

# NOTE(review): the original indentation was lost; these prints are assumed to
# run once after the loop to show the final vocabulary — confirm.
print('vocab')
print(vocab)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment