# Source: GitHub gist shihono/f1a113ec00e076acbd448f0406976050 by @shihono, created December 29, 2019 07:44.
# -*- coding: utf-8 -*-
import re, collections
import curses
from typing import Dict, DefaultDict, List
def get_stats(vocab: Dict[str, int]) -> DefaultDict:
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a word, written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``), to that word's frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples whose
        value is the sum of the frequencies of every word occurrence of that
        adjacent pair. Words with fewer than two symbols contribute nothing.
    """
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs
def merge_vocab(pair: List, v_in: Dict) -> Dict:
    """Merge every occurrence of a symbol pair into a single symbol.

    Args:
        pair: The two symbols to merge, in order (any two-item sequence of
            strings, e.g. a key returned by ``get_stats``).
        v_in: Mapping from whitespace-separated symbol strings to frequency.

    Returns:
        A new dict with the same frequencies, where each word has had every
        ``'<left> <right>'`` occurrence replaced by ``'<left><right>'``.
        ``v_in`` is not modified.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # Both lookarounds require whitespace (or the string edge) around the
    # bigram, so only whole symbols match. The lookbehind was previously
    # written as (?<!<\S), which let the pair match when its first symbol was
    # merely the tail of a longer token.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement inserts the text literally; a plain string
        # would be parsed as a template and mangle symbols containing '\'.
        v_out[pattern.sub(lambda _match: merged, word)] = freq
    return v_out
def display_stats(stdscr, vocab, best, strfomat='{:>20}\t'):
    """Write one row showing the current vocabulary and the merged symbol.

    Words are written in sorted order, each formatted with ``strfomat``.
    A word is written with ``curses.A_BOLD`` when the merged symbol is one of
    its whitespace-separated symbols. The row ends with the merged symbol
    itself followed by a newline.

    Args:
        stdscr: Object providing ``addstr(text[, attr])`` (a curses window).
        vocab: Mapping whose keys are whitespace-separated symbol strings.
        best: The merged symbol to highlight and to print in the last column.
        strfomat: ``str.format`` template applied to every cell.
    """
    for word in sorted(vocab):
        cell = strfomat.format(word)
        # Compare against whole symbols: a plain substring test would also
        # highlight words where `best` only occurs inside a longer symbol.
        if best in word.split():
            stdscr.addstr(cell, curses.A_BOLD)
        else:
            stdscr.addstr(cell)
    stdscr.addstr(strfomat.format(best) + '\n')
def bpe(stdscr, vocab=None, num_merges=10):
    """Step through byte-pair-encoding merges in a curses window.

    Prints a header row with each word's frequency and symbols, then one row
    per merge showing the number of distinct pairs, the vocabulary after the
    merge and the merged symbol. Waits for a key press after the header,
    after every merge row and after the final ``end`` line.

    Args:
        stdscr: Curses window to draw on.
        vocab: Mapping from whitespace-separated symbol strings to frequency.
            Defaults to the four-word example vocabulary below.
        num_merges: Maximum number of merges to perform.
    """
    if vocab is None:
        vocab = {'l o w </w>': 5, 'l o w e s t </w>': 2,
                 'n e w e r </w>': 6, 'w i d e r </w>': 3}

    stdscr.clear()
    max_rows, _ = stdscr.getmaxyx()

    # Header row: indent matching the 3-wide pair-count column of later rows.
    stdscr.addstr(' ' * 3)
    for word, freq in sorted(vocab.items(), key=lambda item: item[0]):
        stdscr.addstr('{:>3}{:>17}\t'.format(freq, word), curses.A_BOLD)
    stdscr.addstr('{:>20}\n'.format('best pair'))
    stdscr.refresh()
    stdscr.getkey()

    # Cap the number of merge rows at the window height minus two lines
    # (presumably reserved for the header and the closing 'end' line).
    for _ in range(min(num_merges, max_rows - 2)):
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol; max() on an empty
            # mapping would raise ValueError.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        best_merged = ''.join(best)
        stdscr.addstr('{:>3}'.format(len(pairs)), curses.A_BOLD)
        display_stats(stdscr, vocab, best_merged)
        stdscr.refresh()
        stdscr.getkey()

    stdscr.addstr('end\n')
    stdscr.refresh()
    stdscr.getkey()
def main(stdscr):
    """Entry point handed to ``curses.wrapper``; runs the BPE demo on ``stdscr``."""
    bpe(stdscr)
if __name__ == '__main__':
    # curses.wrapper initialises the screen, passes the window to main and
    # restores the terminal afterwards, even if main raises.
    curses.wrapper(main)
# (GitHub page footer: sign in on GitHub to comment on this gist.)