@almost
Last active June 2, 2023 08:56
Gist: almost/5e3de8e0300df203ed34ce20ac6ba042
toastkenizer.py

import re, sys
from typing import Iterable, List

# The allowed alphabet: upper-case letters plus a little punctuation and space.
CHARACTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ:!,.'() ")
END = "___END___"

class Toastkenizer:
    def __init__(self, characters: List[str] = CHARACTERS, tokens: List[str] = []):
        self.characters = characters
        self.tokens_trie = {}
        self.tokens_list = []
        self.tokens_lookup = {}
        self.deleted = []
        for token in (tokens or characters):
            self.add_token(token)

    def add_token(self, token_str: str):
        # Reuse a freed slot if one exists, otherwise grow the list.
        if self.deleted:
            index = self.deleted.pop()
        else:
            index = len(self.tokens_list)
            self.tokens_list.append(None)
        self.tokens_list[index] = token_str
        # Insert the token into the trie. Each intermediate node stores, under
        # END, the best-known decomposition of that prefix, so tokenize() can
        # fall back to it when a longer match fails partway.
        d = self.tokens_trie
        so_far = []
        for c in token_str:
            so_far = d.get(END, []) + [self.characters.index(c)]
            if c not in d:
                d[c] = {
                    END: so_far
                }
            d = d[c]
        d[END] = [index]
        self.tokens_lookup[token_str] = index

    def normalize(self, s: str) -> Iterable[str]:
        # Collapse whitespace, upper-case, and drop anything outside the alphabet.
        s = re.sub(r"\s+", " ", s).rstrip()
        for c in s:
            c = c.upper()  # Who needs lower-case?
            if c in self.characters:
                yield c

    def tokenize(self, s: str) -> Iterable[int]:
        # Greedy longest-match walk over the trie.
        node = self.tokens_trie
        debug = ""
        for c in self.normalize(s):
            if c not in node:
                if END in node:
                    # Emit the longest match (or the prefix fallback) and restart.
                    debug = ""
                    yield from node[END]
                else:
                    print("missing", debug)
                node = self.tokens_trie
            node = node[c]
        if END in node:
            yield from node[END]

if __name__ == '__main__':
    tokenizer = Toastkenizer(tokens=[x[:-1] for x in open("tokens.txt").readlines()])
    # data = open(sys.argv[1], "rb").read().decode("latin-1")
    data = "Would you like any toast"
    for line in data.split("\n"):
        print(" ".join(
            f"[{token}-{tokenizer.tokens_list[token]}]"
            for word in re.split(r"\b", line)
            for token in tokenizer.tokenize(" " + word)
        ))
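
A minimal usage sketch (not part of the gist) of the two behaviours of the trie walk: greedy longest-match, and the stored prefix fallback when a longer match dies partway. The token sets below are made up for illustration; IDs 34 and 35 follow from the 34 seed characters coming first.

from toastkenizer import Toastkenizer, CHARACTERS

# Greedy longest-match: "AB" (id 34) beats "A" + "B".
tok = Toastkenizer(tokens=CHARACTERS + ["AB", "ABC"])
print(list(tok.tokenize("ABAB")))  # [34, 34] -> "AB" "AB"
print(list(tok.tokenize("ABCB")))  # [35, 1]  -> "ABC" "B"

# Prefix fallback: here "AB" is only a prefix of "ABC", not a token itself,
# so abandoning the walk at "ABD" emits the prefix's stored decomposition.
tok2 = Toastkenizer(tokens=CHARACTERS + ["ABC"])
print(list(tok2.tokenize("ABD")))  # [0, 1, 3] -> "A" "B" "D"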
Second file in the gist (the training script):

import os, re
from toastkenizer import Toastkenizer

MAX_TOKENS = 1024
INPUT = "inputdata"

def readfromdir(d):
    for file in os.listdir(d):
        yield (file, open(os.path.join(d, file), "rb").read().decode("latin-1"))

tokenizer = Toastkenizer()
while len(tokenizer.tokens_list) < MAX_TOKENS:
    pair_counts = {}
    total_counts = {}  # collected but not used below
    max_found = None
    for filename, data in readfromdir(INPUT):
        for line in data.split("\n"):
            # Heavily favour merges that occur on toast-related lines.
            weight = 100 if "TOAST" in line.upper() else 1
            for word in re.split(r"\b", line):
                # print("".join(tokenizer.normalize(line)))
                tokens = iter(tokenizer.tokenize(" " + word))
                prev_token = next(tokens, None)
                if prev_token is None:  # "if not prev_token" would also skip token id 0
                    continue
                total_counts[prev_token] = total_counts.get(prev_token, 0) + 1
                for token in tokens:
                    total_counts[token] = total_counts.get(token, 0) + 1
                    # Make a new token from this pair
                    new_token = tokenizer.tokens_list[prev_token] + tokenizer.tokens_list[token]
                    # Check if it already exists
                    if new_token not in tokenizer.tokens_lookup:
                        pair_counts[new_token] = pair_counts.get(new_token, 0) + weight
                        if max_found is None or pair_counts[new_token] > max_found[1]:
                            max_found = (new_token, pair_counts[new_token])
                    prev_token = token
    assert max_found is not None
    (new_token, count) = max_found
    tokenizer.add_token(new_token)
    print(f"Found next best token: {new_token.replace(' ', '_')} [{tokenizer.tokens_lookup[new_token]}] (seen {count} times)")

open("tokens.txt", "w").writelines(x + "\n" for x in tokenizer.tokens_list)
tokens.txt: the seed characters, then the learned merges in the order found. Leading spaces on merged tokens are not visible here, which is why some entries appear twice.

A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
:
!
,
.
'
(
)
T
ER
A
ST
.
I
TH
S
,
W
OU
TO
IN
H
L
:
THE
'
O
M
C
RE
STER
B
EN
AT
AN
Y
D
ING
LI
TOA
YOU
ON
AN
LL
RI
P
F
G
N
ES
LISTER
K
IN
AR
TOASTER
OR
!
AND
ED
OF
IS
IT
WH
IT
MM
RI
R
VE
RY
OM
E
OT
OW
ON
MMER
UT
RIMMER
ET
HE
TEN
U
WE
LE
KRY
IS
IC
TOAS
KRYTEN
TE
RE
GH
AL
LY
AC
AS
BE
ST
ID
THAT
ME
AY
ION
IR
HA
J
SE
KE
WHAT
OO
LD
GHT
IM
SH
CH
QU
ND
CAT
CA
ANT
AD
(
NO
DO
UST
HO
MY
ITH
VER
OME
TH
WITH
GO
UR
QU
TER
SOME
SO
THIS
RO
HIS
INT
SE
HAVE
ANY
JUST
FOR
LIKE
ONE
CE
BUT
EA
EST
YOUR
NOT
CH
OULD
UP
US
ARE
)
AS
IE
EX
PP
CT
WAS
WA
NT
..
DON
HI
OOK
RA
LLY
ALL
OUT
CAN
NG
WANT
OD
NK
ENT
ME
WELL
...
ACK
BLE
LOOK
AKE
VE
SP
KN
THERE
LO
OUT
AT
EL
CK
AR
AB
WN
SS
THI
ILL
IL
THEY
IVE
ANS
HOLLY
SIR
SI
ESTION
QUESTION
UL
ORE
YEA
AIN
AI
NE
IGHT
STAR
HOW
FF
AL
KNOW
LO
V
FRO
GET
RU
TS
BU
WHO
HIM
WOULD
WO
ERS
ABOUT
LL
ONE
KI
SU
AND
PS
OCH
KOCH
KOCHANS
IF
KOCHANSKI
INTO
: (
PRO
PR
AG
OH
IND
THING
BACK
BA
NS
FROM
NE
RS
TAL
TA
HERE
ARD
THEN
PO
MP
TIM
DE
KING
FO
EVER
GOING
MAN
MA
IG
DID
DI
PE
SC
INE
GOT
TING
TI
OVER
CO
OUND
AM
HOL
UND
UN
TIME
DE
ALLY
ITE
UN
NOW
WI
ACE
TW
OFF
BY
SEE
PL
KS
YEAH
UG
.)
AST
YE
OUS
WHY
OR
SAY
SA
EE
REE
DOWN
THINK
AG
STA
RED
UM
WAY
WA
RD
HER
HE
MEAN
NING
TY
NI
OK
FE
UGH
THRO
LF
GE
. (
LET
LE
THA
ATION
NA
ALL
EVERY
ICE
URE
UFF
LIGHT
IDE
GE
COR
IP
US
JO
DIS
FORE
TLE
RIGHT
AM
BRE
BR
COME
COM
THR
HEAD
RO
THEM
AVE
MORE
MO
YES
BO
AC
KIE
TAKE
PRES
STARB
AB
ATI
TALKIE
STARBUG
ELL
ENT
EN
AK
MENT
GIVE
WHEN
TWO
X
COMP
SOMETHING
THREE
PO
AROUND
TUR
RED
RESS
HAS
THOU
FR
AME
ONLY
USE
OL
SES
BEEN
ULD
SHE
FT
HING
FIR
FI
COULD
RSE
COU
VERY
HAPP
TTLE
TT
BL
TEA
TE
RM
HAD
THER
HEA
REP
GU
HAND
VING
VI
WHE
APP
HER
POIN
HI
CRA
CR
DWA
OP
DOES
AD
NER
LIFE
ICH
CIDE
CI
CLO
CL
YEARS
THIN
CON
CO
RT
OKAY
BER
BE
SME
THOUGHT
GI
THROUGH
AGAIN
EP
RF
CTION
DWARF
EW
TAIN
TA
NCE
FOUR
WAL
LS
TTER
RRY
RR
SM
THING
ROOM
OWN
WHERE
ANGE
DY
UP
CA
DS
TRA
TR
YING
IES
WHICH
PRE
FL
UNDE
HAPPEN
DIC
DI
TI
HOT
COURSE
TALK
GOOD
IMP
OG
WAY
BREAK
TED
KES
SHOT
SHO
WS
ATING
UE
SPACE
LITTLE
TOO
WHI
TION
REPAI
AV
DES
CUT
CU
REALLY
REA
MAY
FUL
DAY
DA
BEFORE
THAN
DRI
DR
OID
SW
WERE
OTHER
LIS
LI
SING
OSE
OS
MUFF
MU
EXT
DING
POINT
MUFFIN
WILL
STOP
SECON
ACE
VES
PAR
PA
EV
WER
ROUND
ACT
SHIP
LAST
LA
HEY
SOMEONE
ANYTHING
MIGHT
MI
LES
SH
THEIR
SIM
IBLE
LING
NEVER
WARD
PRESENT
SMEG
GRE
GR
UNIVE
FIRST
FEE
FE
FACE
FA
USE
SELF
LEFT
LOVE
ANYONE
ATE
ISE
EY
WOR
BETTER
GOD
EXP
ECT
DIDN
NOTHING
ITY
OTHER
SPE
UNIVERSE
DOOR
ORY
ENTER
JOH
CONT
CON
OU
RID
OPE
ARLY
MACH
TERS
AGE
JOHN
MEN
KED
HUND
HU
MAN
MA
MS
HUNDRED
TCH
MIND
MIN
SIDE
SI
EXIS
SOUND
BB
INF
MADE
BECA
AWAY
NY
PUT
PU
LLS
MACHINE
SIMUL
STAND
RAI
RA
CIDENT
BECAUSE
ICK
CAP
SION
LOOKS
MAKE
CORRID
TELL
DOESN
CORRIDOR
OI
RGE
XT
MODE
CKING
PER
PE
SON
SO
WAYS
GIVEN
ACCIDENT
OUR
IAL
IA
UEL
VIE
VI
WAIT
NC
NAME
NA
TURE
TU
APPEA
HAN
HA
NNA
NN
TERED
BIT
BI
TON
TO
STILL
SHOULD
XX
CHANGE
CHA
COME
RNING
SIMULANT
ALWAYS
NEW
RG
WANTS
GS
ADD
DES
STRA
STR
SED
ADDRESS
NEXT
ILLE
OTS
LOOKING
EG
INITE
CAM
EC
CAMILLE
EXISTEN
HARD
FIVE
QUE
EXISTENCE
JU
ENCE
AF
DAVE
PLAN
PLA
RN
BAB
BA
THEREFORE
WALKS
OARD
YS
LIE
CTI
LIS
LEAVE
LEA
ANOTHER
REPAIRED
WHOA
CRUEL
COCK
PIT
PI
NICE
NI
THO
SECOND
ISTER
RUN
MODEL
STARTS
AFT
DDE
DD
CKS
GONNA
INFINITE
RRANT
MINUT
AH
OB
EY
MAYBE
MBER
LONG
BUTTERED
STO
CES
INS
LENA
LENAHAN
DEAD
DEA
AFTER
STUFF
QUAR
IDE
SY
QUITE
:
HEAR
DOING
DU
GAME
GA
PROBAB
HOUR
NOL
NO
HAPPENED
LEEP
LIGHT
COCKPIT
MB
VIEW
CAPTAIN
PROBABLY
SER
NEED
BUTT
REEN
OLD
AP
MECH
BEG
EAT
EA
RLY
SUPP
IO
PER
BREA
QUEEG
DLY
BEING
,
STU
TABLE
LISTER
SOU
PICK
PI
LLOW
ELE
EL
MUST
ONS
WHOLE
TOP
CUL
CU
HEL
SENS
SEN
SORT
CALL
MECHAN
.
OWN
UNDERS
UNDER
OPLE
ASK
PEOPLE
CAL
EXPLO
STOPS
ABLE
FUT
FU
GL
EVERYONE
BEH
ARNOL
ARNOLD
ATOR
PLEA
BEHIND
TAND
TAN
SLEEP
SL
SCREEN
SCR
MY
UNDERSTAND
QUARTERS
OGRA
CLEA
SPEE
SPE
SHA
CTIVE
SOR
ISN
TAKES
NEY
DAYS
THANKS
FO
NED
CHEE
CHE
ATED
DDEN
BALL
FUTURE
MUCH
ALSO
MAK
BOARD
BO
TURNS
TURN
SIDE
THOUS
THOUSAND
MAKING
SAID
MR
SHES
UNK
SORRY
THOSE
DU
RING
BIG
TTE
FULLY
ITS
AH
HIGH
HUMAN
ENG
CER
CE
KIND
KI
KILL
CREW
CRE
ELECT
LIVE
WEAR
LIGHTS
GENT
GEN
HOPE
FIND
FIN
NIGHT
SUDDEN
ELSE
WEL
BELIE
DAV
UNI
START
FIC
FI
WALK
THINKING
WANTED
PED
WE
IM
WRO
WR
RL
CHES
ACC
DROID
! (
SAYS
ROOM
QUA
COMPUT
WARDS
LUT
SINGING
LICA
TANT
PY
SEC
OFFICE
CY
FLOO
EVERYTHING
APPEARS
CORNER
Z
PRESSES
KEY
KE
SCEN
CERTAIN
SUDDENLY
TRYING
TAL
WELCOME
THROWS
INTER
INTE
GENTLE
HH