Skip to content

Instantly share code, notes, and snippets.

@zwegner

zwegner/utf8_prob.py

Created Jun 30, 2020
Embed
What would you like to do?
Calculate probability of N-byte sequence being part of a valid UTF-8 sequence
from fractions import Fraction
ascii = range(0, 0x80)
cont = range(0x80, 0xC0)
cont_2_0 = range(0xA0, 0xC0)
cont_2_2 = range(0x80, 0xA0)
cont_3_0 = range(0x90, 0xC0)
cont_3_2 = range(0x80, 0x90)
leader2 = range(0xC2, 0xE0)
leader3_0 = range(0xE0, 0xE1)
leader3_1 = [*range(0xE1, 0xED), *range(0xEE, 0xF0)]
leader3_2 = range(0xED, 0xEE)
leader4_0 = range(0xF0, 0xF1)
leader4_1 = range(0xF1, 0xF4)
leader4_2 = range(0xF4, 0xF5)
[ANY, NEUTRAL, ONE_CONT, TWO_CONT, LEADER_3_0, LEADER_3_1, LEADER_3_2,
LEADER_4_0, LEADER_4_1, LEADER_4_2, N_STATES, *_] = range(100)
cache = {}
def search(depth, state):
if depth == 0:
return 1
key = (depth, state)
if key in cache:
return cache[key]
ranges = []
# Any: at the beginning of the sequence, we can be in any state
if state == ANY:
if depth > 1:
if depth > 2:
ranges.append((cont, TWO_CONT))
ranges.append((cont, ONE_CONT))
ranges.append((cont, NEUTRAL))
state = NEUTRAL
# FALLTHROUGH kinda
# Neutral: not inside any sequence
if state == NEUTRAL:
ranges.append((ascii, NEUTRAL))
ranges.append((leader2, ONE_CONT))
ranges.append((leader3_0, LEADER_3_0))
ranges.append((leader3_1, LEADER_3_1))
ranges.append((leader3_2, LEADER_3_2))
ranges.append((leader4_0, LEADER_4_0))
ranges.append((leader4_1, LEADER_4_1))
ranges.append((leader4_2, LEADER_4_2))
# One/two cont: need one or two continuation bytes
elif state == ONE_CONT:
ranges.append((cont, NEUTRAL))
elif state == TWO_CONT:
ranges.append((cont, ONE_CONT))
# First continuation byte after 3-byte leader
elif state == LEADER_3_0:
ranges.append((cont_2_0, ONE_CONT))
elif state == LEADER_3_1:
ranges.append((cont, ONE_CONT))
elif state == LEADER_3_2:
ranges.append((cont_2_2, ONE_CONT))
# First continuation byte after 4-byte leader
elif state == LEADER_4_0:
ranges.append((cont_3_0, TWO_CONT))
elif state == LEADER_4_1:
ranges.append((cont, TWO_CONT))
elif state == LEADER_4_2:
ranges.append((cont_3_2, TWO_CONT))
# For each byte range and next state, recurse
legal = 0
for [r, s] in ranges:
legal += len(r) * search(depth - 1, s)
cache[key] = legal
return legal
for d in range(1, 9):
legal = search(d, ANY)
total = 1<<(8*d)
f = Fraction(total-legal, total)
print(d, total, legal, f, float(f))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.