Skip to content

Instantly share code, notes, and snippets.

@Mr0grog
Created October 13, 2023 19:14
Show Gist options
  • Save Mr0grog/ff7c1744a7df182305589e6d1a9a823b to your computer and use it in GitHub Desktop.
Save Mr0grog/ff7c1744a7df182305589e6d1a9a823b to your computer and use it in GitHub Desktop.
BOM Check Benchmark (Python)

Fastest way to check for a BOM in Python

Results (seconds per 100,000 passes over the 10 example buffers; min and mean of 10 repeats):

Testing nesting...
             Baseline (A):: Min: 0.29188, mean: 0.29287
              Nesting (B):: Min: 0.25730, mean: 0.25863
 Nesting w/ Substring (C):: Min: 0.25488, mean: 0.25534
 Nesting w/ each byte (D):: Min: 0.24767, mean: 0.24830

Testing early fail...
             Baseline (A):: Min: 0.29171, mean: 0.29275
 Compare empty buffer (E):: Min: 0.26644, mean: 0.26697
  Check buffer length (F):: Min: 0.23984, mean: 0.24024
             No elses (G):: Min: 0.29146, mean: 0.29253

                      D+F:: Min: 0.20688, mean: 0.20748
import timeit
from statistics import mean
# ALGORITHMS ##################################################################
def get_bom_encoding_a(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Baseline variant (A): one flat ``if``/``elif`` chain of ``startswith``
    checks. The exact shape of the chain is what the benchmark measures.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark, so
    # the longer pattern has to be tested before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_b(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Nesting variant (B): the shared ``FF FE`` prefix is tested once, and the
    UTF-32 LE / UTF-16 LE decision is made inside that branch with a second
    ``startswith`` call.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Both little-endian marks start with FF FE; two trailing NUL bytes
        # distinguish UTF-32 LE from UTF-16 LE.
        if buffer.startswith(b"\xFF\xFE\x00\x00"):
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_c(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Nesting-with-substring variant (C): after matching the shared ``FF FE``
    prefix, the next two bytes are compared as a slice.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Slicing never raises on a short buffer; it just yields fewer bytes,
        # which then fail the comparison and fall through to UTF-16 LE.
        if buffer[2:4] == b"\x00\x00":
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_d(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Nesting-with-each-byte variant (D): after matching the shared ``FF FE``
    prefix, the third and fourth bytes are inspected individually by index.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Indexing past the end raises IndexError, so make sure bytes 2 and 3
        # exist first; a buffer holding only the 2-byte UTF-16 LE mark (or
        # the mark plus one byte) must still be reported as UTF-16 LE.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_e(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Compare-empty-buffer variant (E): the baseline chain preceded by an
    early exit for an empty buffer.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    # Early exit: an empty buffer cannot start with any mark.
    if buffer == b'':
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark, so
    # the longer pattern has to be tested before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_f(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Check-buffer-length variant (F): the baseline chain preceded by an early
    exit for buffers too short to hold any of the marks.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    # Early exit: the shortest mark tested below is two bytes long.
    if len(buffer) < 2:
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark, so
    # the longer pattern has to be tested before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
def get_bom_encoding_g(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    No-elses variant (G): the same checks as the baseline, written as
    independent ``if`` statements that each return, with a final fallback
    ``return`` instead of an ``else`` branch.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark, so
    # the longer pattern has to be tested before the shorter one.
    if buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    if buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    if buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    if buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    return None
def get_bom_encoding_df(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Combined variant (D+F): the early length exit of variant F followed by
    the nested, per-byte little-endian check of variant D.

    Args:
        buffer: The raw bytes to inspect.

    Returns:
        One of 'utf_8_sig', 'utf_32_le', 'utf_32_be', 'utf_16_le' or
        'utf_16_be' when the buffer starts with the matching byte order
        mark, otherwise None.
    """
    # Early exit: the shortest mark tested below is two bytes long.
    if len(buffer) < 2:
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # The length check above only guarantees two bytes. Indexing bytes 2
        # and 3 of a 2- or 3-byte buffer raises IndexError, so confirm they
        # exist before looking at them.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
# BENCHMARK ###################################################################
# Sample buffers fed to every detector on each benchmark pass.
examples = (
    # No byte order mark: an empty buffer, a single byte, and short ASCII.
    b"",
    "a".encode("ascii"),
    "abcdefg".encode("ascii"),
    # The 'utf_8_sig' codec writes the UTF-8 mark itself.
    "abcdefg".encode("utf_8_sig"),
    # The explicit-endianness codecs write no mark, so one is prepended.
    b"\xff\xfe" + "abcdefg".encode("utf_16_le"),
    b"\xfe\xff" + "abcdefg".encode("utf_16_be"),
    b"\xff\xfe\x00\x00" + "abcdefg".encode("utf_32_le"),
    b"\x00\x00\xfe\xff" + "abcdefg".encode("utf_32_be"),
    # Other encodings of the same text, without any mark.
    "abcdefg".encode("shift_jis"),
    "abcdefg".encode("big5"),
)
def runner(testfunc):
    """Wrap *testfunc* in a zero-argument callable suitable for timing.

    Args:
        testfunc: A callable that accepts one buffer from ``examples``.

    Returns:
        A function taking no arguments that calls ``testfunc`` once for
        every entry of the module-level ``examples`` tuple, in order, and
        discards the results.
    """
    def run_all_examples():
        # ``examples`` is looked up when the wrapper runs, not when it is
        # created.
        for sample in examples:
            testfunc(sample)

    return run_all_examples
def dotest(label, testfunc, *, number=100_000, repeat=10):
    """Time *testfunc* over all example buffers and print a summary line.

    Args:
        label: Text identifying the variant; right-aligned in a 25-column
            field in the printed line.
        testfunc: The detector to benchmark; it is wrapped with ``runner``
            so each timed call processes every example buffer once.
        number: How many times the wrapped function is called per timing
            run. Defaults to the original hard-coded 100,000.
        repeat: How many timing runs to perform. Defaults to the original
            hard-coded 10.

    Returns:
        None. The minimum and mean of the per-run totals (in seconds) are
        printed to standard output.
    """
    times = timeit.repeat(runner(testfunc), number=number, repeat=repeat)
    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')
def timeall():
    """Run every benchmark group and print the results.

    Two headed groups are timed (nesting strategies, then early-exit
    strategies), each starting with the baseline and followed by a blank
    line. The combined D+F variant is timed last, on its own.
    """
    nesting_variants = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Nesting (B)', get_bom_encoding_b),
        ('Nesting w/ Substring (C)', get_bom_encoding_c),
        ('Nesting w/ each byte (D)', get_bom_encoding_d),
    )
    early_fail_variants = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Compare empty buffer (E)', get_bom_encoding_e),
        ('Check buffer length (F)', get_bom_encoding_f),
        ('No elses (G)', get_bom_encoding_g),
    )
    groups = (
        ('Testing nesting...', nesting_variants),
        ('Testing early fail...', early_fail_variants),
    )
    for heading, variants in groups:
        print(heading)
        for label, detector in variants:
            dotest(label, detector)
        print('')
    dotest('D+F', get_bom_encoding_df)
# Run the benchmark only when executed as a script, so that importing this
# module (e.g. to reuse a detector) does not trigger a multi-second timing run.
if __name__ == "__main__":
    timeall()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment