Mr0grog/0-README.md

## 0-README.md

      
    Raw
  

              0-README.md
            
          
    Fastest way to check for a BOM in Python

Results:
Testing nesting...
             Baseline (A):: Min: 0.29188, mean: 0.29287
              Nesting (B):: Min: 0.25730, mean: 0.25863
 Nesting w/ Substring (C):: Min: 0.25488, mean: 0.25534
 Nesting w/ each byte (D):: Min: 0.24767, mean: 0.24830

Testing early fail...
             Baseline (A):: Min: 0.29171, mean: 0.29275
 Compare empty buffer (E):: Min: 0.26644, mean: 0.26697
  Check buffer length (F):: Min: 0.23984, mean: 0.24024
             No elses (G):: Min: 0.29146, mean: 0.29253

                      D+F:: Min: 0.20688, mean: 0.20748


## bom-benchmark.py
import timeit
from statistics import mean

# ALGORITHMS ##################################################################
def get_bom_encoding_a(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant A (baseline): a flat ``if``/``elif`` chain with one
    ``startswith`` test per BOM.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE),
    # so it has to be tested before the UTF-16 LE check below.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_b(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant B (nesting): the shared FF FE prefix is tested once, and a
    second, nested ``startswith`` against the full four-byte BOM separates
    UTF-32 LE from UTF-16 LE.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # FF FE followed by 00 00 is reported as UTF-32 LE; anything else
        # after FF FE is reported as UTF-16 LE.
        if buffer.startswith(b"\xFF\xFE\x00\x00"):
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_c(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant C (nesting with a substring): like the nested variant, but the
    inner test compares the slice ``buffer[2:4]`` with two zero bytes
    instead of calling ``startswith`` a second time.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Slicing past the end yields a shorter bytes object rather than
        # raising, so this comparison is simply False for short buffers.
        if buffer[2:4] == b"\x00\x00":
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_d(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant D (nesting with each byte): the shared FF FE prefix is tested
    once, then bytes 2 and 3 are compared individually with zero to separate
    UTF-32 LE from UTF-16 LE.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Indexing (unlike slicing) raises IndexError past the end, so make
        # sure bytes 2 and 3 exist: a buffer may hold just the UTF-16 LE BOM
        # with fewer than two bytes after it.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_e(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant E (compare empty buffer): the baseline ``if``/``elif`` chain,
    preceded by an equality test that returns None right away when
    ``buffer`` is empty.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    # Early exit: an empty buffer cannot start with any BOM.
    if buffer == b'':
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested first.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_f(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant F (check buffer length): the baseline ``if``/``elif`` chain,
    preceded by a length test that returns None right away for buffers
    shorter than two bytes.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    # Early exit: the shortest BOM tested below is two bytes long.
    if len(buffer) < 2:
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested first.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

def get_bom_encoding_g(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant G (no elses): the same tests as the baseline, written as
    independent ``if`` statements that each return early, with a final
    ``return None`` instead of ``elif``/``else`` branches.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested first.
    if buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    if buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    if buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    if buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    return None

def get_bom_encoding_df(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Variant D+F: an up-front length test returns None for buffers shorter
    than two bytes, the shared FF FE prefix is tested once, and bytes 2 and
    3 are then compared individually with zero to separate UTF-32 LE from
    UTF-16 LE.

    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none of
    those BOMs.
    """
    # Early exit: the shortest BOM tested below is two bytes long.
    if len(buffer) < 2:
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # The guard above only guarantees two bytes. Indexing raises
        # IndexError past the end, so confirm bytes 2 and 3 exist first.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None


# BENCHMARK ###################################################################
# Buffers every algorithm is run against on each timing pass.
examples = (
    bytes(),                                              # empty buffer
    b'a',                                                 # a single ASCII byte
    b'abcdefg',                                           # ASCII text, no BOM
    b'\xEF\xBB\xBF' + b'abcdefg',                         # UTF-8 BOM + text
    b'\xFF\xFE' + 'abcdefg'.encode('utf_16_le'),          # UTF-16 LE BOM + text
    b'\xFE\xFF' + 'abcdefg'.encode('utf_16_be'),          # UTF-16 BE BOM + text
    b'\xFF\xFE\x00\x00' + 'abcdefg'.encode('utf_32_le'),  # UTF-32 LE BOM + text
    b'\x00\x00\xFE\xFF' + 'abcdefg'.encode('utf_32_be'),  # UTF-32 BE BOM + text
    'abcdefg'.encode('shift_jis'),                        # BOM-less Shift JIS
    'abcdefg'.encode('big5'),                             # BOM-less Big5
)

def runner(testfunc):
    """Build a zero-argument callable that feeds every example to ``testfunc``.

    The returned callable calls ``testfunc`` once per entry of the
    module-level ``examples`` tuple and discards the results.
    """
    def run_all_examples():
        for sample in examples:
            testfunc(sample)

    return run_all_examples

def dotest(label, testfunc):
    """Time ``testfunc`` over all examples and print a one-line summary.

    Performs 10 repetitions of 100,000 passes each and prints the fastest
    and the mean repetition time, with ``label`` right-aligned to 25 columns.
    """
    timed_pass = runner(testfunc)
    times = timeit.repeat(timed_pass, repeat=10, number=100_000)
    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')

def timeall():
    """Run every benchmark group and print one result line per algorithm.

    Two headed groups are timed (nesting strategies and early-fail
    strategies, each including the baseline A), followed by the combined
    D+F variant.
    """
    nesting_cases = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Nesting (B)', get_bom_encoding_b),
        ('Nesting w/ Substring (C)', get_bom_encoding_c),
        ('Nesting w/ each byte (D)', get_bom_encoding_d),
    )
    early_fail_cases = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Compare empty buffer (E)', get_bom_encoding_e),
        ('Check buffer length (F)', get_bom_encoding_f),
        ('No elses (G)', get_bom_encoding_g),
    )
    groups = (
        ('Testing nesting...', nesting_cases),
        ('Testing early fail...', early_fail_cases),
    )

    for heading, cases in groups:
        print(heading)
        for label, candidate in cases:
            dotest(label, candidate)
        # Blank line separating this group from whatever is printed next.
        print('')

    dotest('D+F', get_bom_encoding_df)

# Run the whole benchmark suite as soon as this file is executed.
timeall()
	import timeit
	from statistics import mean

	# ALGORITHMS ##################################################################
	def get_bom_encoding_a(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant A (baseline): a flat ``if``/``elif`` chain with one
	    ``startswith`` test per BOM.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    # The UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM
	    # (FF FE), so it has to be tested before the UTF-16 LE check below.
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_b(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant B (nesting): the shared FF FE prefix is tested once, and a
	    second, nested ``startswith`` against the full four-byte BOM separates
	    UTF-32 LE from UTF-16 LE.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # FF FE followed by 00 00 is reported as UTF-32 LE; anything else
	        # after FF FE is reported as UTF-16 LE.
	        if buffer.startswith(b"\xFF\xFE\x00\x00"):
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_c(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant C (nesting with a substring): like the nested variant, but the
	    inner test compares the slice ``buffer[2:4]`` with two zero bytes
	    instead of calling ``startswith`` a second time.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # Slicing past the end yields a shorter bytes object rather than
	        # raising, so this comparison is simply False for short buffers.
	        if buffer[2:4] == b"\x00\x00":
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_d(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant D (nesting with each byte): the shared FF FE prefix is tested
	    once, then bytes 2 and 3 are compared individually with zero to
	    separate UTF-32 LE from UTF-16 LE.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # Indexing (unlike slicing) raises IndexError past the end, so
	        # make sure bytes 2 and 3 exist: a buffer may hold just the
	        # UTF-16 LE BOM with fewer than two bytes after it.
	        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_e(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant E (compare empty buffer): the baseline ``if``/``elif`` chain,
	    preceded by an equality test that returns None right away when
	    ``buffer`` is empty.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    # Early exit: an empty buffer cannot start with any BOM.
	    if buffer == b'':
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested
	    # first.
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_f(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant F (check buffer length): the baseline ``if``/``elif`` chain,
	    preceded by a length test that returns None right away for buffers
	    shorter than two bytes.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    # Early exit: the shortest BOM tested below is two bytes long.
	    if len(buffer) < 2:
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested
	    # first.
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_g(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant G (no elses): the same tests as the baseline, written as
	    independent ``if`` statements that each return early, with a final
	    ``return None`` instead of ``elif``/``else`` branches.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so it is tested
	    # first.
	    if buffer.startswith(b"\xFF\xFE\x00\x00"):
	        return 'utf_32_le'
	    if buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    if buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    if buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    return None

	def get_bom_encoding_df(buffer: bytes):
	    """Return the encoding detected from a BOM at the start of some bytes.

	    Variant D+F: an up-front length test returns None for buffers shorter
	    than two bytes, the shared FF FE prefix is tested once, and bytes 2
	    and 3 are then compared individually with zero to separate UTF-32 LE
	    from UTF-16 LE.

	    Returns the codec name ('utf_8_sig', 'utf_32_le', 'utf_32_be',
	    'utf_16_le' or 'utf_16_be'), or None when ``buffer`` starts with none
	    of those BOMs.
	    """
	    # Early exit: the shortest BOM tested below is two bytes long.
	    if len(buffer) < 2:
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # The guard above only guarantees two bytes. Indexing raises
	        # IndexError past the end, so confirm bytes 2 and 3 exist first.
	        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None


	# BENCHMARK ###################################################################
	# Buffers every algorithm is run against on each timing pass.
	examples = (
	    bytes(),                                              # empty buffer
	    b'a',                                                 # a single ASCII byte
	    b'abcdefg',                                           # ASCII text, no BOM
	    b'\xEF\xBB\xBF' + b'abcdefg',                         # UTF-8 BOM + text
	    b'\xFF\xFE' + 'abcdefg'.encode('utf_16_le'),          # UTF-16 LE BOM + text
	    b'\xFE\xFF' + 'abcdefg'.encode('utf_16_be'),          # UTF-16 BE BOM + text
	    b'\xFF\xFE\x00\x00' + 'abcdefg'.encode('utf_32_le'),  # UTF-32 LE BOM + text
	    b'\x00\x00\xFE\xFF' + 'abcdefg'.encode('utf_32_be'),  # UTF-32 BE BOM + text
	    'abcdefg'.encode('shift_jis'),                        # BOM-less Shift JIS
	    'abcdefg'.encode('big5'),                             # BOM-less Big5
	)

	def runner(testfunc):
	    """Build a zero-argument callable that feeds every example to ``testfunc``.

	    The returned callable calls ``testfunc`` once per entry of the
	    module-level ``examples`` tuple and discards the results.
	    """
	    def wrapped():
	        for raw in examples:
	            testfunc(raw)
	    return wrapped

	def dotest(label, testfunc):
	    """Time ``testfunc`` over all examples and print a one-line summary.

	    Performs 10 repetitions of 100,000 passes each and prints the fastest
	    and the mean repetition time, with ``label`` right-aligned to 25
	    columns.
	    """
	    times = timeit.repeat(runner(testfunc), number=100_000, repeat=10)
	    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')

	def timeall():
	    """Run every benchmark group and print one result line per algorithm.

	    Two headed groups are timed (nesting strategies and early-fail
	    strategies, each including the baseline A), followed by the combined
	    D+F variant.
	    """
	    print('Testing nesting...')
	    dotest('Baseline (A)', get_bom_encoding_a)
	    dotest('Nesting (B)', get_bom_encoding_b)
	    dotest('Nesting w/ Substring (C)', get_bom_encoding_c)
	    dotest('Nesting w/ each byte (D)', get_bom_encoding_d)
	    print('')
	    print('Testing early fail...')
	    dotest('Baseline (A)', get_bom_encoding_a)
	    dotest('Compare empty buffer (E)', get_bom_encoding_e)
	    dotest('Check buffer length (F)', get_bom_encoding_f)
	    dotest('No elses (G)', get_bom_encoding_g)
	    print('')
	    dotest('D+F', get_bom_encoding_df)

	# Run the whole benchmark suite as soon as this code is executed.
	timeall()