zacharysyoung/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Better way to prepend BOM

I wanted to compare the solutions from JonSG and chepner to see whether either ran noticeably faster (in particular, whether chepner's did), and to check that they only add the BOM (and don't mutate the text along the way).
Both failed, but for different reasons; JonSG's can easily be fixed.
My comparator:

runs and times both functions against a 10 MB UTF-8 encoded file of random text that spans the full range of Unicode code points, minus the UTF-16 surrogate halves (which are invalid in UTF-8)
reads the output and asserts the output has a BOM; also chomps the BOM leaving what should be the original UTF-8 bytes
prints results

def compare():
    """
    Time each BOM-prepending function and verify its output

    For every candidate, convert FNAME_TXT into that candidate's output file,
    check that the output starts with the UTF-8 BOM, and print the elapsed
    time plus whether the bytes after the BOM equal the input bytes exactly.

    Raises AssertionError if an output file does not start with the BOM.
    """
    import time

    bom = b"\xEF\xBB\xBF"

    # The input is the same for every candidate, so read it only once.
    with open(FNAME_TXT, "rb") as f:
        want = f.read()

    for name_out, func in [
        ("output-stream.txt", convert_stream),  # JonSG
        ("output-copy.txt", convert_copy),      # chepner
    ]:
        beg = time.monotonic()
        func(FNAME_TXT, name_out)
        delta = time.monotonic() - beg

        with open(name_out, "rb") as f:
            first_three = f.read(3)
            # Raise explicitly (a bare assert is stripped under -O) and name
            # the file the bytes were actually read from: the output file.
            if first_three != bom:
                raise AssertionError(
                    f"first three bytes of '{name_out}'={first_three}; want BOM (b'\\xEF\\xBB\\xBF')"
                )
            # Everything after the BOM should be the original bytes.
            output_sans_bom = f.read()

        print(
            f"{func.__name__} ran in {delta:.4f} s; output==input = {output_sans_bom == want}"
        )
JonSG's ran in 0.03 seconds, but mutated the text along the way:
convert_stream ran in 0.0363 s; output==input = False

Adding newline='' to the opener for the input file fixes that by stopping the reader from normalizing line endings:
...
    with open(path_in, "r", encoding="utf-8", newline="") as f_in:
        ...
convert_stream ran in 0.0340 s; output==input = True

chepner's fails with some exception in copyfileobj:
Traceback (most recent call last):
  File "/Users/zyoung/develop/StackOverflow/main.py", line 100, in <module>
    main()
  File "/Users/zyoung/develop/StackOverflow/main.py", line 20, in main
    compare()
  File "/Users/zyoung/develop/StackOverflow/main.py", line 33, in compare
    func(FNAME_TXT, name_out)
  File "/Users/zyoung/develop/StackOverflow/main.py", line 64, in convert_copy
    copyfileobj(chain(codecs.BOM_UTF8, in_), out)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/shutil.py", line 201, in copyfileobj
    fsrc_read = fsrc.read
                ^^^^^^^^^
AttributeError: 'itertools.chain' object has no attribute 'read'

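I don't know itertools and/or chaining very well, so I cannot say for certain what the intended fix might be. The traceback does show the immediate cause, though: copyfileobj expects a file-like source with a read() method, and itertools.chain returns a plain iterator that has none.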
The complete script that generates and runs the comparison can be found below.

  
## main.py
import sys

import codecs
from itertools import chain
from shutil import copyfileobj


def main():
    """
    Dispatch on the command given as the first command-line argument

    "gen" generates the 10 MB random input file and "compare" runs the
    comparison. A missing or unknown command prints the usage line to stderr
    and exits with status 1.
    """

    def usage():
        print("usage: main.py gen|compare", file=sys.stderr)
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module for interactive use and is missing under `python -S`.
        sys.exit(1)

    if len(sys.argv) < 2:
        usage()

    cmd = sys.argv[1]
    if cmd == "gen":
        gen_random_txt(tgt_size=10 * MB)
    elif cmd == "compare":
        compare()
    else:
        usage()


def compare():
    """
    Time each BOM-prepending function and verify its output

    For every candidate, convert FNAME_TXT into that candidate's output file,
    check that the output starts with the UTF-8 BOM, and print the elapsed
    time plus whether the bytes after the BOM equal the input bytes exactly.

    Raises AssertionError if an output file does not start with the BOM.
    """
    import time

    bom = b"\xEF\xBB\xBF"

    # The input is the same for every candidate, so read it only once.
    with open(FNAME_TXT, "rb") as f:
        want = f.read()

    for name_out, func in [
        ("output-stream.txt", convert_stream),  # JonSG
        ("output-copy.txt", convert_copy),  # chepner
    ]:
        beg = time.monotonic()
        func(FNAME_TXT, name_out)
        delta = time.monotonic() - beg

        with open(name_out, "rb") as f:
            first_three = f.read(3)
            # Raise explicitly (a bare assert is stripped under -O) and name
            # the file the bytes were actually read from: the output file.
            if first_three != bom:
                raise AssertionError(
                    f"first three bytes of '{name_out}'={first_three}; want BOM (b'\\xEF\\xBB\\xBF')"
                )
            # Everything after the BOM should be the original bytes.
            output_sans_bom = f.read()

        print(
            f"{func.__name__} ran in {delta:.4f} s; output==input = {output_sans_bom == want}"
        )


def convert_stream(path_in: str, path_out: str):
    """
    Convert a CSV file format to UTF-8-BOM, by streaming

    Decodes path_in as UTF-8 line by line and re-encodes it into path_out
    with the "utf-8-sig" codec, which emits the BOM ahead of the text.
    """
    # newline="" on BOTH files disables newline translation. Without it on the
    # reader, "\r\n" and "\r" are normalized to "\n" and the text is mutated.
    with open(path_in, "r", encoding="utf-8", newline="") as f_in:
        with open(path_out, "w", encoding="utf-8-sig", newline="") as f_out:
            for row in f_in:
                f_out.write(row)


def convert_copy(path_in: str, path_out: str):
    """
    Convert a CSV file format to UTF-8-BOM, by copying

    Writes the UTF-8 BOM to path_out, then copies the raw bytes of path_in
    after it without decoding them.
    """
    with open(path_out, "wb") as out, open(path_in, "rb") as in_:
        # copyfileobj() needs a file-like source with a .read() method; an
        # itertools.chain object has none, so write the BOM directly first.
        out.write(codecs.BOM_UTF8)
        copyfileobj(in_, out)


# One mebibyte, in bytes; used to express target file sizes.
MB = 1024 * 1024
# Name of the random-text file that gen_random_txt writes and compare reads.
FNAME_TXT = "input-rand.txt"


def gen_random_txt(tgt_size: int = 1 * MB):
    """
    Write random Unicode text, UTF-8 encoded, to FNAME_TXT

    Code points are drawn uniformly from U+0000..U+10FFFF, skipping the
    surrogate range U+D800..U+DFFF, which cannot be encoded as UTF-8.
    Writing stops once at least tgt_size bytes have been produced, so the
    file can overshoot the target by up to 3 bytes. The resulting file size
    is printed next to the target.
    """
    import os
    import random

    # newline="" stops "\n" from being expanded to the platform line
    # separator, which would make the file larger than the counted size.
    with open(FNAME_TXT, "w", encoding="utf-8", newline="") as w:
        size = 0
        while size < tgt_size:
            # 0x10FFFF is the highest code point; 1_112_063 (the number of
            # scalar values minus one) would leave the top 2048 unreachable.
            x = random.randint(0, 0x10FFFF)
            # skip utf-16 surrogate halves, https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling
            if 0xD800 <= x <= 0xDFFF:
                continue

            char = chr(x)
            w.write(char)

            # Count the real encoded length rather than hand-written range
            # checks, which were off by one at U+007F, U+07FF and U+FFFF.
            size += len(char.encode("utf-8"))

    stat = os.stat(FNAME_TXT)
    print(f"generated random txt w/size {stat.st_size}; was aiming for {tgt_size}")


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()
	import sys

	import codecs
	from itertools import chain
	from shutil import copyfileobj


	def main():
	    """
	    Dispatch on the command given as the first command-line argument

	    "gen" generates the 10 MB random input file and "compare" runs the
	    comparison. A missing or unknown command prints the usage line to stderr
	    and exits with status 1.
	    """

	    def usage():
	        # Plain "|": the backslash before it was a markdown escape that
	        # leaked into the string literal.
	        print("usage: main.py gen|compare", file=sys.stderr)
	        # sys.exit rather than the bare exit(): the latter is injected by the
	        # site module for interactive use and is missing under `python -S`.
	        sys.exit(1)

	    if len(sys.argv) < 2:
	        usage()

	    cmd = sys.argv[1]
	    if cmd == "gen":
	        gen_random_txt(tgt_size=10 * MB)
	    elif cmd == "compare":
	        compare()
	    else:
	        usage()


	def compare():
	    """
	    Time each BOM-prepending function and verify its output

	    For every candidate, convert FNAME_TXT into that candidate's output file,
	    check that the output starts with the UTF-8 BOM, and print the elapsed
	    time plus whether the bytes after the BOM equal the input bytes exactly.

	    Raises AssertionError if an output file does not start with the BOM.
	    """
	    import time

	    bom = b"\xEF\xBB\xBF"

	    # The input is the same for every candidate, so read it only once.
	    with open(FNAME_TXT, "rb") as f:
	        want = f.read()

	    for name_out, func in [
	        ("output-stream.txt", convert_stream),  # JonSG
	        ("output-copy.txt", convert_copy),  # chepner
	    ]:
	        beg = time.monotonic()
	        func(FNAME_TXT, name_out)
	        delta = time.monotonic() - beg

	        with open(name_out, "rb") as f:
	            first_three = f.read(3)
	            # Raise explicitly (a bare assert is stripped under -O) and name
	            # the file the bytes were actually read from: the output file.
	            if first_three != bom:
	                raise AssertionError(
	                    f"first three bytes of '{name_out}'={first_three}; want BOM (b'\\xEF\\xBB\\xBF')"
	                )
	            # Everything after the BOM should be the original bytes.
	            output_sans_bom = f.read()

	        print(
	            f"{func.__name__} ran in {delta:.4f} s; output==input = {output_sans_bom == want}"
	        )


	def convert_stream(path_in: str, path_out: str):
	    """
	    Convert a CSV file format to UTF-8-BOM, by streaming

	    Decodes path_in as UTF-8 line by line and re-encodes it into path_out
	    with the "utf-8-sig" codec, which emits the BOM ahead of the text.
	    """
	    # newline="" on BOTH files disables newline translation. Without it on the
	    # reader, "\r\n" and "\r" are normalized to "\n" and the text is mutated.
	    with open(path_in, "r", encoding="utf-8", newline="") as f_in:
	        with open(path_out, "w", encoding="utf-8-sig", newline="") as f_out:
	            for row in f_in:
	                f_out.write(row)


	def convert_copy(path_in: str, path_out: str):
	    """
	    Convert a CSV file format to UTF-8-BOM, by copying

	    Writes the UTF-8 BOM to path_out, then copies the raw bytes of path_in
	    after it without decoding them.
	    """
	    with open(path_out, "wb") as out, open(path_in, "rb") as in_:
	        # copyfileobj() needs a file-like source with a .read() method; an
	        # itertools.chain object has none, so write the BOM directly first.
	        out.write(codecs.BOM_UTF8)
	        copyfileobj(in_, out)


	# One mebibyte, in bytes; used to express target file sizes.
	MB = 1024 * 1024
	# Name of the random-text file that gen_random_txt writes and compare reads.
	FNAME_TXT = "input-rand.txt"


	def gen_random_txt(tgt_size: int = 1 * MB):
	    """
	    Write random Unicode text, UTF-8 encoded, to FNAME_TXT

	    Code points are drawn uniformly from U+0000..U+10FFFF, skipping the
	    surrogate range U+D800..U+DFFF, which cannot be encoded as UTF-8.
	    Writing stops once at least tgt_size bytes have been produced, so the
	    file can overshoot the target by up to 3 bytes. The resulting file size
	    is printed next to the target.
	    """
	    import os
	    import random

	    # newline="" stops "\n" from being expanded to the platform line
	    # separator, which would make the file larger than the counted size.
	    with open(FNAME_TXT, "w", encoding="utf-8", newline="") as w:
	        size = 0
	        while size < tgt_size:
	            # 0x10FFFF is the highest code point; 1_112_063 (the number of
	            # scalar values minus one) would leave the top 2048 unreachable.
	            x = random.randint(0, 0x10FFFF)
	            # skip utf-16 surrogate halves, https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling
	            if 0xD800 <= x <= 0xDFFF:
	                continue

	            char = chr(x)
	            w.write(char)

	            # Count the real encoded length rather than hand-written range
	            # checks, which were off by one at U+007F, U+07FF and U+FFFF.
	            size += len(char.encode("utf-8"))

	    stat = os.stat(FNAME_TXT)
	    print(f"generated random txt w/size {stat.st_size}; was aiming for {tgt_size}")


	# Run only when executed as a script, so importing this module has no side effects.
	if __name__ == "__main__":
	    main()