Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Last active February 26, 2024 19:10
Show Gist options
  • Save zacharysyoung/717961ba5f325170faedf9aaf94cdd88 to your computer and use it in GitHub Desktop.
Save zacharysyoung/717961ba5f325170faedf9aaf94cdd88 to your computer and use it in GitHub Desktop.
SO-78062176

I wanted to compare the solutions from JonSG and chepner to see whether either ran noticeably faster (in particular, whether chepner's did), and to check that they only add the BOM without mutating the text along the way.

Both failed, but for different reasons; JonSG's can easily be fixed.

My comparator:

  1. runs and times both functions against a 10MB UTF-8 encoded file of random text that runs the full spectrum of Unicode, minus the UTF-16 surrogate code points (U+D800 to U+DFFF), which cannot be encoded as valid UTF-8
  2. reads the output and asserts the output has a BOM; also chomps the BOM leaving what should be the original UTF-8 bytes
  3. prints results
def compare():
    """Time each converter on FNAME_TXT and check that it only added a BOM.

    For every (output name, converter) pair the converter is run once and
    timed, then the output file is read back: its first three bytes must be
    the UTF-8 BOM, and the remaining bytes are compared with the raw bytes of
    the input file. One result line per converter is printed.

    Raises:
        AssertionError: if an output file does not start with the UTF-8 BOM.
    """
    import time

    for name_out, func in [
        ("output-stream.txt", convert_stream),  # JonSG
        ("output-copy.txt", convert_copy),      # chepner
    ]:
        # perf_counter is the clock intended for timing short durations
        beg = time.perf_counter()
        func(FNAME_TXT, name_out)
        delta = time.perf_counter() - beg

        # read as bytes so no decoding or newline handling affects the check
        with open(FNAME_TXT, "rb") as f:
            expected = f.read()

        with open(name_out, "rb") as f:
            first_three = f.read(3)
            # the bytes inspected come from the output file, so report that name
            assert first_three == b"\xEF\xBB\xBF", f"first three bytes of '{name_out}'={first_three}; want BOM (b'\\xEF\\xBB\\xBF')"  # fmt: skip
            output_sans_bom = f.read()

        print(
            f"{func.__name__} ran in {delta:.4f} s; output==input = {output_sans_bom == expected}"
        )

JonSG's ran in 0.03 seconds, but mutated the text along the way:

convert_stream ran in 0.0363 s; output==input = False

Adding newline='' to the opener for the input file fixes that by stopping the reader from normalizing line endings:

...
    with open(path_in, "r", encoding="utf-8", newline="") as f_in:
        ...
convert_stream ran in 0.0340 s; output==input = True

chepner's fails with some exception in copyfileobj:

Traceback (most recent call last):
  File "/Users/zyoung/develop/StackOverflow/main.py", line 100, in <module>
    main()
  File "/Users/zyoung/develop/StackOverflow/main.py", line 20, in main
    compare()
  File "/Users/zyoung/develop/StackOverflow/main.py", line 33, in compare
    func(FNAME_TXT, name_out)
  File "/Users/zyoung/develop/StackOverflow/main.py", line 64, in convert_copy
    copyfileobj(chain(codecs.BOM_UTF8, in_), out)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/shutil.py", line 201, in copyfileobj
    fsrc_read = fsrc.read
                ^^^^^^^^^
AttributeError: 'itertools.chain' object has no attribute 'read'

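I don't know itertools and/or chaining very well, so I cannot say for certain what the best fix might be, but the traceback shows the immediate cause: copyfileobj calls read() on its source argument, and an itertools.chain object is a plain iterator with no read() method.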

The complete script that generates and runs the comparison can be found below.

import sys
import codecs
from itertools import chain
from shutil import copyfileobj
def main():
    """Dispatch on the first command-line argument.

    ``gen`` generates the 10 MB random input file and ``compare`` runs the
    comparison. A missing or unrecognised argument prints a usage line to
    stderr and exits with status 1.
    """

    def usage():
        print("usage: main.py gen|compare", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module
        sys.exit(1)

    if len(sys.argv) < 2:
        usage()

    cmd = sys.argv[1]
    if cmd == "gen":
        gen_random_txt(tgt_size=10 * MB)
    elif cmd == "compare":
        compare()
    else:
        usage()
def compare():
    """Time each converter on FNAME_TXT and check that it only added a BOM.

    For every (output name, converter) pair the converter is run once and
    timed, then the output file is read back: its first three bytes must be
    the UTF-8 BOM, and the remaining bytes are compared with the raw bytes of
    the input file. One result line per converter is printed.

    Raises:
        AssertionError: if an output file does not start with the UTF-8 BOM.
    """
    import time

    for name_out, func in [
        ("output-stream.txt", convert_stream),  # JonSG
        ("output-copy.txt", convert_copy),  # chepner
    ]:
        # perf_counter is the clock intended for timing short durations
        beg = time.perf_counter()
        func(FNAME_TXT, name_out)
        delta = time.perf_counter() - beg

        # read as bytes so no decoding or newline handling affects the check
        with open(FNAME_TXT, "rb") as f:
            expected = f.read()

        with open(name_out, "rb") as f:
            first_three = f.read(3)
            # the bytes inspected come from the output file, so report that name
            assert first_three == b"\xEF\xBB\xBF", f"first three bytes of '{name_out}'={first_three}; want BOM (b'\\xEF\\xBB\\xBF')"  # fmt: skip
            output_sans_bom = f.read()

        print(
            f"{func.__name__} ran in {delta:.4f} s; output==input = {output_sans_bom == expected}"
        )
def convert_stream(path_in: str, path_out: str):
    """
    Convert a CSV file format to UTF-8-BOM, by streaming

    The input is decoded as UTF-8 and re-encoded with the "utf-8-sig" codec,
    which writes a BOM before the first character. Both files are opened
    with newline="" so line endings pass through unchanged.

    Args:
        path_in: path of the UTF-8 encoded file to read.
        path_out: path of the file to create (or overwrite) with the BOM added.
    """
    # Without newline="" the reader translates "\r\n" and "\r" to "\n",
    # which would change the text instead of only prepending a BOM.
    with open(path_in, "r", encoding="utf-8", newline="") as f_in:
        with open(path_out, "w", encoding="utf-8-sig", newline="") as f_out:
            for row in f_in:
                f_out.write(row)
def convert_copy(path_in: str, path_out: str):
    """
    Convert a CSV file format to UTF-8-BOM, by copying

    Writes the UTF-8 BOM to the output and then copies the input's raw bytes
    after it, so the content is never decoded or re-encoded.

    Args:
        path_in: path of the file to read, opened in binary mode.
        path_out: path of the file to create (or overwrite) with the BOM added.
    """
    with open(path_out, "wb") as out, open(path_in, "rb") as in_:
        # copyfileobj needs a source with a read() method, which an
        # itertools.chain object lacks; write the BOM directly and then
        # stream the rest of the file.
        out.write(codecs.BOM_UTF8)
        copyfileobj(in_, out)
MB = 1024 * 1024  # number of bytes in one mebibyte
FNAME_TXT = "input-rand.txt"  # file written by gen_random_txt and read by compare


def gen_random_txt(tgt_size: int = 1 * MB):
    """Write random UTF-8 text to FNAME_TXT until it holds at least tgt_size bytes.

    Code points are drawn uniformly from U+0000 to U+10FFFF; surrogate code
    points are skipped because they cannot be encoded as UTF-8. Each
    character is encoded before it is written, so the running size is the
    exact number of bytes in the file and the final size overshoots
    tgt_size by at most three bytes.

    Args:
        tgt_size: minimum size of the generated file, in bytes.
    """
    import os
    import random

    size = 0
    # Binary mode: the bytes counted are the bytes written, with no
    # platform-dependent newline translation.
    with open(FNAME_TXT, "wb") as w:
        while size < tgt_size:
            x = random.randint(0, 0x10FFFF)
            # skip utf-16 surrogate halves, https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling
            if 0xD800 <= x <= 0xDFFF:
                continue
            encoded = chr(x).encode("utf-8")
            w.write(encoded)
            size += len(encoded)

    actual_size = os.stat(FNAME_TXT).st_size
    print(f"generated random txt w/size {actual_size}; was aiming for {tgt_size}")
# Run the command-line interface only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment