# jonathanslenders/line_endings.py

## line_endings.py
"""
Output:

    re solution 8.718856811523438
    clever solution 3.7440919876098633
"""
from itertools import accumulate, chain
import re
import time


# Benchmark input: 10,000,000 lines, each 100 "a" characters followed by "\n"
# (1,010,000,000 characters in total).
text = (("a" * 100) + "\n") * 10000000


# A line ending is either "\n" (which also terminates a "\r\n" pair) or a
# "\r" that is not immediately followed by "\n".
newline_pattern = re.compile(r"\n|\r(?!\n)")


def re_solution(text: str) -> list[int]:
    r"""Return the offset just past every line ending in *text*.

    Line endings are located with ``newline_pattern``, so ``\n``, a lone
    ``\r`` and ``\r\n`` each count as exactly one line ending.  Offsets are
    reported in ascending order.
    """
    return [hit.end() for hit in newline_pattern.finditer(text)]


def clever_solution(text: str, chunk_size: int = 1000) -> list[int]:
    r"""Return the offset just past every line ending in *text*.

    The text is walked in slices of *chunk_size* characters.  Each slice is
    split with ``str.splitlines(True)`` and the running sum of the piece
    lengths, seeded with the slice's start offset, yields absolute offsets.

    A ``\r\n`` pair counts as a single line ending, also when it straddles a
    slice boundary: the slice is then extended by one character so the pair
    is never cut in half.

    Note: ``str.splitlines`` recognises more separators than ``\n``, ``\r``
    and ``\r\n`` (for example form feed or U+2028), so text containing those
    may yield additional offsets that a plain ``\n``/``\r`` scan would not
    report.

    Args:
        text: The string to scan.
        chunk_size: Number of characters handled per slice; must be >= 1.

    Returns:
        Ascending offsets, one per line ending, each pointing at the first
        character after that line ending.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    line_positions: list[int] = []
    text_length = len(text)
    pos = 0

    # Process in chunks, to prevent using too much memory.
    while pos < text_length:
        end = pos + chunk_size
        # If the cut would fall between "\r" and "\n", move it one character
        # to the right.  Otherwise the "\r" would close a line in this chunk
        # and the "\n" would close another (empty) line in the next one.
        if text[end - 1 : end + 1] == "\r\n":
            end += 1
        chunk = text[pos:end]

        # Use splitlines(), faster than anything else, even though it copies
        # data. We gain by not doing a single Python call and have itertools
        # do all the work.
        length_iterator = accumulate(chain([pos], map(len, chunk.splitlines(True))))
        next(length_iterator)  # Skip the seed value (the chunk's own offset).
        line_positions.extend(length_iterator)

        # The final piece of a chunk only marks a line ending when the chunk
        # actually ends on one; otherwise it is a partial line that continues
        # in the next chunk (or runs to the end of the text).
        if not chunk.endswith(("\r", "\n")):
            line_positions.pop()
        pos = end

    return line_positions


# Time both implementations on the same input.  time.perf_counter() is the
# clock intended for measuring short durations, whereas time.time() follows
# the wall clock.
start = time.perf_counter()
a = re_solution(text)
end = time.perf_counter()
print("re solution", end - start)

start = time.perf_counter()
b = clever_solution(text)
end = time.perf_counter()
print("clever solution", end - start)

# Both implementations must report identical offsets.
assert a == b
	"""
	Output:

	re solution 8.718856811523438
	clever solution 3.7440919876098633
	"""
	from itertools import accumulate, chain
	import re
	import time


	# Benchmark input: 10,000,000 lines, each 100 "a" characters followed by
	# "\n" (1,010,000,000 characters in total).
	text = (("a" * 100) + "\n") * 10000000


	# A line ending is either "\n" (which also terminates a "\r\n" pair) or a
	# "\r" that is not immediately followed by "\n".  The "|" must be a real
	# alternation; an escaped "\|" would match a literal pipe character.
	newline_pattern = re.compile(r"\n|\r(?!\n)")


	def re_solution(text: str) -> list[int]:
	    r"""Return the offset just past every line ending in *text*.

	    Line endings are located with ``newline_pattern``, so ``\n``, a lone
	    ``\r`` and ``\r\n`` each count as exactly one line ending.  Offsets
	    are reported in ascending order.
	    """
	    return [match.end() for match in newline_pattern.finditer(text)]


	def clever_solution(text: str, chunk_size: int = 1000) -> list[int]:
	    r"""Return the offset just past every line ending in *text*.

	    The text is walked in slices of *chunk_size* characters.  Each slice
	    is split with ``str.splitlines(True)`` and the running sum of the
	    piece lengths, seeded with the slice's start offset, yields absolute
	    offsets.

	    A ``\r\n`` pair counts as a single line ending, also when it straddles
	    a slice boundary: the slice is then extended by one character so the
	    pair is never cut in half.

	    Note: ``str.splitlines`` recognises more separators than ``\n``,
	    ``\r`` and ``\r\n`` (for example form feed or U+2028), so text
	    containing those may yield additional offsets that a plain
	    ``\n``/``\r`` scan would not report.

	    Args:
	        text: The string to scan.
	        chunk_size: Number of characters handled per slice; must be >= 1.

	    Returns:
	        Ascending offsets, one per line ending, each pointing at the first
	        character after that line ending.

	    Raises:
	        ValueError: If *chunk_size* is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be a positive integer")

	    line_positions: list[int] = []
	    text_length = len(text)
	    pos = 0

	    # Process in chunks, to prevent using too much memory.
	    while pos < text_length:
	        end = pos + chunk_size
	        # If the cut would fall between "\r" and "\n", move it one
	        # character to the right.  Otherwise the "\r" would close a line
	        # in this chunk and the "\n" another (empty) line in the next one.
	        if text[end - 1 : end + 1] == "\r\n":
	            end += 1
	        chunk = text[pos:end]

	        # Use splitlines(), faster than anything else, even though it
	        # copies data. We gain by not doing a single Python call and have
	        # itertools do all the work.
	        length_iterator = accumulate(chain([pos], map(len, chunk.splitlines(True))))
	        next(length_iterator)  # Skip the seed value (the chunk's offset).
	        line_positions.extend(length_iterator)

	        # The final piece of a chunk only marks a line ending when the
	        # chunk actually ends on one; otherwise it is a partial line.
	        if not chunk.endswith(("\r", "\n")):
	            line_positions.pop()
	        pos = end

	    return line_positions


	# Time both implementations on the same input.  time.perf_counter() is the
	# clock intended for measuring short durations, whereas time.time()
	# follows the wall clock.
	start = time.perf_counter()
	a = re_solution(text)
	end = time.perf_counter()
	print("re solution", end - start)

	start = time.perf_counter()
	b = clever_solution(text)
	end = time.perf_counter()
	print("clever solution", end - start)

	# Both implementations must report identical offsets.
	assert a == b