Skip to content

Instantly share code, notes, and snippets.

@jonathanslenders
Created June 17, 2022 18:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jonathanslenders/bfca8e4f318ca64e718b4085a737accf to your computer and use it in GitHub Desktop.
Save jonathanslenders/bfca8e4f318ca64e718b4085a737accf to your computer and use it in GitHub Desktop.
Finding line endings
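"""
Benchmark two ways of finding the offset just past every line ending in a
large string: a regular-expression scan versus chunked ``str.splitlines()``.

Example output (elapsed seconds):
re solution 8.718856811523438
clever solution 3.7440919876098633
"""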
from itertools import accumulate, chain
import re
import time
# Benchmark input: 10,000,000 lines, each 100 "a" characters followed by "\n"
# (1,010,000,000 characters in total, built in memory as a single string).
text = (("a" * 100) + "\n") * 10000000
# Matches one line ending: "\n", or a "\r" that is not followed by "\n".
# A "\r\n" pair therefore produces a single match, on its "\n".
newline_pattern = re.compile(r"\n|\r(?!\n)")


def re_solution(text: str) -> list[int]:
    r"""Return the offset just past every line ending in *text*, via regex.

    Line endings are ``\n``, ``\r\n`` and a lone ``\r``. Each returned value
    is the index of the first character after one line ending, in ascending
    order. Text that contains no line ending yields an empty list.
    """
    return [found.end() for found in newline_pattern.finditer(text)]
# Code points that str.splitlines() treats as line boundaries besides "\n" and
# "\r". Each one is swapped for a space before splitting, so the string keeps
# its length and only "\n", "\r" and "\r\n" are counted as line endings.
_EXTRA_LINE_BOUNDARIES = dict.fromkeys(
    map(ord, "\v\f\x1c\x1d\x1e\x85\u2028\u2029"), " "
)


def clever_solution(text: str, chunk_size: int = 1000) -> list[int]:
    r"""Return the offset just past every line ending in *text*.

    Line endings are ``\n``, ``\r\n`` and a lone ``\r``. Each returned value
    is the index of the first character after one line ending, in ascending
    order. The text is split with ``str.splitlines(True)`` one chunk at a
    time and the piece lengths are summed with ``itertools.accumulate``, so
    no Python-level code runs per line.

    Args:
        text: The string to scan.
        chunk_size: Number of characters handled per ``splitlines()`` call.
            Bounds the size of the temporary copies made while scanning.

    Returns:
        The list of offsets, empty when *text* contains no line ending.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    line_positions: list[int] = []

    # Process in chunks, to prevent using too much memory.
    for pos in range(0, len(text), chunk_size):
        chunk_end = pos + chunk_size
        # Neutralise the extra boundaries splitlines() knows about; the
        # replacement is one-for-one, so offsets are unaffected.
        chunk = text[pos:chunk_end].translate(_EXTRA_LINE_BOUNDARIES)

        # Running sum of the piece lengths, seeded with the chunk's own start
        # offset so the results are absolute positions in `text`.
        length_iterator = accumulate(chain([pos], map(len, chunk.splitlines(True))))
        next(length_iterator)  # Skip the seed itself.
        line_positions.extend(length_iterator)

        # The last offset recorded is always the end of the chunk. Keep it
        # only when the chunk stops right after a complete line ending.
        if chunk.endswith("\n"):
            continue
        # A trailing "\r" is complete unless the next chunk starts with "\n";
        # in that case the pair is one line ending, recorded by the next chunk.
        if chunk.endswith("\r") and not text.startswith("\n", chunk_end):
            continue
        line_positions.pop()

    return line_positions
# Time both implementations on the same input. perf_counter() is used instead
# of time() because it is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; the values printed are seconds.
start = time.perf_counter()
a = re_solution(text)
end = time.perf_counter()
print("re solution", end - start)

start = time.perf_counter()
b = clever_solution(text)
end = time.perf_counter()
print("clever solution", end - start)

# Both approaches must report exactly the same offsets.
assert a == b
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment