# almarklein/readlines_reversed.py

## readlines_reversed.py
"""
Implementation of an iterator that yields the lines in a file in reverse order.
Copyright (C) 2018, Almar Klein
BSD licensed.
"""

## ===== The code =====


def readlines_reversed(f, *, buffersize=64, maxbuffersize=4096):
    """Iterate over the lines of a binary file, last line first.

    The file must be seekable and open in 'rb' mode. Lines are yielded
    undecoded (as bytes) and keep their line ending. Line breaks are found
    the way ``bytes.splitlines(True)`` finds them: LF, CR and CRLF each end
    a line. Joining the yielded lines in reverse order gives back the exact
    file content, so reversing a file twice reproduces it.

    The file is read backwards in blocks. The file position is moved by
    this generator and is not restored afterwards.

    Parameters:
        f: seekable file object opened in binary mode.
        buffersize: initial block size in bytes. It is doubled before every
            read (the first one included) until it reaches maxbuffersize.
        maxbuffersize: upper limit for the block size in bytes.

    Yields:
        bytes: the lines of the file, from the last one to the first one.

    Raises:
        ValueError: when iteration starts, if buffersize or maxbuffersize
            is smaller than 1 (the read position could never move back).
    """
    if buffersize < 1 or maxbuffersize < 1:
        raise ValueError("buffersize and maxbuffersize must be at least 1")
    # First (possibly incomplete) line of the block that was read last; its
    # real beginning may lie in a block that still has to be read.
    head = b""
    f.seek(0, 2)  # whence=2: position relative to the end of the file
    t = f.tell()
    while t > 0:
        # Read the block that ends where the previous block started
        buffersize = min(buffersize * 2, maxbuffersize)
        tprev = t
        t = max(0, t - buffersize)
        f.seek(t)
        lines = f.read(tprev - t).splitlines(True)
        if not lines:
            # Nothing was returned for this range (e.g. the file shrank
            # after its size was taken); keep walking towards the start.
            continue
        # Align to line breaks
        if not lines[-1].endswith((b"\n", b"\r")):
            lines[-1] += head  # current tail is the start of the previous head
        elif head == b"\n" and lines[-1].endswith(b"\r"):
            lines[-1] += head  # the block boundary fell inside a \r\n pair
        elif head:
            lines.append(head)  # previous head was a complete line
        # The first line may continue further back, so hold it for the next block
        head = lines.pop(0)  # can be b'\n' (ok)
        # Emit the complete lines of this block, last one first
        yield from reversed(lines)
    if head:
        yield head


## ===== The tests =====


import io
import random


# Small multi-line sample text that includes empty lines. None of the tests
# visible in this file reference it.
LINES1 = """
foo
bar

x

"""

def reversetext(text):
    """Return *text* with its lines in reverse order.

    The text is encoded to bytes, wrapped in an in-memory binary stream and
    fed to readlines_reversed(); the yielded lines are glued together and
    decoded again.
    """
    stream = io.BytesIO(text.encode())
    pieces = list(readlines_reversed(stream))
    return b''.join(pieces).decode()


def splitted(text):
    """Return the lines of *text* (line endings kept) in their original order.

    readlines_reversed() produces the lines back to front, so the decoded
    lines are flipped once more before they are returned.
    """
    stream = io.BytesIO(text.encode())
    backwards = [raw.decode() for raw in readlines_reversed(stream)]
    return backwards[::-1]


def test_readlines_reserved():
    """Check readlines_reversed() on small hand-picked texts."""

    # Empty text: nothing to reverse, nothing to split
    assert reversetext('') == ''
    assert splitted('') == []

    # A text of one character is its own reversal
    for c in 'x \t\r\n€Ř"\\\'"':
        assert reversetext(c) == c

    # (text, expected lines in file order)
    expectations = (
        # Single chars
        ('x', ['x']),
        (' ', [' ']),
        ('\n', ['\n']),
        ('\r', ['\r']),
        # Few chars
        ('x\n', ['x\n']),
        ('x\r', ['x\r']),
        ('x\r\n', ['x\r\n']),
        ('x\n ', ['x\n', ' ']),
        ('x\r ', ['x\r', ' ']),
        ('x\r\n ', ['x\r\n', ' ']),
        # Special cases
        ('\n\n\n', ['\n', '\n', '\n']),
        ('\n\r\r\n', ['\n', '\r', '\r\n']),
        ('  \n  ', ['  \n', '  ']),
    )
    for text, expected in expectations:
        assert splitted(text) == expected


def test_readlines_reserved_random():
    """Check readlines_reversed() against randomly generated reference lines.

    Each generated text mixes the three line endings. Every text is built
    from its own seed, which is shown when the comparison fails, so that a
    failing input can be generated again.
    """

    alphabet = 'abcdefghijklmnopqrstuvwxyz €Ř'
    endings = ('\r', '\n', '\r\n')

    def randomtest(maxlines, nchars):
        # Between 1 and maxlines lines, each with nchars characters plus a
        # line ending.
        seed = random.randrange(2 ** 32)
        rng = random.Random(seed)
        reflines = [
            ''.join(rng.choice(alphabet) for _ in range(nchars))
            + rng.choice(endings)
            for _ in range(rng.randint(1, maxlines))
        ]
        f = io.BytesIO(''.join(reflines).encode())
        testlines = [line.decode() for line in readlines_reversed(f)]
        testlines.reverse()
        assert reflines == testlines, 'random seed was %d' % seed

    # On average the generated text should be much larger than the largest
    # block that is read at once (4096 bytes), so that lines and line
    # endings regularly straddle block boundaries.
    for _ in range(10):
        randomtest(25000, 4)  # real short lines
    for _ in range(10):
        randomtest(1250, 80)  # pretty normal lines
    for _ in range(10):
        randomtest(5, 20000)  # real long lines


if __name__ == '__main__':
    # Script mode: run the tests one after the other, no test runner needed
    for run_test in (test_readlines_reserved, test_readlines_reserved_random):
        run_test()
	"""
	Implementation of an iterator that yields the lines in a file in reverse order.
	Copyright (C) 2018, Almar Klein
	BSD licensed.
	"""

	## ===== The code =====


	def readlines_reversed(f, *, buffersize=64, maxbuffersize=4096):
	    """Iterate over the lines of a binary file, last line first.

	    The file must be seekable and open in 'rb' mode. Lines are yielded
	    undecoded (as bytes) and keep their line ending. Line breaks are found
	    the way ``bytes.splitlines(True)`` finds them: LF, CR and CRLF each end
	    a line. Joining the yielded lines in reverse order gives back the exact
	    file content, so reversing a file twice reproduces it.

	    The file is read backwards in blocks. The file position is moved by
	    this generator and is not restored afterwards.

	    Parameters:
	        f: seekable file object opened in binary mode.
	        buffersize: initial block size in bytes. It is doubled before every
	            read (the first one included) until it reaches maxbuffersize.
	        maxbuffersize: upper limit for the block size in bytes.

	    Yields:
	        bytes: the lines of the file, from the last one to the first one.

	    Raises:
	        ValueError: when iteration starts, if buffersize or maxbuffersize
	            is smaller than 1 (the read position could never move back).
	    """
	    if buffersize < 1 or maxbuffersize < 1:
	        raise ValueError("buffersize and maxbuffersize must be at least 1")
	    # First (possibly incomplete) line of the block that was read last; its
	    # real beginning may lie in a block that still has to be read.
	    head = b""
	    f.seek(0, 2)  # whence=2: position relative to the end of the file
	    t = f.tell()
	    while t > 0:
	        # Read the block that ends where the previous block started
	        buffersize = min(buffersize * 2, maxbuffersize)
	        tprev = t
	        t = max(0, t - buffersize)
	        f.seek(t)
	        lines = f.read(tprev - t).splitlines(True)
	        if not lines:
	            # Nothing was returned for this range (e.g. the file shrank
	            # after its size was taken); keep walking towards the start.
	            continue
	        # Align to line breaks
	        if not lines[-1].endswith((b"\n", b"\r")):
	            lines[-1] += head  # current tail is the start of the previous head
	        elif head == b"\n" and lines[-1].endswith(b"\r"):
	            lines[-1] += head  # the block boundary fell inside a \r\n pair
	        elif head:
	            lines.append(head)  # previous head was a complete line
	        # The first line may continue further back, so hold it for the next block
	        head = lines.pop(0)  # can be b'\n' (ok)
	        # Emit the complete lines of this block, last one first
	        yield from reversed(lines)
	    if head:
	        yield head


	## ===== The tests =====


	import io
	import random


	# Small multi-line sample text that includes empty lines. The tab
	# characters at the start of its lines are inside the string literal and
	# therefore part of its value. None of the tests visible in this file
	# reference it.
	LINES1 = """
	foo
	bar

	x

	"""

	def reversetext(text):
	    """Return *text* with its lines in reverse order.

	    The text is encoded to bytes, read back to front with
	    readlines_reversed() from an in-memory binary stream, and the joined
	    result is decoded again.
	    """
	    f = io.BytesIO(text.encode())
	    return b''.join(readlines_reversed(f)).decode()


	def splitted(text):
	    """Return the lines of *text* (line endings kept) in their original order.

	    readlines_reversed() produces the lines back to front, so the decoded
	    lines are reversed once more before they are returned.
	    """
	    f = io.BytesIO(text.encode())
	    return list(reversed([line.decode() for line in readlines_reversed(f)]))


	def test_readlines_reserved():
	    """Check readlines_reversed() on small hand-picked texts."""

	    # Empty
	    assert reversetext('') == ''
	    assert splitted('') == []

	    # Single chars
	    for c in 'x \t\r\n€Ř"\\\'"':
	        assert reversetext(c) == c
	    assert splitted('x') == ['x']
	    assert splitted(' ') == [' ']
	    assert splitted('\n') == ['\n']
	    assert splitted('\r') == ['\r']

	    # Few chars
	    assert splitted('x\n') == ['x\n']
	    assert splitted('x\r') == ['x\r']
	    assert splitted('x\r\n') == ['x\r\n']
	    assert splitted('x\n ') == ['x\n', ' ']
	    assert splitted('x\r ') == ['x\r', ' ']
	    assert splitted('x\r\n ') == ['x\r\n', ' ']

	    # Special cases
	    assert splitted('\n\n\n') == ['\n', '\n', '\n']
	    assert splitted('\n\r\r\n') == ['\n', '\r', '\r\n']
	    assert splitted(' \n ') == [' \n', ' ']


	def test_readlines_reserved_random():
	    """Check readlines_reversed() against randomly generated reference lines.

	    Each generated text mixes the three line endings. Every text is built
	    from its own seed, which is shown when the comparison fails, so that a
	    failing input can be generated again.
	    """

	    alphabet = 'abcdefghijklmnopqrstuvwxyz €Ř'
	    endings = ('\r', '\n', '\r\n')

	    def randomtest(maxlines, nchars):
	        # Between 1 and maxlines lines, each with nchars characters plus a
	        # line ending.
	        seed = random.randrange(2 ** 32)
	        rng = random.Random(seed)
	        reflines = [
	            ''.join(rng.choice(alphabet) for _ in range(nchars))
	            + rng.choice(endings)
	            for _ in range(rng.randint(1, maxlines))
	        ]
	        f = io.BytesIO(''.join(reflines).encode())
	        testlines = [line.decode() for line in readlines_reversed(f)]
	        testlines.reverse()
	        assert reflines == testlines, 'random seed was %d' % seed

	    # On average the generated text should be much larger than the largest
	    # block that is read at once (4096 bytes), so that lines and line
	    # endings regularly straddle block boundaries.
	    for _ in range(10):
	        randomtest(25000, 4)  # real short lines
	    for _ in range(10):
	        randomtest(1250, 80)  # pretty normal lines
	    for _ in range(10):
	        randomtest(5, 20000)  # real long lines


	if __name__ == '__main__':
	    # Script mode: run both tests directly, no test runner needed
	    test_readlines_reserved()
	    test_readlines_reserved_random()