Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active August 29, 2015 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/194869bffb690cf26714 to your computer and use it in GitHub Desktop.
Save andreasvc/194869bffb690cf26714 to your computer and use it in GitHub Desktop.
Benchmark of indexing of line offsets in text file.
"""Benchmark of indexing of line offsets in text file.
Usage example:
>>> index = indexfile_iter('1027.txt')
>>> index[5]
115
>>> import bisect
>>> bisect.bisect(index, 115) - 1
5
Conclusion: using mmap makes no measurable difference (75 ms vs. 76.1 ms for
the two re variants below); indexfile_iter is fastest.
In [1]: import lineidx
In [2]: %timeit lineidx.indexfile_iter('1027.txt')
100 loops, best of 3: 11.7 ms per loop
In [3]: %timeit lineidx.indexfile_re('1027.txt')
10 loops, best of 3: 75 ms per loop
In [4]: %timeit lineidx.indexfile_re_nommap('1027.txt')
10 loops, best of 3: 76.1 ms per loop
In [5]: %timeit lineidx.indexfile_re2('1027.txt')
10 loops, best of 3: 27.8 ms per loop
"""
import re
import re2
import mmap
import array
# Both patterns match the tail of a non-blank line: one character that is not
# a space, tab, LF or CR, then optional trailing spaces/tabs, then a run of
# one or more CR/LF characters. A match therefore ends just past the line
# terminator(s), including any directly following lines that are empty.
# NOTE(review): a following line that holds only spaces/tabs ends that run,
# so the match end is then the start of that whitespace-only line.
# Bytes pattern for the standard-library ``re`` module.
NONEMPTPTYLINE = re.compile(br'[^ \t\n\r][ \t]*[\r\n]+')
# The same expression compiled with the ``re2`` module, given as a str pattern.
NONEMPTPTYLINE2 = re2.compile(r'[^ \t\n\r][ \t]*[\r\n]+')
def indexfile_iter(filename):
    """Return the byte offsets of the non-blank lines in a file.

    The file is read in binary mode, so offsets count bytes. One offset
    is recorded for the start of every line that does not consist solely
    of whitespace, and the total number of bytes read is appended last as
    an end-of-file sentinel; an empty file therefore yields
    ``array('I', [0])``.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with typecode ``'I'`` holding the
        offsets in increasing order.
    """
    result = array.array('I')
    offset = 0
    with open(filename, 'rb') as tmp:
        for line in tmp:
            # Whitespace-only lines get no entry, but their bytes still
            # count towards the offsets of the lines that follow.
            if not line.isspace():
                result.append(offset)
            offset += len(line)
    # Sentinel: end of the last line, i.e. the number of bytes read.
    result.append(offset)
    return result
def indexfile_re(filename):
    """Index line offsets by scanning a memory map with NONEMPTPTYLINE.

    The result starts with offset 0, followed by the end offset of every
    match of the module-level ``NONEMPTPTYLINE`` pattern in the file's
    bytes.

    The file is opened read-only: the mapping is ``ACCESS_READ``, so
    write permission on the file is not needed. An empty file returns
    ``array('I', [0])`` without mapping, and the mapping is closed even
    if scanning raises.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with typecode ``'I'``.
    """
    result = array.array('I', [0])
    with open(filename, 'rb') as tmp:
        # mmap cannot map an empty file; there is nothing to scan then.
        tmp.seek(0, 2)  # whence=2: position relative to end of file
        if tmp.tell() == 0:
            return result
        data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            result.extend(
                match.end() for match in NONEMPTPTYLINE.finditer(data))
        finally:
            # Release the mapping even when extend() or the scan fails.
            data.close()
    return result
def indexfile_re_nommap(filename):
    """Index line offsets by scanning the whole file read into memory.

    The file's bytes are read in one go (no mmap) and scanned with the
    module-level ``NONEMPTPTYLINE`` pattern.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with typecode ``'I'`` that starts with
        0 and continues with the end offset of each match.
    """
    with open(filename, 'rb') as infile:
        contents = infile.read()
    offsets = array.array('I', [0])
    for hit in NONEMPTPTYLINE.finditer(contents):
        offsets.append(hit.end())
    return offsets
def indexfile_re2(filename):
    """Index line offsets using the ``re2``-compiled pattern.

    The file's bytes are read in one go and scanned with the
    module-level ``NONEMPTPTYLINE2`` pattern.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with typecode ``'I'`` that starts with
        0 and continues with the end offset of each match.
    """
    with open(filename, 'rb') as infile:
        contents = infile.read()
    offsets = array.array('I', [0])
    offsets.extend(
        [hit.end() for hit in NONEMPTPTYLINE2.finditer(contents)])
    return offsets
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment