Skip to content

Instantly share code, notes, and snippets.

@abought
Last active March 5, 2019 17:07
Show Gist options
  • Save abought/b06d49c77080c2a875ae107cbd6dc94a to your computer and use it in GitHub Desktop.
Save abought/b06d49c77080c2a875ae107cbd6dc94a to your computer and use it in GitHub Desktop.
Generate a simple byte range index for flat text files
Trait1 Trait2 rg SE Z P-value Method
008.5 038 -0.5524 1.5359 -0.3597 0.7191 ldsc
008.5 041.4 0.5652 0.5601 1.0091 0.3129 ldsc
038 559 0.7181 2.1768 0.3299 0.7415 ldsc
038 562.1 0.5957 0.9638 0.6181 0.5365 ldsc
038 562 0.5882 0.9517 0.6181 0.5365 ldsc
"""
Generate a crude binary index to a text file.
The concept is useful for, e.g., loading small slices of a much larger file from a web interface:
incremental queries without using a DB server.
DISCLAIMER: This is a quick hack as proof of concept. To use this approach over a network, the
remote server must support HTTP Range requests.
"""
import os
import pickle
def _index_name(filename):
    """Return the path of the pickle index that accompanies ``filename``."""
    return '{}.pickle'.format(filename)


def make_byte_index(in_filename: str, key_col: int, skiplines: int = 1, delimiter: str = '\t',
                    encoding: str = 'utf-8') -> str:
    """
    Build a byte-range index for a delimited flat text file and pickle it next to the file.

    The index maps each value found in the key column to ``[start, end]``: the byte offsets of the
    run of consecutive rows that share that value. Rows are expected to be grouped by key; if a key
    recurs in several non-adjacent runs, only the last run is kept.

    :param in_filename: Path of the delimited text file to index.
    :param key_col: 1-based number of the column holding the key.
    :param skiplines: Number of leading (header) lines to leave out of the index.
    :param delimiter: Field separator.
    :param encoding: Encoding used to decode keys (and to encode ``delimiter``).
    :return: Path of the pickle file that was written.
    :raises ValueError: If ``key_col`` is smaller than 1.
    :raises IndexError: If a row has fewer than ``key_col`` fields.
    """
    if key_col < 1:
        raise ValueError('key_col is 1-based and must be >= 1 (got {})'.format(key_col))

    separator = delimiter.encode(encoding)
    byte_index = {}
    last_key = None
    # Binary mode: tell() then returns real byte offsets (in text mode it is an opaque cookie).
    with open(in_filename, 'rb') as f:
        for _ in range(skiplines):
            f.readline()
        span_start = last_line_end = f.tell()

        # Use readline() instead of iterating over the file so that tell() stays usable.
        line = f.readline()
        while line:
            # Drop the line ending first so that a key in the last column does not carry it.
            fields = line.rstrip(b'\r\n').split(separator)
            key = fields[key_col - 1].decode(encoding)
            position = f.tell()  # end of the current line

            if last_key is not None and key != last_key:
                byte_index[last_key] = [span_start, last_line_end]
                # The new span begins where the previous line ended, i.e. at the start of the
                # current line -- not at `position`, which would skip this row.
                span_start = last_line_end

            # Advance the iteration
            last_key = key
            last_line_end = position
            line = f.readline()

    # The final span is never closed inside the loop. Skip it when there were no data rows at all.
    if last_key is not None:
        byte_index[last_key] = [span_start, last_line_end]

    index_fn = _index_name(in_filename)
    with open(index_fn, 'wb') as f:
        pickle.dump(byte_index, f)
    return index_fn


def get_indexed_rows(base_filename: str, key: str, encoding: str = 'utf-8') -> list:
    """
    Use the pickled index of ``base_filename`` to load all rows that match ``key``.

    :param base_filename: Path of the indexed text file (the index is looked up next to it).
    :param key: Key column value to fetch.
    :param encoding: Encoding used to decode the rows that are read.
    :return: The matching rows as a list of strings, without line endings.
    :raises FileNotFoundError: If no index file exists for ``base_filename``.
    :raises KeyError: If ``key`` is not in the index.
    """
    index_path = _index_name(base_filename)
    if not os.path.isfile(index_path):
        raise FileNotFoundError('Index file not found: {}'.format(index_path))

    with open(index_path, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only load index files you generated yourself.
        byte_index = pickle.load(f)

    start, end = byte_index[key]
    # Binary mode so that `start` and `end - start` are interpreted as bytes, matching the index.
    with open(base_filename, 'rb') as f:
        # TODO: Improve this to support iteration (for big ranges)
        f.seek(start, 0)
        return f.read(end - start).decode(encoding).splitlines()
if __name__ == '__main__':
    # Demo: index the sample file on its first column, print the path of the
    # generated index, then print the rows stored for two of the keys.
    sample_fn = 'example.txt'
    index_path = make_byte_index(sample_fn, 1)
    print(index_path)
    for wanted_key in ('008.5', '038'):
        print(get_indexed_rows(sample_fn, wanted_key))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment