Last active
March 5, 2019 17:07
-
-
Save abought/b06d49c77080c2a875ae107cbd6dc94a to your computer and use it in GitHub Desktop.
Generate a simple byte range index for flat text files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Trait1 Trait2 rg SE Z P-value Method | |
008.5 038 -0.5524 1.5359 -0.3597 0.7191 ldsc | |
008.5 041.4 0.5652 0.5601 1.0091 0.3129 ldsc | |
038 559 0.7181 2.1768 0.3299 0.7415 ldsc | |
038 562.1 0.5957 0.9638 0.6181 0.5365 ldsc | |
038 562 0.5882 0.9517 0.6181 0.5365 ldsc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Generate a crude binary index to a text file.
The concept is useful for, e.g., loading small slices of a much larger file from a web interface:
incremental queries without using a DB server.
DISCLAIMER: This is a quick hack as a proof of concept. To use this approach over a network, the
remote server must support HTTP Range requests.
""" | |
import locale
import os
import pickle
def _index_name(filename):
    """Return the path of the pickle index that belongs to ``filename`` (the same path plus ``.pickle``)."""
    return f'{filename}.pickle'
def make_byte_index(in_filename: str, key_col: int, skiplines: int = 1, delimiter: str = '\t') -> str:
    """
    Build a byte-range index for a delimited text file and pickle it next to that file.

    The file is scanned once. Consecutive rows sharing the same value in the key column form one span, and
    the index maps each key to ``[start, end]``: the byte offset of the first row of the span and the byte
    offset just past its last row, so ``end - start`` is the number of bytes to read.

    Rows for a given key are expected to be contiguous (e.g. the file is sorted by the key column). If a key
    shows up in several separate runs, only one of those runs is kept in the index.

    :param in_filename: Path of the delimited text file to index.
    :param key_col: 1-based number of the column that holds the key.
    :param skiplines: Number of leading lines (e.g. a header row) to leave out of the index.
    :param delimiter: Column separator.
    :return: Path of the pickle file the index was written to.
    :raises ValueError: If ``key_col`` is smaller than 1.
    :raises IndexError: If a row has fewer than ``key_col`` columns.
    """
    if key_col < 1:
        # Without this guard, 0 or a negative value would silently index from the end of the row.
        raise ValueError('key_col is 1-based and must be >= 1, got {}'.format(key_col))

    # Decode with the same default encoding that open() would use in text mode.
    encoding = locale.getpreferredencoding(False)
    byte_index = {}
    last_key = None

    # Read in binary so that offsets are real byte positions; in text mode tell() only returns an opaque
    # number, and a byte-range consumer (e.g. an HTTP Range request) needs true offsets.
    with open(in_filename, 'rb') as f:
        for _ in range(skiplines):
            f.readline()
        span_start = line_start = f.tell()

        for raw_line in f:
            # Drop the line terminator first, otherwise a key in the last column would carry it along.
            fields = raw_line.decode(encoding).rstrip('\r\n').split(delimiter)
            key = fields[key_col - 1]

            if last_key is not None and key != last_key:
                # The previous span ends where the current line begins, and the new span begins there too.
                byte_index[last_key] = [span_start, line_start]
                span_start = line_start

            last_key = key
            line_start += len(raw_line)

    # The final span is never closed inside the loop, since spans are only written on a key change.
    # ``last_key`` stays None when there were no data rows at all, in which case nothing is added.
    if last_key is not None and last_key not in byte_index:
        byte_index[last_key] = [span_start, line_start]

    index_fn = _index_name(in_filename)
    with open(index_fn, 'wb') as f:
        pickle.dump(byte_index, f)
    return index_fn
def get_indexed_rows(base_filename, key):
    """
    Load the rows stored under ``key`` in the pickled index of ``base_filename``.

    The index file that sits next to the data file supplies the ``[start, end]`` byte span for the key, and
    only that slice of the data file is read.

    :param base_filename: Path of the data file that was indexed.
    :param key: Key to look up, exactly as it appears in the index.
    :return: List of the rows in the span, without line terminators.
    :raises FileNotFoundError: If the index file does not exist.
    :raises KeyError: If ``key`` is not in the index.
    """
    index_path = _index_name(base_filename)
    if not os.path.isfile(index_path):
        raise FileNotFoundError('No index file found at {!r}'.format(index_path))

    # NOTE(security): unpickling can execute arbitrary code. Only load index files you generated yourself.
    with open(index_path, 'rb') as f:
        byte_index = pickle.load(f)

    start, end = byte_index[key]

    # Read the span in binary: the index holds byte offsets, and read(n) on a text-mode file counts
    # characters, which overshoots the span as soon as the data has multi-byte characters or CRLF endings.
    with open(base_filename, 'rb') as f:
        # TODO: Improve this to support iteration (for big ranges)
        f.seek(start, 0)
        chunk = f.read(end - start)

    # Decode with the same default encoding that open() would use in text mode.
    return chunk.decode(locale.getpreferredencoding(False)).splitlines()
if __name__ == '__main__':
    # Demo: index the sample file on its first column, then fetch the rows of two keys.
    sample_fn = 'example.txt'
    index_path = make_byte_index(sample_fn, 1)
    print(index_path)
    for wanted_key in ('008.5', '038'):
        print(get_indexed_rows(sample_fn, wanted_key))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment