abought/example.txt

## example.txt
Trait1	Trait2	rg	SE	Z	P-value	Method
008.5	038	-0.5524	1.5359	-0.3597	0.7191	ldsc
008.5	041.4	0.5652	0.5601	1.0091	0.3129	ldsc
038	559	0.7181	2.1768	0.3299	0.7415	ldsc
038	562.1	0.5957	0.9638	0.6181	0.5365	ldsc
038	562	0.5882	0.9517	0.6181	0.5365	ldsc

## weetabix.py
"""
Generate a crude binary index to a text file.

The concept is useful for, e.g., loading small slices of a much larger file from a web interface:
incremental queries without using a DB server.

DISCLAIMER: This is a quick hack as proof of concept. To use this approach over a network, the
remote server must support HTTP Range requests.
"""

import os
import pickle

def _index_name(filename):
    """Return the path of the pickled index belonging to `filename`: the same path with `.pickle` appended."""
    return '{name}.pickle'.format(name=filename)


def make_byte_index(in_filename: str, key_col: int, skiplines: int = 1, delimiter: str = '\t') -> str:
    """
    Build a byte-offset index of a delimited text file and pickle it next to that file.

    The file is scanned once. Every run of consecutive rows that share the same value in the key column is
    stored as `key -> [start, end]`: `start` is the byte offset of the run's first row and `end` is the offset
    just past its last row (line terminator included), so `end - start` is the number of bytes to read.

    Rows with the same key are expected to be contiguous. If a key occurs in several separate runs, only the
    last run is kept in the index.

    :param in_filename: Path of the delimited text file to index.
    :param key_col: 1-based number of the column that holds the key.
    :param skiplines: Number of leading lines (e.g. a header row) to leave out of the index.
    :param delimiter: Column separator.
    :return: Path of the pickle file that holds the index dict.
    :raises IndexError: If a row has fewer than `key_col` columns.
    """
    import locale  # function-scope import: the module header only brings in os and pickle

    # Keys are decoded with the encoding a default text-mode open() would use for this file.
    encoding = locale.getpreferredencoding(False)
    raw_delimiter = delimiter.encode(encoding)

    byte_index = {}
    last_key = None
    offset = 0  # byte offset of the start of the line about to be processed

    # Binary mode keeps all bookkeeping in real bytes: summing line lengths gives exact offsets without
    # relying on text-mode tell(), whose return value is an opaque number rather than a byte count.
    with open(in_filename, 'rb') as f:
        for _ in range(skiplines):
            offset += len(f.readline())

        span_start = offset
        for raw_line in f:
            # Drop the line terminator first so a key in the last column does not carry a trailing newline.
            fields = raw_line.rstrip(b'\r\n').split(raw_delimiter)
            key = fields[key_col - 1].decode(encoding)

            if last_key is not None and key != last_key:
                # `offset` is where the current line starts, i.e. where the previous run ends and the
                # new run begins.
                byte_index[last_key] = [span_start, offset]
                span_start = offset

            last_key = key
            offset += len(raw_line)

    if last_key is not None:
        # The final run is never closed by a key change, so flush it here. Nothing is written when the
        # file has no data rows.
        byte_index[last_key] = [span_start, offset]

    index_fn = _index_name(in_filename)
    with open(index_fn, 'wb') as f:
        pickle.dump(byte_index, f)

    return index_fn


def get_indexed_rows(base_filename, key):
    """
    Load every row stored under `key`, using the pickled byte index that accompanies `base_filename`.

    :param base_filename: Path of the indexed text file; the index path is derived from it via `_index_name`.
    :param key: Key-column value whose rows should be returned.
    :return: List of the matching rows as strings, without line terminators.
    :raises FileNotFoundError: If no index file exists for `base_filename`.
    :raises KeyError: If `key` is not present in the index.
    """
    import io  # function-scope import: the module header only brings in os and pickle

    index_path = _index_name(base_filename)
    if not os.path.isfile(index_path):
        raise FileNotFoundError('No index file found at {}'.format(index_path))

    # NOTE: unpickling can execute arbitrary code embedded in the file; only load index files that come
    # from a trusted source.
    with open(index_path, 'rb') as f:
        byte_index = pickle.load(f)

    start, end = byte_index[key]

    # The index stores byte offsets, so the slice has to be taken in binary mode: a text-mode read(n)
    # counts characters and would run past the span for multi-byte characters or CRLF newlines.
    with open(base_filename, 'rb') as f:
        # TODO: Improve this to support iteration (for big ranges)
        f.seek(start, 0)
        chunk = f.read(end - start)

    # Decode the slice the way a default text-mode open() would (default encoding, universal newlines).
    return io.TextIOWrapper(io.BytesIO(chunk)).read().splitlines()


if __name__ == '__main__':
    # Demo: index the sample table by its first column, then fetch the rows of two keys.
    demo_path = 'example.txt'
    index_path = make_byte_index(demo_path, 1)

    # make_byte_index returns the path of the index file, not the index itself.
    print(index_path)
    for demo_key in ('008.5', '038'):
        print(get_indexed_rows(demo_path, demo_key))
	Trait1 Trait2 rg SE Z P-value Method
	008.5 038 -0.5524 1.5359 -0.3597 0.7191 ldsc
	008.5 041.4 0.5652 0.5601 1.0091 0.3129 ldsc
	038 559 0.7181 2.1768 0.3299 0.7415 ldsc
	038 562.1 0.5957 0.9638 0.6181 0.5365 ldsc
	038 562 0.5882 0.9517 0.6181 0.5365 ldsc
	"""
	Generate a crude binary index to a text file.

	The concept is useful for, e.g., loading small slices of a much larger file from a web interface:
	incremental queries without using a DB server.

	DISCLAIMER: This is a quick hack as proof of concept. To use this approach over a network, the
	remote server must support HTTP Range requests.
	"""

	import os
	import pickle

	def _index_name(filename):
	    """Return the path of the pickled index belonging to `filename`: the same path with `.pickle` appended."""
	    return '{}.pickle'.format(filename)


	def make_byte_index(in_filename: str, key_col: int, skiplines: int = 1, delimiter: str = '\t') -> str:
	    """
	    Build a byte-offset index of a delimited text file and pickle it next to that file.

	    The file is scanned once. Every run of consecutive rows that share the same value in the key column is
	    stored as `key -> [start, end]`: `start` is the byte offset of the run's first row and `end` is the offset
	    just past its last row (line terminator included), so `end - start` is the number of bytes to read.

	    Rows with the same key are expected to be contiguous. If a key occurs in several separate runs, only the
	    last run is kept in the index.

	    :param in_filename: Path of the delimited text file to index.
	    :param key_col: 1-based number of the column that holds the key.
	    :param skiplines: Number of leading lines (e.g. a header row) to leave out of the index.
	    :param delimiter: Column separator.
	    :return: Path of the pickle file that holds the index dict.
	    :raises IndexError: If a row has fewer than `key_col` columns.
	    """
	    import locale  # function-scope import: the module header only brings in os and pickle

	    # Keys are decoded with the encoding a default text-mode open() would use for this file.
	    encoding = locale.getpreferredencoding(False)
	    raw_delimiter = delimiter.encode(encoding)

	    byte_index = {}
	    last_key = None
	    offset = 0  # byte offset of the start of the line about to be processed

	    # Binary mode keeps all bookkeeping in real bytes: summing line lengths gives exact offsets without
	    # relying on text-mode tell(), whose return value is an opaque number rather than a byte count.
	    with open(in_filename, 'rb') as f:
	        for _ in range(skiplines):
	            offset += len(f.readline())

	        span_start = offset
	        for raw_line in f:
	            # Drop the line terminator first so a key in the last column does not carry a trailing newline.
	            fields = raw_line.rstrip(b'\r\n').split(raw_delimiter)
	            key = fields[key_col - 1].decode(encoding)

	            if last_key is not None and key != last_key:
	                # `offset` is where the current line starts, i.e. where the previous run ends and the
	                # new run begins.
	                byte_index[last_key] = [span_start, offset]
	                span_start = offset

	            last_key = key
	            offset += len(raw_line)

	    if last_key is not None:
	        # The final run is never closed by a key change, so flush it here. Nothing is written when the
	        # file has no data rows.
	        byte_index[last_key] = [span_start, offset]

	    index_fn = _index_name(in_filename)
	    with open(index_fn, 'wb') as f:
	        pickle.dump(byte_index, f)

	    return index_fn


	def get_indexed_rows(base_filename, key):
	    """
	    Load every row stored under `key`, using the pickled byte index that accompanies `base_filename`.

	    :param base_filename: Path of the indexed text file; the index path is derived from it via `_index_name`.
	    :param key: Key-column value whose rows should be returned.
	    :return: List of the matching rows as strings, without line terminators.
	    :raises FileNotFoundError: If no index file exists for `base_filename`.
	    :raises KeyError: If `key` is not present in the index.
	    """
	    import io  # function-scope import: the module header only brings in os and pickle

	    index_path = _index_name(base_filename)
	    if not os.path.isfile(index_path):
	        raise FileNotFoundError('No index file found at {}'.format(index_path))

	    # NOTE: unpickling can execute arbitrary code embedded in the file; only load index files that come
	    # from a trusted source.
	    with open(index_path, 'rb') as f:
	        byte_index = pickle.load(f)

	    start, end = byte_index[key]

	    # The index stores byte offsets, so the slice has to be taken in binary mode: a text-mode read(n)
	    # counts characters and would run past the span for multi-byte characters or CRLF newlines.
	    with open(base_filename, 'rb') as f:
	        # TODO: Improve this to support iteration (for big ranges)
	        f.seek(start, 0)
	        chunk = f.read(end - start)

	    # Decode the slice the way a default text-mode open() would (default encoding, universal newlines).
	    return io.TextIOWrapper(io.BytesIO(chunk)).read().splitlines()


	if __name__ == '__main__':
	    # Demo: index the sample table by its first column, then fetch the rows of two keys.
	    sample_fn = 'example.txt'
	    index_fn = make_byte_index(sample_fn, 1)

	    # make_byte_index returns the path of the index file, not the index itself.
	    print(index_fn)
	    print(get_indexed_rows(sample_fn, '008.5'))
	    print(get_indexed_rows(sample_fn, '038'))