Skip to content

Instantly share code, notes, and snippets.

@mynameisfiber
Created March 15, 2016 16:11
Show Gist options
  • Save mynameisfiber/c626df948bdf8001706d to your computer and use it in GitHub Desktop.
Save mynameisfiber/c626df948bdf8001706d to your computer and use it in GitHub Desktop.
One big file, many data splits
import json  # needed at definition time for the ``serializer`` default below


class JSONFileData(object):
    """Serve several named, contiguous "splits" out of one line-oriented file.

    The file is scanned once at construction time to record, for every key
    in ``split``, the ``(start, end)`` file offsets of the run of lines that
    belongs to that key.  Indexing the object with a key then lazily yields
    the deserialized lines of that run.

    Parameters
    ----------
    fd : file-like object
        Must support ``seek``, ``tell``, ``readline`` and line iteration.
    split : dict
        Maps a split name to the fraction of samples it receives.  Splits
        are laid out in the dict's iteration order.  Fractions must sum to 1
        (within floating-point tolerance).
    num_samples : int, optional
        Number of lines to distribute.  When ``None`` the file is read once
        to count its lines.

    Raises
    ------
    ValueError
        If the fractions in ``split`` do not sum to 1.
    """

    # Tolerance for the "fractions sum to 1" check; 0.7 + 0.2 + 0.1 is
    # 0.9999999999999999 in binary floating point, not exactly 1.
    _SPLIT_TOLERANCE = 1e-9

    def __init__(self, fd, split, num_samples=None):
        total = sum(split.values())
        # Explicit exception rather than ``assert`` so the check survives
        # ``python -O`` and tolerates float rounding.
        if abs(total - 1.0) > self._SPLIT_TOLERANCE:
            raise ValueError(
                "split fractions must sum to 1, got {!r}".format(total)
            )
        self.fd = fd
        self.split = split
        self.num_samples = num_samples
        self._parse_lines()

    def _parse_lines(self):
        """Count samples if needed and record each split's byte offsets.

        Populates ``self._seek`` with ``{key: (start_offset, end_offset)}``.
        """
        if self.num_samples is None:
            print("Finding number of samples")
            self.fd.seek(0)
            self.num_samples = sum(1 for _ in self.fd)

        self._seek = {}
        self.fd.seek(0)
        lines_consumed = 0
        cumulative = 0.0
        for key, percent in self.split.items():
            print("Calculating offset for: ", key)
            # Split boundaries come from the *cumulative* fraction, rounded.
            # Truncating each share on its own loses lines (100 * 0.29 is
            # 28.999...) and leaves remainder lines in no split at all.
            cumulative += percent
            boundary = min(
                self.num_samples,
                int(round(cumulative * self.num_samples)),
            )
            start_offset = self.fd.tell()
            for _ in range(boundary - lines_consumed):
                self.fd.readline()
            lines_consumed = max(lines_consumed, boundary)
            end_offset = self.fd.tell()
            self._seek[key] = (start_offset, end_offset)
        print("Found {} samples with offsets: {}".
              format(self.num_samples, self._seek))

    def __getitem__(self, key, serializer=json.loads):
        """Return a generator over the deserialized lines of split ``key``.

        ``serializer`` is called on every raw line (default ``json.loads``).

        Raises
        ------
        KeyError
            Immediately, if ``key`` is not one of the configured splits.
        """
        # Look the key up here, outside the generator, so that an unknown
        # key fails at indexing time instead of on the first ``next()``.
        start, end = self._seek[key]
        return self._iter_range(start, end, serializer)

    def _iter_range(self, start, end, serializer):
        """Yield ``serializer(line)`` for each line between two offsets."""
        position = start
        while position < end:
            # Every generator keeps its own position and re-seeks before
            # reading, so several splits can be consumed in an interleaved
            # fashion even though they share one file object.
            self.fd.seek(position)
            line = self.fd.readline()
            if not line:
                # The file ended before the recorded offset; stop instead
                # of spinning forever on empty reads.
                break
            position = self.fd.tell()
            yield serializer(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment