Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Created November 14, 2016 16:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MaxHalford/ea32a984f86750bad9b110809511c768 to your computer and use it in GitHub Desktop.
Save MaxHalford/ea32a984f86750bad9b110809511c768 to your computer and use it in GitHub Desktop.
Reading files in minibatches
def get_minibatch(stream, size):
    """Return the next ``size`` items of ``stream`` as a list.

    Args:
        stream: Iterable to take items from. When it is an iterator, the
            returned items are consumed from it, so successive calls
            return successive slices.
        size: Maximum number of items to take.

    Returns:
        A list of at most ``size`` items; it is shorter (possibly empty)
        when ``stream`` runs out first.
    """
    # list() drains the islice directly; no per-item comprehension needed.
    return list(itertools.islice(stream, size))
def iter_minibatches(stream, minibatch_size):
    """Yield consecutive minibatches (lists) taken from ``stream``.

    Args:
        stream: Iterable of items to batch.
        minibatch_size: Maximum number of items per yielded list.

    Yields:
        Lists of ``minibatch_size`` items each; the final list holds the
        remaining items and may be shorter. Nothing is yielded for an
        empty stream.
    """
    import itertools

    # Obtain a single iterator up front so every slice continues where the
    # previous one stopped, even when ``stream`` is a re-iterable container
    # such as a list (slicing the list itself would restart each time).
    iterator = iter(stream)
    while True:
        minibatch = list(itertools.islice(iterator, minibatch_size))
        if not minibatch:
            # An empty slice means the stream is exhausted.
            return
        yield minibatch
def stream_files(input_path, file_token, parse=None):
    """Lazily yield the contents of each matching file, one file at a time.

    Args:
        input_path: Directory to search.
        file_token: Glob pattern joined onto ``input_path`` (for example
            ``'*.json'``).
        parse: Optional callable applied to each file's text before it is
            yielded (for example ``json.loads``). When omitted, the raw
            text is yielded unchanged.

    Yields:
        One value per matching file, in the order ``glob.glob`` returns
        the paths.
    """
    pattern = os.path.join(input_path, file_token)
    for filename in glob.glob(pattern):
        # The context manager closes each file before the next one is opened.
        with open(filename, 'r') as handle:
            content = handle.read()
        yield content if parse is None else parse(content)
# Example usage: lazily read the files matching '*.json' under 'path' and
# consume them in batches of ``minibatch_size``.
stream = stream_files('path', '*.json')
minibatch_size = 10
for minibatch in iter_minibatches(stream=stream, minibatch_size=minibatch_size):
    # Each batch holds minibatch_size items, except possibly the last one,
    # which holds whatever remains in the stream.
    print(len(minibatch))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment