Created
November 14, 2016 16:21
-
-
Save MaxHalford/ea32a984f86750bad9b110809511c768 to your computer and use it in GitHub Desktop.
Reading files in minibatches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_minibatch(stream, size):
    """Return the next ``size`` items of ``stream`` as a list.

    The items are taken with ``itertools.islice``. When ``stream`` is an
    iterator it is advanced, so repeated calls return consecutive slices.

    Args:
        stream: Iterable to take items from.
        size: Maximum number of items to take.

    Returns:
        A list of at most ``size`` items. It is shorter (possibly empty)
        when ``stream`` runs out before ``size`` items were read.
    """
    return list(itertools.islice(stream, size))
def iter_minibatches(stream, minibatch_size):
    """Generate consecutive minibatches taken from ``stream``.

    Args:
        stream: Iterable of items to group.
        minibatch_size: Maximum number of items per minibatch.

    Yields:
        Lists of up to ``minibatch_size`` items, in stream order. The last
        list may be shorter; an empty list is never yielded.
    """
    # Bind one iterator up front: slicing a re-iterable input (e.g. a list)
    # directly would restart from its first item on every slice and the
    # loop would never terminate.
    iterator = iter(stream)
    minibatch = list(itertools.islice(iterator, minibatch_size))
    while minibatch:
        yield minibatch
        minibatch = list(itertools.islice(iterator, minibatch_size))
def stream_files(input_path, file_token, parse=None):
    """Stream files one by one.

    Args:
        input_path: Directory in which to look for files.
        file_token: Glob pattern joined onto ``input_path`` (for example
            ``'*.json'``).
        parse: Optional callable applied to the text of each file. When
            ``None`` (the default) the raw text is yielded unchanged.

    Yields:
        For each matching file, in the order ``glob.glob`` returns them,
        either its text or ``parse(text)``.
    """
    for filename in glob.glob(os.path.join(input_path, file_token)):
        # Read inside a context manager so the handle is closed before the
        # generator suspends, rather than whenever it gets garbage-collected.
        with open(filename, 'r') as handle:
            content = handle.read()
        yield content if parse is None else parse(content)
# Example usage: read every JSON file under 'path' and consume the resulting
# stream in groups of ``minibatch_size``.
stream = stream_files('path', '*.json')
minibatch_size = 10
for minibatch in iter_minibatches(stream=stream, minibatch_size=minibatch_size):
    # Equal to minibatch_size for every batch except possibly the last one,
    # which holds whatever items remain in the stream.
    print(len(minibatch))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment